Text Duplicate filter in Python 3.7

up vote
5
down vote

favorite

I wrote this program just to remove duplicates from wordlist used for brute-force, but it can be used for every txt file i guess.
It is a pretty long program for what it does, and I'm sure there is a way I can make it easier and better for everyone to look at.
I'm still a beginner in python.

def wordcount_initial():  # count the numbre of words in the input file

    global num_words_initial



    with open(file_initial, 'r') as f:

        for line in f:

            words = line.split()

            num_words_initial += len(words)





def dupfilter():

    content = open(file_initial, "r").readlines()

    content_set = set(content)  # filter duplicates

    cleandata = open(file_final, "w+")  # write the text filtered in the new   file



    for line in content_set:

        cleandata.write(line)





def wordcount_final():  # count number of words in the new file

    global num_words_final



    with open(file_final, 'r') as f:

        for line in f:

            words = line.split()

            num_words_final += len(words)





if __name__ == '__main__':

    num_words = 0

    num_words_initial = 0

    num_words_final = 0



    ready = False

    while not ready:

        file_initial = input("What is the name of the text?")



        file_final = input("How do you want to name the filted file?")



        if file_initial and file_final != "":



            wordcount_initial()

            ready = True



    dupfilter()



    wordcount_final()



    print("Number of duplicates filtered:" + str(num_words_initial - num_words_final))

    input("nPress <ENTER> to quit the program.")

edited Nov 27 at 12:25

200_success

127k15148412

asked Nov 27 at 9:46

Thewizy

426

add a comment |

up vote
5
down vote

favorite

def wordcount_initial():  # count the numbre of words in the input file

    global num_words_initial



    with open(file_initial, 'r') as f:

        for line in f:

            words = line.split()

            num_words_initial += len(words)





def dupfilter():

    content = open(file_initial, "r").readlines()

    content_set = set(content)  # filter duplicates

    cleandata = open(file_final, "w+")  # write the text filtered in the new   file



    for line in content_set:

        cleandata.write(line)





def wordcount_final():  # count number of words in the new file

    global num_words_final



    with open(file_final, 'r') as f:

        for line in f:

            words = line.split()

            num_words_final += len(words)





if __name__ == '__main__':

    num_words = 0

    num_words_initial = 0

    num_words_final = 0



    ready = False

    while not ready:

        file_initial = input("What is the name of the text?")



        file_final = input("How do you want to name the filted file?")



        if file_initial and file_final != "":



            wordcount_initial()

            ready = True



    dupfilter()



    wordcount_final()



    print("Number of duplicates filtered:" + str(num_words_initial - num_words_final))

    input("nPress <ENTER> to quit the program.")

edited Nov 27 at 12:25

200_success

127k15148412

asked Nov 27 at 9:46

Thewizy

426

add a comment |

up vote
5
down vote

favorite

def wordcount_initial():  # count the numbre of words in the input file

    global num_words_initial



    with open(file_initial, 'r') as f:

        for line in f:

            words = line.split()

            num_words_initial += len(words)





def dupfilter():

    content = open(file_initial, "r").readlines()

    content_set = set(content)  # filter duplicates

    cleandata = open(file_final, "w+")  # write the text filtered in the new   file



    for line in content_set:

        cleandata.write(line)





def wordcount_final():  # count number of words in the new file

    global num_words_final



    with open(file_final, 'r') as f:

        for line in f:

            words = line.split()

            num_words_final += len(words)





if __name__ == '__main__':

    num_words = 0

    num_words_initial = 0

    num_words_final = 0



    ready = False

    while not ready:

        file_initial = input("What is the name of the text?")



        file_final = input("How do you want to name the filted file?")



        if file_initial and file_final != "":



            wordcount_initial()

            ready = True



    dupfilter()



    wordcount_final()



    print("Number of duplicates filtered:" + str(num_words_initial - num_words_final))

    input("nPress <ENTER> to quit the program.")

edited Nov 27 at 12:25

200_success

127k15148412

asked Nov 27 at 9:46

Thewizy

426

def wordcount_initial():  # count the numbre of words in the input file

    global num_words_initial



    with open(file_initial, 'r') as f:

        for line in f:

            words = line.split()

            num_words_initial += len(words)





def dupfilter():

    content = open(file_initial, "r").readlines()

    content_set = set(content)  # filter duplicates

    cleandata = open(file_final, "w+")  # write the text filtered in the new   file



    for line in content_set:

        cleandata.write(line)





def wordcount_final():  # count number of words in the new file

    global num_words_final



    with open(file_final, 'r') as f:

        for line in f:

            words = line.split()

            num_words_final += len(words)





if __name__ == '__main__':

    num_words = 0

    num_words_initial = 0

    num_words_final = 0



    ready = False

    while not ready:

        file_initial = input("What is the name of the text?")



        file_final = input("How do you want to name the filted file?")



        if file_initial and file_final != "":



            wordcount_initial()

            ready = True



    dupfilter()



    wordcount_final()



    print("Number of duplicates filtered:" + str(num_words_initial - num_words_final))

    input("nPress <ENTER> to quit the program.")

python beginner python-3.x file

edited Nov 27 at 12:25

200_success

127k15148412

asked Nov 27 at 9:46

Thewizy

426

edited Nov 27 at 12:25

200_success

127k15148412

asked Nov 27 at 9:46

Thewizy

426

edited Nov 27 at 12:25

200_success

127k15148412

edited Nov 27 at 12:25

200_success

127k15148412

edited Nov 27 at 12:25

200_success

127k15148412

asked Nov 27 at 9:46

Thewizy

426

asked Nov 27 at 9:46

Thewizy

426

asked Nov 27 at 9:46

Thewizy

426

add a comment |

2 Answers
2

active

oldest

votes

up vote
6
down vote

accepted

You should not use global variables unless you really need to. Instead, pass relevant parameters as arguments to the functions and return the results. This makes them actually reusable. Currently you have two functions to count the initial and final number of words, when you could just have a word_count function:

def wordcount(file_name):

    """count the number of words in the file"""

    with open(file_name) as f:

        return sum(len(line.split()) for line in f) 



def dupfilter(file_initial, file_final):

    with open(file_initial) as in_file, open(file_final, "w") as out_file:

        out_file.writelines(set(in_file.readlines()))



if __name__ == '__main__':

    while True:

        file_initial = input("What is the name of the text?")

        file_final = input("How do you want to name the filtered file?")

        if file_initial and file_final and file_initial != file_final:

            break



    num_words_initial = wordcount(file_initial)

    dupfilter(file_initial, file_final)

    num_words_final = wordcount(file_final)



    print("Number of duplicates filtered:", num_words_initial - num_words_final)

    input("nPress <ENTER> to quit the program.")

I also used sum with a generator expression to simplify the wordcount function, used with to ensure that files are properly closed.
In addition, I replaced the while not ready loop with a while True loop and an explicit break. This is much more common in Python, especially for user input.

Note that if file_initial and file_final != "" is only incidentally the same as if file_initial != "" and file_final != "" because non-empty strings are truthy. This is why it is also equivalent to if file_initial and file_final. But for example x and y == 3 is not the same as x == 3 and y == 3.

You also don't need to call str on things to be printed, print does that for you if necessary.

Note that using set does not guarantee the same order as the original file, for that you would want to use the itertools recipe unique_everseen:

from itertools import filterfalse



def unique_everseen(iterable, key=None):

    "List unique elements, preserving order. Remember all elements ever seen."

    # unique_everseen('AAAABBBCCDAABBB') --> A B C D

    # unique_everseen('ABBCcAD', str.lower) --> A B C D

    seen = set()

    seen_add = seen.add

    if key is None:

        for element in filterfalse(seen.__contains__, iterable):

            seen_add(element)

            yield element

    else:

        for element in iterable:

            k = key(element)

            if k not in seen:

                seen_add(k)

                yield element

def dupfilter(file_initial, file_final):

    with open(file_initial) as in_file, open(file_final, "w") as out_file:

        out_file.writelines(unique_everseen(in_file))

edited Nov 27 at 13:34

answered Nov 27 at 13:17

Graipher

22.5k53384

1

Nice section on itertools instead of set!
– Julien Rousé
Nov 27 at 13:43

Thanks a lot your help is really precious!
– Thewizy
Nov 27 at 15:07

1

I didn't know that set could not give the same order when filtering the duplicate thanks!
– Thewizy
Nov 27 at 15:16

add a comment |

up vote
4
down vote

Thanks for sharing your code!

I'll begin by pointing out you don't need global variable, and you should avoid them when possible. Very often it only does make your code less readable and more fragile.

Here your function wordcount_initial could be rewritten as: (same idea for wordcount_final)

def wordcount_initial(input_file):

    """Return the number of words in the file given in parameter



       :param input_file: A file name

       :type input_file: string 

       :return: The number of words in the file `input_file`

       :rtype: int

    """

    num_words_initial = 0

    with open(input_file, 'r') as f:

        for line in f:

            words = line.split()

            num_words_initial += len(words)

    return num_words_initial

There are a few changes here:

Removed num_words_initial as a global and return it's value at the end of the function. It's much more clean to return value that way when you can. It also help when you want to test your functions.

Gives input_file as a parameter of your function instead of relying on another global. It makes your function more reusable.

And I transformed your comment in docstring that can be used to generate documentation for your code. See ReStrusctured and sphinx for more information. Ideally every function should be documented (and every module too)

Another remark, you name your function dupfilter, but every other name in your code has the format first_last which is a bit inconsistant. Also, don't try to gain a few letters when typing, write duplicatefilter or better (in my opinion) filter_duplicate.

Naming is always a bit subjective, use your best judgement.

And finally in your __main__ you could have put the logic for initialising the name of the file into another function but that's not very important.

On a more positive note I like how you laid out your code, it is well spaced, most of the name are clear when you read them and you have comment which is often important.

Nice job!

edited Nov 27 at 14:13

answered Nov 27 at 13:37

Julien Rousé

679517

Thank a lot for your help i will try to apply all thoses advice to my futur programs!
– Thewizy
Nov 27 at 15:08

add a comment |

Your Answer

StackExchange.ifUsing("editor", function () {
return StackExchange.using("mathjaxEditing", function () {
StackExchange.MarkdownEditor.creationCallbacks.add(function (editor, postfix) {
StackExchange.mathjaxEditing.prepareWmdForMathJax(editor, postfix, [["\$", "\$"]]);
});
});
}, "mathjax-editing");

StackExchange.ifUsing("editor", function () {
StackExchange.using("externalEditor", function () {
StackExchange.using("snippets", function () {
StackExchange.snippets.init();
});
});
}, "code-snippets");

StackExchange.ready(function() {
var channelOptions = {
tags: "".split(" "),
id: "196"
};
initTagRenderer("".split(" "), "".split(" "), channelOptions);

StackExchange.using("externalEditor", function() {
// Have to fire editor after snippets, if snippets enabled
if (StackExchange.settings.snippets.snippetsEnabled) {
StackExchange.using("snippets", function() {
createEditor();
});
}
else {
createEditor();
}
});

function createEditor() {
StackExchange.prepareEditor({
heartbeatType: 'answer',
convertImagesToLinks: false,
noModals: true,
showLowRepImageUploadWarning: true,
reputationToPostImages: null,
bindNavPrevention: true,
postfix: "",
imageUploader: {
brandingHtml: "Powered by u003ca class="icon-imgur-white" href="https://imgur.com/"u003eu003c/au003e",
contentPolicyHtml: "User contributions licensed under u003ca href="https://creativecommons.org/licenses/by-sa/3.0/"u003ecc by-sa 3.0 with attribution requiredu003c/au003e u003ca href="https://stackoverflow.com/legal/content-policy"u003e(content policy)u003c/au003e",
allowUrls: true
},
onDemand: true,
discardSelector: ".discard-answer"
,immediatelyShowMarkdownHelp:true
});

}
});

draft saved

draft discarded

Sign up or log in

StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Post as a guest

Name

Required, but never shown

StackExchange.ready(
function () {
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fcodereview.stackexchange.com%2fquestions%2f208508%2ftext-duplicate-filter-in-python-3-7%23new-answer', 'question_page');
}
);

Post as a guest

Name

Required, but never shown

2 Answers
2

active

oldest

votes

2 Answers
2

active

oldest

votes

up vote
6
down vote

accepted

def wordcount(file_name):

    """count the number of words in the file"""

    with open(file_name) as f:

        return sum(len(line.split()) for line in f) 



def dupfilter(file_initial, file_final):

    with open(file_initial) as in_file, open(file_final, "w") as out_file:

        out_file.writelines(set(in_file.readlines()))



if __name__ == '__main__':

    while True:

        file_initial = input("What is the name of the text?")

        file_final = input("How do you want to name the filtered file?")

        if file_initial and file_final and file_initial != file_final:

            break



    num_words_initial = wordcount(file_initial)

    dupfilter(file_initial, file_final)

    num_words_final = wordcount(file_final)



    print("Number of duplicates filtered:", num_words_initial - num_words_final)

    input("nPress <ENTER> to quit the program.")

You also don't need to call str on things to be printed, print does that for you if necessary.

Note that using set does not guarantee the same order as the original file, for that you would want to use the itertools recipe unique_everseen:

from itertools import filterfalse



def unique_everseen(iterable, key=None):

    "List unique elements, preserving order. Remember all elements ever seen."

    # unique_everseen('AAAABBBCCDAABBB') --> A B C D

    # unique_everseen('ABBCcAD', str.lower) --> A B C D

    seen = set()

    seen_add = seen.add

    if key is None:

        for element in filterfalse(seen.__contains__, iterable):

            seen_add(element)

            yield element

    else:

        for element in iterable:

            k = key(element)

            if k not in seen:

                seen_add(k)

                yield element

def dupfilter(file_initial, file_final):

    with open(file_initial) as in_file, open(file_final, "w") as out_file:

        out_file.writelines(unique_everseen(in_file))

edited Nov 27 at 13:34

answered Nov 27 at 13:17

Graipher

22.5k53384

1

Nice section on itertools instead of set!
– Julien Rousé
Nov 27 at 13:43

Thanks a lot your help is really precious!
– Thewizy
Nov 27 at 15:07

1

I didn't know that set could not give the same order when filtering the duplicate thanks!
– Thewizy
Nov 27 at 15:16

add a comment |

up vote
6
down vote

accepted

def wordcount(file_name):

    """count the number of words in the file"""

    with open(file_name) as f:

        return sum(len(line.split()) for line in f) 



def dupfilter(file_initial, file_final):

    with open(file_initial) as in_file, open(file_final, "w") as out_file:

        out_file.writelines(set(in_file.readlines()))



if __name__ == '__main__':

    while True:

        file_initial = input("What is the name of the text?")

        file_final = input("How do you want to name the filtered file?")

        if file_initial and file_final and file_initial != file_final:

            break



    num_words_initial = wordcount(file_initial)

    dupfilter(file_initial, file_final)

    num_words_final = wordcount(file_final)



    print("Number of duplicates filtered:", num_words_initial - num_words_final)

    input("nPress <ENTER> to quit the program.")

You also don't need to call str on things to be printed, print does that for you if necessary.

Note that using set does not guarantee the same order as the original file, for that you would want to use the itertools recipe unique_everseen:

from itertools import filterfalse



def unique_everseen(iterable, key=None):

    "List unique elements, preserving order. Remember all elements ever seen."

    # unique_everseen('AAAABBBCCDAABBB') --> A B C D

    # unique_everseen('ABBCcAD', str.lower) --> A B C D

    seen = set()

    seen_add = seen.add

    if key is None:

        for element in filterfalse(seen.__contains__, iterable):

            seen_add(element)

            yield element

    else:

        for element in iterable:

            k = key(element)

            if k not in seen:

                seen_add(k)

                yield element

def dupfilter(file_initial, file_final):

    with open(file_initial) as in_file, open(file_final, "w") as out_file:

        out_file.writelines(unique_everseen(in_file))

edited Nov 27 at 13:34

answered Nov 27 at 13:17

Graipher

22.5k53384

1

Nice section on itertools instead of set!
– Julien Rousé
Nov 27 at 13:43

Thanks a lot your help is really precious!
– Thewizy
Nov 27 at 15:07

1

I didn't know that set could not give the same order when filtering the duplicate thanks!
– Thewizy
Nov 27 at 15:16

add a comment |

up vote
6
down vote

accepted

def wordcount(file_name):

    """count the number of words in the file"""

    with open(file_name) as f:

        return sum(len(line.split()) for line in f) 



def dupfilter(file_initial, file_final):

    with open(file_initial) as in_file, open(file_final, "w") as out_file:

        out_file.writelines(set(in_file.readlines()))



if __name__ == '__main__':

    while True:

        file_initial = input("What is the name of the text?")

        file_final = input("How do you want to name the filtered file?")

        if file_initial and file_final and file_initial != file_final:

            break



    num_words_initial = wordcount(file_initial)

    dupfilter(file_initial, file_final)

    num_words_final = wordcount(file_final)



    print("Number of duplicates filtered:", num_words_initial - num_words_final)

    input("nPress <ENTER> to quit the program.")

You also don't need to call str on things to be printed, print does that for you if necessary.

Note that using set does not guarantee the same order as the original file, for that you would want to use the itertools recipe unique_everseen:

from itertools import filterfalse



def unique_everseen(iterable, key=None):

    "List unique elements, preserving order. Remember all elements ever seen."

    # unique_everseen('AAAABBBCCDAABBB') --> A B C D

    # unique_everseen('ABBCcAD', str.lower) --> A B C D

    seen = set()

    seen_add = seen.add

    if key is None:

        for element in filterfalse(seen.__contains__, iterable):

            seen_add(element)

            yield element

    else:

        for element in iterable:

            k = key(element)

            if k not in seen:

                seen_add(k)

                yield element

def dupfilter(file_initial, file_final):

    with open(file_initial) as in_file, open(file_final, "w") as out_file:

        out_file.writelines(unique_everseen(in_file))

edited Nov 27 at 13:34

answered Nov 27 at 13:17

Graipher

22.5k53384

def wordcount(file_name):

    """count the number of words in the file"""

    with open(file_name) as f:

        return sum(len(line.split()) for line in f) 



def dupfilter(file_initial, file_final):

    with open(file_initial) as in_file, open(file_final, "w") as out_file:

        out_file.writelines(set(in_file.readlines()))



if __name__ == '__main__':

    while True:

        file_initial = input("What is the name of the text?")

        file_final = input("How do you want to name the filtered file?")

        if file_initial and file_final and file_initial != file_final:

            break



    num_words_initial = wordcount(file_initial)

    dupfilter(file_initial, file_final)

    num_words_final = wordcount(file_final)



    print("Number of duplicates filtered:", num_words_initial - num_words_final)

    input("nPress <ENTER> to quit the program.")

You also don't need to call str on things to be printed, print does that for you if necessary.

Note that using set does not guarantee the same order as the original file, for that you would want to use the itertools recipe unique_everseen:

from itertools import filterfalse



def unique_everseen(iterable, key=None):

    "List unique elements, preserving order. Remember all elements ever seen."

    # unique_everseen('AAAABBBCCDAABBB') --> A B C D

    # unique_everseen('ABBCcAD', str.lower) --> A B C D

    seen = set()

    seen_add = seen.add

    if key is None:

        for element in filterfalse(seen.__contains__, iterable):

            seen_add(element)

            yield element

    else:

        for element in iterable:

            k = key(element)

            if k not in seen:

                seen_add(k)

                yield element

def dupfilter(file_initial, file_final):

    with open(file_initial) as in_file, open(file_final, "w") as out_file:

        out_file.writelines(unique_everseen(in_file))

edited Nov 27 at 13:34

answered Nov 27 at 13:17

Graipher

22.5k53384

edited Nov 27 at 13:34

answered Nov 27 at 13:17

Graipher

22.5k53384

answered Nov 27 at 13:17

Graipher

22.5k53384

answered Nov 27 at 13:17

Graipher

22.5k53384

1

Nice section on itertools instead of set!
– Julien Rousé
Nov 27 at 13:43

Thanks a lot your help is really precious!
– Thewizy
Nov 27 at 15:07

1

I didn't know that set could not give the same order when filtering the duplicate thanks!
– Thewizy
Nov 27 at 15:16

add a comment |

1

Nice section on itertools instead of set!
– Julien Rousé
Nov 27 at 13:43

Thanks a lot your help is really precious!
– Thewizy
Nov 27 at 15:07

1

I didn't know that set could not give the same order when filtering the duplicate thanks!
– Thewizy
Nov 27 at 15:16

Nice section on itertools instead of set!
– Julien Rousé
Nov 27 at 13:43

Thanks a lot your help is really precious!
– Thewizy
Nov 27 at 15:07

I didn't know that set could not give the same order when filtering the duplicate thanks!
– Thewizy
Nov 27 at 15:16

add a comment |

up vote
4
down vote

Thanks for sharing your code!

I'll begin by pointing out you don't need global variable, and you should avoid them when possible. Very often it only does make your code less readable and more fragile.

Here your function wordcount_initial could be rewritten as: (same idea for wordcount_final)

def wordcount_initial(input_file):

    """Return the number of words in the file given in parameter



       :param input_file: A file name

       :type input_file: string 

       :return: The number of words in the file `input_file`

       :rtype: int

    """

    num_words_initial = 0

    with open(input_file, 'r') as f:

        for line in f:

            words = line.split()

            num_words_initial += len(words)

    return num_words_initial

There are a few changes here:

Removed num_words_initial as a global and return it's value at the end of the function. It's much more clean to return value that way when you can. It also help when you want to test your functions.

Gives input_file as a parameter of your function instead of relying on another global. It makes your function more reusable.

And I transformed your comment in docstring that can be used to generate documentation for your code. See ReStrusctured and sphinx for more information. Ideally every function should be documented (and every module too)

Naming is always a bit subjective, use your best judgement.

And finally in your __main__ you could have put the logic for initialising the name of the file into another function but that's not very important.

On a more positive note I like how you laid out your code, it is well spaced, most of the name are clear when you read them and you have comment which is often important.

Nice job!

edited Nov 27 at 14:13

answered Nov 27 at 13:37

Julien Rousé

679517

Thank a lot for your help i will try to apply all thoses advice to my futur programs!
– Thewizy
Nov 27 at 15:08

add a comment |

up vote
4
down vote

Thanks for sharing your code!

I'll begin by pointing out you don't need global variable, and you should avoid them when possible. Very often it only does make your code less readable and more fragile.

Here your function wordcount_initial could be rewritten as: (same idea for wordcount_final)

def wordcount_initial(input_file):

    """Return the number of words in the file given in parameter



       :param input_file: A file name

       :type input_file: string 

       :return: The number of words in the file `input_file`

       :rtype: int

    """

    num_words_initial = 0

    with open(input_file, 'r') as f:

        for line in f:

            words = line.split()

            num_words_initial += len(words)

    return num_words_initial

There are a few changes here:

Removed num_words_initial as a global and return it's value at the end of the function. It's much more clean to return value that way when you can. It also help when you want to test your functions.

Gives input_file as a parameter of your function instead of relying on another global. It makes your function more reusable.

And I transformed your comment in docstring that can be used to generate documentation for your code. See ReStrusctured and sphinx for more information. Ideally every function should be documented (and every module too)

Naming is always a bit subjective, use your best judgement.

And finally in your __main__ you could have put the logic for initialising the name of the file into another function but that's not very important.

On a more positive note I like how you laid out your code, it is well spaced, most of the name are clear when you read them and you have comment which is often important.

Nice job!

edited Nov 27 at 14:13

answered Nov 27 at 13:37

Julien Rousé

679517

Thank a lot for your help i will try to apply all thoses advice to my futur programs!
– Thewizy
Nov 27 at 15:08

add a comment |

up vote
4
down vote

Thanks for sharing your code!

I'll begin by pointing out you don't need global variable, and you should avoid them when possible. Very often it only does make your code less readable and more fragile.

Here your function wordcount_initial could be rewritten as: (same idea for wordcount_final)

def wordcount_initial(input_file):

    """Return the number of words in the file given in parameter



       :param input_file: A file name

       :type input_file: string 

       :return: The number of words in the file `input_file`

       :rtype: int

    """

    num_words_initial = 0

    with open(input_file, 'r') as f:

        for line in f:

            words = line.split()

            num_words_initial += len(words)

    return num_words_initial

There are a few changes here:

Removed num_words_initial as a global and return it's value at the end of the function. It's much more clean to return value that way when you can. It also help when you want to test your functions.

Gives input_file as a parameter of your function instead of relying on another global. It makes your function more reusable.

And I transformed your comment in docstring that can be used to generate documentation for your code. See ReStrusctured and sphinx for more information. Ideally every function should be documented (and every module too)

Naming is always a bit subjective, use your best judgement.

And finally in your __main__ you could have put the logic for initialising the name of the file into another function but that's not very important.

On a more positive note I like how you laid out your code, it is well spaced, most of the name are clear when you read them and you have comment which is often important.

Nice job!

edited Nov 27 at 14:13

answered Nov 27 at 13:37

Julien Rousé

679517

Thanks for sharing your code!

I'll begin by pointing out you don't need global variable, and you should avoid them when possible. Very often it only does make your code less readable and more fragile.

Here your function wordcount_initial could be rewritten as: (same idea for wordcount_final)

def wordcount_initial(input_file):

    """Return the number of words in the file given in parameter



       :param input_file: A file name

       :type input_file: string 

       :return: The number of words in the file `input_file`

       :rtype: int

    """

    num_words_initial = 0

    with open(input_file, 'r') as f:

        for line in f:

            words = line.split()

            num_words_initial += len(words)

    return num_words_initial

There are a few changes here:

Removed num_words_initial as a global and return it's value at the end of the function. It's much more clean to return value that way when you can. It also help when you want to test your functions.

Gives input_file as a parameter of your function instead of relying on another global. It makes your function more reusable.

And I transformed your comment in docstring that can be used to generate documentation for your code. See ReStrusctured and sphinx for more information. Ideally every function should be documented (and every module too)

Naming is always a bit subjective, use your best judgement.

And finally in your __main__ you could have put the logic for initialising the name of the file into another function but that's not very important.

On a more positive note I like how you laid out your code, it is well spaced, most of the name are clear when you read them and you have comment which is often important.

Nice job!

edited Nov 27 at 14:13

answered Nov 27 at 13:37

Julien Rousé

679517

edited Nov 27 at 14:13

answered Nov 27 at 13:37

Julien Rousé

679517

answered Nov 27 at 13:37

Julien Rousé

679517

answered Nov 27 at 13:37

Julien Rousé

679517

Thank a lot for your help i will try to apply all thoses advice to my futur programs!
– Thewizy
Nov 27 at 15:08

add a comment |

Thank a lot for your help i will try to apply all thoses advice to my futur programs!
– Thewizy
Nov 27 at 15:08

Thank a lot for your help i will try to apply all thoses advice to my futur programs!
– Thewizy
Nov 27 at 15:08

add a comment |

draft saved

draft discarded

Thanks for contributing an answer to Code Review Stack Exchange!

Please be sure to answer the question. Provide details and share your research!

But avoid …

Asking for help, clarification, or responding to other answers.

Making statements based on opinion; back them up with references or personal experience.

Use MathJax to format equations. MathJax reference.

To learn more, see our tips on writing great answers.

Some of your past answers have not been well-received, and you're in danger of being blocked from answering.

Please pay close attention to the following guidance:

Please be sure to answer the question. Provide details and share your research!

But avoid …

Asking for help, clarification, or responding to other answers.

Making statements based on opinion; back them up with references or personal experience.

To learn more, see our tips on writing great answers.

draft saved

draft discarded

Sign up or log in

StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Post as a guest

Name

Required, but never shown

Post as a guest

Name

Required, but never shown

Sign up or log in

StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Post as a guest

Name

Required, but never shown

Sign up or log in

StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Post as a guest

Name

Required, but never shown

Sign up or log in

StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Post as a guest

Name

Required, but never shown

Name

Required, but never shown

Name

Required, but never shown

This page is only for reference, If you need detailed information, please check here

搜尋此網誌

Vrftjkry