Categories
Uncategorized

Key words in directory – Python

This script reads all the text files in a directory and outputs a list of all the words and how many times each one appears.

from re import findall,sub
from os import listdir
from collections import Counter

# Word-frequency script: reads every '.txt' file in str_dir_folder, counts
# how often each word (a run of \w characters) occurs across all of them,
# and writes one '"word",count' line per distinct word to str_output_file.

# path to folder containing all the files
# NOTE: must end with a path separator, because file names are appended to
# it by plain string concatenation below
str_dir_folder = 'theTextFiles/'

# name and location of output file
str_output_file = 'word_count.txt'

# the list where the contents of each text file will be placed
list_file_data = []

# loop through all the files in the directory
for str_each_file in listdir(str_dir_folder):
    if str_each_file.endswith('.txt'):

        # open file and read
        with open(str_dir_folder + str_each_file, 'r') as file_r_data:
            str_file_data = file_r_data.read()

        # add data to list
        list_file_data.append(str_file_data)

# join the files with a space so the last word of one file cannot fuse
# with the first word of the next
str_full_data = ' '.join(list_file_data)

# turn tabs and newlines into spaces; a tab must become a separator rather
# than be deleted, otherwise tab-separated words ('foo\tbar') would be
# merged into a single word ('foobar') and miscounted
str_clean_data = sub(r'[\t\n]', ' ', str_full_data)

# find all the words and put them into a list
# (raw string so that \w reaches the regex engine as written)
list_all_words = findall(r'\w+', str_clean_data)

# dictionary with all the times a word has been used
dict_word_count = Counter(list_all_words)

# put data in a list, ready for output file: one '"word",count' line each
list_output_data = [
    '"%s",%d' % (str_word, int_freq)
    for str_word, int_freq in dict_word_count.items()
]

# create output file and write data; the with-block closes the file even
# if the write fails
with open(str_output_file, 'w') as file_w_output:
    file_w_output.write('\n'.join(list_output_data))

Leave a Reply

Please log in using one of these methods to post your comment:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out /  Change )

Google photo

You are commenting using your Google account. Log Out /  Change )

Twitter picture

You are commenting using your Twitter account. Log Out /  Change )

Facebook photo

You are commenting using your Facebook account. Log Out /  Change )

Connecting to %s