import random
import string
from collections import Counter

import pandas as pd
import numpy as np
import nltk
from nltk.corpus import gutenberg
from nltk.tokenize import word_tokenize


def preprocess_new(text):
    """Collapse whitespace, strip punctuation (keeping commas), tokenize and lower-case."""
    text = ' '.join(text.split())
    # Remove punctuation except for commas
    punctuation_to_remove = string.punctuation.replace(',', '')  # keep commas
    translator = str.maketrans('', '', punctuation_to_remove)
    text = text.translate(translator)
    # Tokenize and lower-case
    tokens = word_tokenize(text)
    tokens = [word.lower() for word in tokens]
    return tokens


def create_ngrams(tokens, n):
    """Return the list of n-grams (as tuples) in token order."""
    n_gram_tokens = []
    for i in range(len(tokens) - n + 1):  # +1 so the final n-gram is included
        n_gram_tokens.append(tuple(tokens[i:i + n]))
    return n_gram_tokens


def probability_helper(sample, n):
    """
    sample: list of tokens
    n: n-gram size
    return: (raw count dataframe, dataframe with sequence and prediction ids)
    """
    # Get n-grams and their frequencies
    ngrams_sample = create_ngrams(sample, n)
    ngram_frequency = Counter(ngrams_sample)
    # Build a dataframe of n-gram counts
    df = pd.DataFrame.from_dict(ngram_frequency, orient='index').reset_index()
    df.columns = ['sequence', 'count']
    # Split each n-gram into the (n-1)-word context and the nth word
    df['nth_word'] = df['sequence'].apply(lambda x: x[-1])

    def get_sequence(ngram):
        # Join the first n-1 tokens into a space-separated context string
        return ' '.join(ngram[:-1])

    df['sequence'] = df['sequence'].apply(get_sequence)
    # Assign ids to sequences and predictions
    df_sorted = df.sort_values(by='sequence')
    df_sorted['sequence_id'] = range(1, len(df_sorted) + 1)
    df_sorted = df_sorted.sort_values(by='nth_word')
    df_sorted['prediction_id'] = range(1, len(df_sorted) + 1)
    return df, df_sorted


def get_probability(sample, n, type=None):
    """Return a dataframe of conditional n-gram probabilities.

    type="smooth" applies add-one (Laplace) smoothing; otherwise raw relative
    frequencies are used.
    """
    df, df_sorted = probability_helper(sample, n)
    totals = (df.groupby('sequence')['count'].sum()
                .reset_index()
                .rename(columns={'count': 'total'}))
    df_sorted = df_sorted.merge(totals, how='left', on='sequence')
    if type == "smooth":
        # Vocabulary size for add-one smoothing: number of distinct predicted words
        v = df_sorted['nth_word'].nunique()
        df_sorted['probability'] = (df_sorted['count'] + 1) / (df_sorted['total'] + v)
    else:
        df_sorted['probability'] = df_sorted['count'] / df_sorted['total']
    return df_sorted


def predict(data, sequence):
    """Return the most probable next word for a sequence seen in the data,
    or a random known word when the sequence was never observed."""
    subset = data[data['sequence'] == sequence.strip()]
    if not subset.empty:
        # Return the word with the highest probability for this context
        return subset.loc[subset['probability'].idxmax(), 'nth_word']
    return random.choice(list(data['nth_word'].unique()))


def generate_sentence(data, sequence, n, length):
    """
    data: result of get_probability()
    sequence: seed of n-1 words separated by spaces
    length: number of words to generate
    """
    sentence = sequence.strip()
    for _ in range(length):
        # Use the last n-1 words of the sentence so far as the next context
        n_minus_1_sequence = ' '.join(sentence.split(" ")[-(n - 1):])
        next_word = predict(data, n_minus_1_sequence)
        if next_word not in (',', '\n'):
            sentence = sentence + ' ' + next_word
        else:
            sentence += next_word
    return sentence


'''
files = gutenberg.fileids()
text = [gutenberg.raw(fileid) for fileid in gutenberg.fileids()]
file_text = dict(zip(files, text))
for key, value in file_text.items():
    file_text[key] = preprocess_new(value)
'''
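
# Example usage: a minimal sketch of the full pipeline. The corpus
# ('blake-poems.txt', the file already referenced above), the trigram order,
# the seed "the little" and the 10-word generation length are illustrative
# assumptions, not fixed by the functions themselves.
if __name__ == '__main__':
    nltk.download('punkt', quiet=True)      # tokenizer model for word_tokenize
    nltk.download('gutenberg', quiet=True)  # corpus used in this example

    # Quick sanity check of create_ngrams on a toy token list
    assert create_ngrams(['a', 'rose', 'is', 'a', 'rose'], 2) == \
        [('a', 'rose'), ('rose', 'is'), ('is', 'a'), ('a', 'rose')]

    tokens = preprocess_new(gutenberg.raw('blake-poems.txt'))
    trigram_table = get_probability(tokens, 3, type="smooth")
    print(generate_sentence(trigram_table, "the little", 3, 10))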