File size: 4,149 Bytes
16da076
 
 
 
 
 
 
 
 
 
 
73cc811
16da076
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73cc811
16da076
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
import pandas as pd
import numpy as np
import nltk, string
from nltk.corpus import gutenberg
from collections import Counter
from nltk.tokenize import word_tokenize
import random, string

def preprocess_new(text):
    """Normalize raw text into a list of lowercase tokens.

    Punctuation is stripped except for commas, which are kept as tokens
    so the n-gram model can learn where pauses occur.

    Args:
        text: raw text string to preprocess.

    Returns:
        list[str]: lowercased word tokens (commas retained as tokens).
    """
    # Collapse all runs of whitespace (including newlines) to single spaces.
    # BUGFIX: the original re-read gutenberg.raw('blake-poems.txt') here,
    # ignoring the `text` argument entirely; use the argument as intended.
    text = ' '.join(text.split())

    # Remove all punctuation except commas. (The former .replace('\n', '')
    # was a no-op: string.punctuation contains no newline.)
    punctuation_to_remove = string.punctuation.replace(',', '')
    translator = str.maketrans('', '', punctuation_to_remove)
    text = text.translate(translator)

    # Tokenize, then lowercase every token.
    tokens = word_tokenize(text)
    return [word.lower() for word in tokens]

def create_ngrams(tokens, n):
    """Return all contiguous n-grams of `tokens` as tuples, in order.

    Args:
        tokens: sequence of tokens.
        n: n-gram size (>= 1).

    Returns:
        list[tuple]: every contiguous window of n tokens.
    """
    # BUGFIX: use len(tokens) - n + 1 so the final window is included;
    # the original range(len(tokens) - n) silently dropped the last n-gram.
    return [tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]

def probability_helper(sample, n):
    """Build n-gram frequency tables for a token sample.

    Args:
        sample: list of tokens.
        n: n-gram size.

    Returns:
        tuple[pd.DataFrame, pd.DataFrame]: the frequency frame
        (sequence, count, nth_word) and a sorted copy carrying
        row-wise sequence_id / prediction_id columns.
    """
    # Count every n-gram occurrence in the sample.
    counts = Counter(tuple(gram) for gram in create_ngrams(sample, n))

    # One row per distinct n-gram.
    df = pd.DataFrame.from_dict(counts, orient='index').reset_index()
    df.columns = ['sequence', 'count']

    # The last token of each n-gram is the word being predicted.
    df['nth_word'] = df['sequence'].apply(lambda gram: gram[-1])

    def _context_key(gram):
        # Join the first n-1 tokens with commas, then turn the commas
        # into spaces (note: a literal ',' token thus becomes a space).
        return ','.join(gram[:-1]).replace(',', ' ')

    # Replace the tuple with its (n-1)-word context string.
    df['sequence'] = df['sequence'].apply(_context_key)

    # Assign row ids: first in context order, then in predicted-word order.
    df_sorted = df.sort_values(by='sequence')
    df_sorted['sequence_id'] = range(1, len(df_sorted) + 1)
    df_sorted = df_sorted.sort_values(by='nth_word')
    df_sorted['prediction_id'] = range(1, len(df_sorted) + 1)

    return df, df_sorted

def get_probability(sample, n, type=None):
    """Compute conditional next-word probabilities for an n-gram model.

    Args:
        sample: list of tokens.
        n: n-gram size.
        type: None for maximum-likelihood estimates, or "smooth" for
            add-one (Laplace) smoothing. (Name shadows the builtin but is
            kept for backward compatibility with existing callers.)

    Returns:
        pd.DataFrame: one row per n-gram with a 'probability' column.

    Raises:
        ValueError: if `type` is neither None nor "smooth". (The original
            crashed with UnboundLocalError on any other value.)
    """
    if type is not None and type != "smooth":
        raise ValueError(f"unknown probability type: {type!r}")

    df, df_sorted = probability_helper(sample, n)

    # Total occurrences of each (n-1)-word context (shared by both modes;
    # the original duplicated these three lines in each branch).
    totals = df.groupby('sequence')['count'].sum().reset_index().rename(columns={'count': 'total'})
    df_sorted = df_sorted.merge(totals, how='left', on='sequence')

    if type == "smooth":
        # Add-one smoothing. NOTE(review): prediction_id is assigned
        # per-row, so its max is the number of distinct n-grams, not the
        # vocabulary size -- confirm this is the intended `v`.
        v = df_sorted['prediction_id'].max()
        df_sorted['probability'] = (df_sorted['count'] + 1) / (df_sorted['total'] + v)
    else:
        df_sorted['probability'] = df_sorted['count'] / df_sorted['total']

    return df_sorted

def predict(data, sequence):
    """Pick the most probable next word for a context sequence.

    Args:
        data: DataFrame from get_probability() with 'sequence',
            'nth_word' and 'probability' columns.
        sequence: the (n-1)-word context string (surrounding whitespace
            is ignored).

    Returns:
        str: the nth_word with the highest probability for the context,
        or a uniformly random known word when the context was never seen.
    """
    subset = data[data['sequence'] == sequence.strip()]
    if subset.empty:
        # Unseen context: fall back to a random known word. (Replaces the
        # original bare `except:`, which also hid unrelated real errors.)
        return random.choice(data['nth_word'].unique())
    # Return the word with the maximum probability within the context.
    return subset.iloc[subset['probability'].argmax()]['nth_word']

def generate_sentence(data, sequence, n, len):
    """Extend a seed sequence by repeatedly predicting the next word.

    Args:
        data: DataFrame produced by get_probability().
        sequence: seed context, ideally n-1 words.
        n: n-gram size the model was built with.
        len: number of words to generate (shadows the builtin, but the
            name is kept for backward compatibility with callers).

    Returns:
        str: the seed followed by the generated words.
    """
    sentence = sequence.strip()
    for _ in range(len):
        # Only the trailing n-1 words form the prediction context.
        context = ' '.join(sentence.split(" ")[-n + 1:])
        word = predict(data, context)
        # Commas attach directly to the previous word; other words get a
        # separating space.
        separator = '' if word in (',', '\n') else ' '
        sentence = sentence + separator + word
    return sentence


'''
files = gutenberg.fileids()
text = [gutenberg.raw(fileid) for fileid in gutenberg.fileids()]
file_text = dict(zip(files, text))

for key, value in file_text.items():
    file_text[key] = preprocess_new(value)'''