File size: 26,489 Bytes
086b7b6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cc2112a
086b7b6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
import numpy as np
import nltk
import re
import spacy
import pandas as pd
from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder
from nltk.util import ngrams

import transformers
from transformers import pipeline
from transformers import logging
logging.set_verbosity_error()

import warnings
warnings.filterwarnings('ignore')

def preprocess(sentences, n):
    """Lower-case each sentence and strip punctuation.

    Each element of *sentences* is a one-element sequence holding the raw text.
    When ``n == 1`` every sentence is word-tokenized and a list of token lists
    is returned; otherwise punctuation is removed with a regex and a numpy
    array of single-string rows is returned.
    """
    processed = []
    for entry in sentences:
        text = entry[0].lower()
        if n == 1:
            # word-level tokens, punctuation dropped by the tokenizer pattern
            processed.append(nltk.RegexpTokenizer(r'\w+').tokenize(text))
        else:
            # keep the sentence whole, just erase punctuation characters
            processed.append([re.sub(r'[^\w\s]', '', text)])
    return processed if n == 1 else np.array(processed)

# For each aligned slot, the size of the smallest n-gram across all rows.
def get_gram_lentgh(uncommon_str_i):
    """Return, per column, the minimum chunk length over all uncommon lists.

    A list entry counts as its length; a bare string counts as 1.
    """
    lens = []
    for col in range(len(uncommon_str_i[0])):
        sizes = [
            len(row[col]) if isinstance(row[col], list) else 1
            for row in uncommon_str_i
        ]
        lens.append(min(sizes))
    return lens

# Rebuild the original token sequence by filling each "#" placeholder.
def get_og_sentence_vector(uncommon_str, common_sentence):
    """Expand *common_sentence*'s ``#`` slots with the matching uncommon parts.

    ``uncommon_str[i]`` (a token list or single token) replaces the i-th ``#``.
    """
    vector = []
    slot = 0
    for token in common_sentence.split():
        if token != "#":
            vector.append(token)
            continue
        part = uncommon_str[slot]
        if isinstance(part, list):
            vector.extend(part)
        else:
            vector.append(part)
        slot += 1
    return vector

def init_list_of_lists(lenght):
    """Return a list of *lenght* independent (non-aliased) empty lists."""
    return [[] for _ in range(lenght)]

# Drop every occurrence of a value from a list, mutating it in place.
def remove_all(liste, value):
    """Remove all items equal to *value* from *liste* in place; return it."""
    liste[:] = [item for item in liste if item != value]
    return liste

def ngram_distribution(uncommon_str_i, common_sentence):
    """Regroup each row's uncommon chunks into PMI-ranked bigrams until every
    row has the same number of chunks per slot as the smallest row.

    Parameters:
        uncommon_str_i: list of rows, one per sentence; each row is a list of
            uncommon parts (token lists or single tokens) aligned with the
            ``#`` slots of *common_sentence*.
        common_sentence: the shared sentence with ``#`` placeholders.

    Returns:
        A new list of rows where each slot holds a list of merged chunks.

    NOTE(review): the merge loop mutates ``unc_str`` by index while scanning
    bigrams — statement order matters; do not reorder.
    """
    # Initialize the list of lists that will contain the n-grams
    final_uncommon_str_i = init_list_of_lists(len(uncommon_str_i))

    nb_unc_str = 0

    lens = get_gram_lentgh(uncommon_str_i) # get the length of the smallest n grams

    for uncommon_str in uncommon_str_i:
        for i in range(len(uncommon_str)):
            # Make a copy of the current list of the current uncommon part for string 1
            unc_str = uncommon_str[i].copy() if type(uncommon_str[i]) == list else [uncommon_str[i]]
            og_sentence = get_og_sentence_vector(uncommon_str, common_sentence)
            temp_uncommon = uncommon_str[i].copy() if type(uncommon_str[i]) == list else [uncommon_str[i]]
            while len(unc_str) > lens[i]:
                
                bigram_measures = BigramAssocMeasures()

                # Variable containing the common words that won't allowed in the bigrams
                common_words_str = list(set(og_sentence) - set(unc_str))

                # Generate a list of all n-grams of size n for the sentence
                n_grams_str = list(ngrams(og_sentence, 2))
                
                # Use the bigram collocation finder to get the best bigrams for the sentence
                finder_str = BigramCollocationFinder.from_words(og_sentence)
                best_bigrams_str = finder_str.nbest(bigram_measures.pmi, len(n_grams_str))
                
                # Filter out bigrams that contain common words from the current list of uncommon words
                best_uncommon_ngrams_str = [ngram for ngram in best_bigrams_str if (not any(p_ngrams in ngram for p_ngrams in common_words_str))]
                
                # Generate the final list of uncommon n-grams for string 1 by filtering the filtered bigrams and remaining uncommon words
                uncommon_ngrams_str = [''] * len(unc_str)
                count1 = len(unc_str)
                count2 = 0
                # We loop through the best uncommon n-grams and check if they are in the uncommon words list
                for b in best_uncommon_ngrams_str:
                    if b[0] in unc_str and b[1] in unc_str: # if both words are in the uncommon words list
                        uncommon_ngrams_str[unc_str.index(b[0])] = " ".join(list(b)) # we add the n-gram to the final list
                        count2 += 1 # we increment the number of uncommon n-grams in the final list
                        # we remove the words of the bi-gram from the uncommon words list
                        unc_str[unc_str.index(b[0])] = '' 
                        unc_str[unc_str.index(b[1])] = ''
                        count1 -= 2 # we decrement the number of uncommon words in the uncommon words list
                    if count1 + count2 == lens[i]: # if we have the number of uncommon n-grams we want
                        break
                if unc_str != [""] * len(unc_str): # if there are still uncommon words left
                    for j in range(len(unc_str)):
                        if unc_str[j] != '':
                            uncommon_ngrams_str[j] = unc_str[j] # we add the uncommon words left to the final list
                uncommon_ngrams_str = remove_all(uncommon_ngrams_str, '') # we remove the empty strings from the final list
                unc_str = uncommon_ngrams_str.copy() # we update the current list of uncommon words
                og_sentence = unc_str.copy() # next pass runs collocations over the merged chunks only
            
            final_uncommon_str_i[nb_unc_str].append(unc_str) # we add the final list of uncommon n-grams to the final list of lists
        nb_unc_str += 1 # we increment the number of uncommon parts
    return final_uncommon_str_i

# Collapse every run of consecutive "#" tokens into a single "#".
def shrink(sentence):
    """Return *sentence* with each sequence of "#" tokens reduced to one."""
    kept = []
    previous_was_hash = False
    for token in sentence.split():
        if token == "#" and previous_was_hash:
            continue  # skip repeated placeholder
        kept.append(token)
        previous_was_hash = token == "#"
    return " ".join(kept)

def flatten(final_uncommon_str):
    """Flatten each row one level: a list of chunk-lists becomes one flat list."""
    return [
        [item for chunk in row for item in chunk]
        for row in final_uncommon_str
    ]

# Build the longest-common-subsequence DP table.
def init_matrix(temp_sentence, sentences, lenght, l):
    """Return the (len(temp_sentence)+1) x (lenght+1) LCS table between
    *temp_sentence* and ``sentences[l]``, filled by dynamic programming.
    """
    other = sentences[l]
    rows = len(temp_sentence)
    # row 0 / column 0 stay zero: LCS against an empty prefix is empty
    L = [[0] * (lenght + 1) for _ in range(rows + 1)]
    for i in range(1, rows + 1):
        for j in range(1, lenght + 1):
            if temp_sentence[i - 1] == other[j - 1]:
                # matching tokens extend the diagonal subsequence
                L[i][j] = L[i - 1][j - 1] + 1
            else:
                # otherwise carry the best of dropping either token
                L[i][j] = max(L[i - 1][j], L[i][j - 1])
    return L

# init list of lists
# NOTE(review): duplicate of the definition earlier in this file; Python keeps
# the later binding. Kept in place to avoid changing module structure.
def init_list_of_lists(lenght):
    """Return *lenght* fresh, independent empty lists."""
    result = []
    for _ in range(lenght):
        result.append(list())
    return result

# remove all the occurrences of a value in a list
# NOTE(review): duplicate of the definition earlier in this file; Python keeps
# the later binding. Kept in place to avoid changing module structure.
def remove_all(liste, value):
    """Strip every occurrence of *value* from *liste* in place; return it."""
    try:
        while True:
            liste.remove(value)
    except ValueError:
        pass  # no occurrences left
    return liste

# index of the last occurrence of an element in a list, or -1
def get_last(liste, element):
    """Return the highest index where *element* appears in *liste*, else -1."""
    for idx in range(len(liste) - 1, -1, -1):
        if liste[idx] == element:
            return idx
    return -1


def common_and_uncommon_extraction(sentences):
    """Fold all tokenized sentences into one common sentence plus per-sentence
    uncommon substrings, via repeated LCS backtracking.

    Parameters:
        sentences: list of token lists (output of ``preprocess(..., 1)``).

    Returns:
        (common_sentence, uncommon_str_i): the shared sentence with each
        divergence replaced by ``[MASK]``, and one aligned list of uncommon
        substrings per input sentence.

    NOTE(review): the backtracking bookkeeping (trackers, temp lists, '#'
    re-substitution) is highly order-dependent; comments describe intent but
    statements must not be reordered.
    """
    lens = [len(s) for s in sentences]

    # initialize the uncommon substring lists
    uncommon_str_i = init_list_of_lists(len(sentences))

    temp_sentence = sentences[0]
    for l in range(1, len(sentences)):
        # initialize the L matrix
        L = init_matrix(temp_sentence, sentences, lens[l], l)

        # calculate the index based on the length of the longer string
        index = len(temp_sentence) + lens[l]

        # initialize the common list with empty strings
        common = [""] * (index + 1)
        common[index] = ""

        # set i and j to the end of each string
        i = len(temp_sentence)
        j = lens[l]
        limit = abs(i - j)

        # trackers to follow the uncommon substrings position
        tracker_str1 = -1 
        tracker_str2 = -1
        # lists that save a sequence of uncommon substrings
        sub_uncommon_str = []
        sub_uncommon = []
        # final list that contains all the uncommon substrings
        sub_uncommon_str_i_temp = []
        sub_uncommon_str_temp = init_list_of_lists(len(sentences))

        # loop through the L matrix to find the common and uncommon substrings
        while i > 0 and j > 0:
            
            # if the characters match, add the character to the common list and move to the previous diagonal cell
            dist = abs(i - j)
            if temp_sentence[i - 1] == sentences[l][j - 1] and dist <= limit:
                common[index - 1] = temp_sentence[i - 1]
                i -= 1
                j -= 1
                index -= 1
            # if the length of the substring from the previous column is greater, add the uncommon character to uncommon_str list and move to the previous column
            elif L[i - 1][j] < L[i][j - 1]:
                if tracker_str1 == -1: # if the tracker is -1, it means that the substring is the first one
                    tracker_str1 = j - 1
                    sub_uncommon_str.append(sentences[l][j - 1]) # add the uncommon character to the list
                elif tracker_str1 == j: # if the tracker is equal to the current index, it means that the substring is part of the same sequence
                    sub_uncommon_str.append(sentences[l][j - 1]) # add the uncommon character to the sequence list
                    tracker_str1 = j - 1
                else: # if the tracker is not equal to the current index, it means that the substring is part of a different sequence
                    sub_uncommon_str.reverse() 
                    # add the sequence to the final list
                    none_index = get_last(uncommon_str_i[l], "")
                    if none_index != -1:
                        uncommon_str_i[l][none_index] = sub_uncommon_str if len(sub_uncommon_str) > 1 else sub_uncommon_str[0] # add the uncommon string to the new sequence list
                    else : uncommon_str_i[l].append(sub_uncommon_str if len(sub_uncommon_str) > 1 else sub_uncommon_str[0])
                    sub_uncommon_str = [] # reset the sequence list
                    tracker_str1 = j - 1 # reset the tracker to the first uncommon string of the new sequence
                    sub_uncommon_str.append(sentences[l][j - 1]) # add the uncommon string to the new sequence list

                j -= 1 # move to the previous column
                common[index - 1] = "#"
                index -= 1
            # if the length of the substring from the previous row is greater, add the uncommon character to uncommon_str2 list and move to the previous row
            else:
                if tracker_str2 == -1: # if the tracker is -1, it means that the substring is the first one
                    tracker_str2 = i - 1
                    sub_uncommon.append(temp_sentence[i - 1]) # add the uncommon character to the list
                elif tracker_str2 == i: # if the tracker is equal to the current index, it means that the substring is part of the same sequence
                    sub_uncommon.append(temp_sentence[i - 1]) # add the uncommon character to the sequence list
                    tracker_str2 = i - 1
                else: # if the tracker is not equal to the current index, it means that the substring is part of a different sequence
                    sub_uncommon.reverse()
                    if l == 1: # if the index point to the second string, it means we are dealing with the first string so we add the sequence to the final list 
                        uncommon_str_i[0].append(sub_uncommon if len(sub_uncommon) > 1 else sub_uncommon[0])
                    else: # else it means that we are dealing with the common sentence 
                        if '#' not in sub_uncommon: # if the sequence doesn't contain the # character, it means it is a new sequence so we add it to the final list directly
                            #sub_uncommon.reverse()
                            # we add the uncommon substring to all the uncommon parts of all the previous strings
                            for k in range(l):
                                sub_uncommon_str_temp[k].append(sub_uncommon if len(sub_uncommon) > 1 else sub_uncommon[0])
                        else: # if the sequence contains the # character, it means that it is a sequence that is part of a previous sequence so we need to update it
                            sub_uncommon_copy = sub_uncommon.copy()
                            # we add the uncommon substring to a temp list to not mess up the order of the final list
                            sub_uncommon_str_i_temp.append(sub_uncommon_copy if len(sub_uncommon_copy) > 1 else sub_uncommon_copy[0])
                            for k in range(l):
                                sub_uncommon_copy = sub_uncommon.copy()
                                uwu = 1
                                while "#" in sub_uncommon_copy and len(sub_uncommon_str_i_temp) - uwu < len(uncommon_str_i[k]): # we loop through the uncommon substring and replace the # character with the uncommon substring
                                    # we get the last uncommon substring of the previous string
                                    updated_uncommon_str = uncommon_str_i[k][len(sub_uncommon_str_i_temp) - uwu]
                                    if type(updated_uncommon_str) == list: # if the last uncommon substring is a list, it means that it is a sequence so we need to update it
                                        owo = len(updated_uncommon_str) - 1
                                        while owo >= 0: # we loop through the sequence and replace the # character with the uncommon substring
                                            if '#' in sub_uncommon_copy:
                                                ind = max(loc for loc, val in enumerate(sub_uncommon_copy) if val == '#')
                                                sub_uncommon_copy[ind] = updated_uncommon_str[owo]
                                            owo -= 1
                                    else:
                                        ind = sub_uncommon_copy.index("#")
                                        sub_uncommon_copy[ind] = updated_uncommon_str
                                    uwu -= 1
                                if "#" in sub_uncommon_copy:
                                    sub_uncommon_copy = remove_all(sub_uncommon_copy, '#') # we remove all the # characters that are left
                                sub_uncommon_str_temp[k].append(sub_uncommon_copy if len(sub_uncommon) > 1 else sub_uncommon_copy[0]) # we add the updated uncommon substring to the final list                   
                    sub_uncommon = [] # reset the sequence list
                    tracker_str2 = i - 1 # reset the tracker to the first uncommon string of the new sequence
                    sub_uncommon.append(temp_sentence[i - 1]) # add the uncommon string to the new sequence list
                    uncommon_str_i[l].append("")

                common[index - 1] = "#" # add the # character to the common substring to indicate that an uncommon substring is there
                index -= 1 # move to the previous row
                i  -= 1 # move to the next string
        
        if l == 1: # if the index point to the second string, it means we are dealing with the first string 
            if len(sub_uncommon) > 0: # if the length of the substring is greater than 0, it means that there is an uncommon substring left
                sub_uncommon.reverse()
                uncommon_str_i[0].append(sub_uncommon if len(sub_uncommon) > 1 else sub_uncommon[0]) # add the uncommon substring to the final list
        else: # else it means that we are dealing with the common sentence
            if len(sub_uncommon) > 0: # if the length of the substring is greater than 0, it means that there is an uncommon substring left
                if '#' not in sub_uncommon: # if the sequence doesn't contain the # character, it means it is a new sequence so we add it to the final list directly
                    sub_uncommon.reverse()
                    for k in range(l):
                        sub_uncommon_str_temp[k].append(sub_uncommon if len(sub_uncommon) > 1 else sub_uncommon[0])
                else: # if the sequence contains the # character, it means that it is a sequence that is part of a previous sequence so we need to update it
                    sub_uncommon.reverse()
                    for k in range(l):
                        sub_uncommon_copy = sub_uncommon.copy()
                        if len(sub_uncommon_copy) < 2: # if the length of the uncommon substring is less than 2, it means that it is a sequence of a single string so we just replace the # character with the uncommon substring
                            sub_uncommon_copy = uncommon_str_i[k][len(uncommon_str_i[k]) - 1][0] if type(uncommon_str_i[k][len(uncommon_str_i[k]) - 1]) == list else uncommon_str_i[k][len(uncommon_str_i[k]) - 1]
                        else: # if the length of the uncommon substring is greater than 2, it means that it is a sequence so we need to update it
                            uwu = 1
                            while "#" in sub_uncommon_copy and len(uncommon_str_i[k]) - uwu >= 0: # we loop through the uncommon substring and replace the # character with the uncommon substring
                                if type(uncommon_str_i[k][len(uncommon_str_i[k]) - uwu]) == list :
                                    # we loop through the terms of the sequence that needs to be updated and replace the # character with the uncommon substring
                                    for term in uncommon_str_i[k][len(uncommon_str_i[k]) - uwu]:
                                        if '#' in sub_uncommon_copy:
                                            ind = sub_uncommon_copy.index("#")
                                        sub_uncommon_copy[ind] = term
                                else: # if the last uncommon substring is not a list, it means that it is a sequence of a single string so we just replace the # character with the uncommon substring
                                    ind = sub_uncommon_copy.index("#")
                                    sub_uncommon_copy[ind] = uncommon_str_i[k][len(uncommon_str_i[k]) - 1]
                                uwu += 1
                        
                        
                        if type(uncommon_str_i[k][len(uncommon_str_i[k]) - 1][0]) == list : sub_uncommon_copy = remove_all(sub_uncommon_copy, "#") # we remove all the # characters that are left
                        sub_uncommon_str_temp[k].append(sub_uncommon_copy) # we add the updated uncommon substring to the final list
            # we add the uncommon substring to all the uncommon parts of all the previous strings
            for k in range(l):
                checking = shrink(" ".join(common)).split("#")
                nu = len(checking) - 1
                if temp_sentence[0] == "#":
                    nu += 1
                if len(sub_uncommon_str_temp[k]) < nu:
                    for q in range(0, len(uncommon_str_i[k]) - len(sub_uncommon_str_temp[k])):
                        sub_uncommon_str_temp[k].insert(0, uncommon_str_i[k][q])
                uncommon_str_i[k] = sub_uncommon_str_temp[k]
        
        if i != 0:
            temp_i = i
            sub_uncommon_str2 = [] # reset the sequence list
            while i > 0:
                sub_uncommon_str2.append(temp_sentence[i - 1])
                i -= 1
            sub_uncommon_str2.reverse()
            # add the sequence to the final list
            for k in range(l):
                if temp_i < len(temp_sentence):
                    if temp_sentence[temp_i] == "#":
                        f_unc = uncommon_str_i[k][len(uncommon_str_i[k]) - 1]
                        uncommon_str_i[k].remove(f_unc)
                        sub_uncommon_str2.extend(f_unc)
                uncommon_str_i[k].append(sub_uncommon_str2 if len(sub_uncommon_str2) > 1 else sub_uncommon_str2[0])
                uncommon_str_i[k] = remove_all(uncommon_str_i[k], "#")
            if common[0] != "#" and len(shrink(" ".join(common)).split("#")) < len(uncommon_str_i[0]):
                common.insert(0, "#")

        # we add the uncommon substring left to the current string
        if len(sub_uncommon_str) > 0:
            sub_uncommon_str.reverse()
            none_index = get_last(uncommon_str_i[l], "")
            if none_index != -1:
                uncommon_str_i[l][none_index] = sub_uncommon_str if len(sub_uncommon_str) > 1 else sub_uncommon_str[0] # add the uncommon string to the new sequence list
            else : uncommon_str_i[l].append(sub_uncommon_str if len(sub_uncommon_str) > 1 else sub_uncommon_str[0])
            if len(uncommon_str_i[l]) < len(uncommon_str_i[l - 1]):
                uncommon_str_i[l].append("")

        if j != 0:
            sub_uncommon_str = [] # reset the sequence list
            while len(uncommon_str_i[l]) + 1 > len(uncommon_str_i[l - 1]) and "" in uncommon_str_i[l]:
                uncommon_str_i[l].remove("")
            while j > 0:
                sub_uncommon_str.append(sentences[l][j - 1])
                j -= 1
            sub_uncommon_str.reverse()
            # add the sequence to the final list
            uncommon_str_i[l].append(sub_uncommon_str if len(sub_uncommon_str) > 1 else sub_uncommon_str[0])
            if common[0] != "#" and len(shrink(" ".join(common)).split("#")) < len(uncommon_str_i[0]):
                common.insert(0, "#")

        temp_sentence = remove_all(common.copy(), "") # we update the common sentence

        # pad every row with "" until all rows have the same slot count
        for rt in range(0, l):
            while len(uncommon_str_i[l]) != len(uncommon_str_i[rt]):
                if len(uncommon_str_i[l]) < len(uncommon_str_i[rt]):
                    uncommon_str_i[l].append("")
                else:
                    uncommon_str_i[rt].append("")

        if len(uncommon_str_i[l]) != len(shrink(" ".join(common)).split("#")) - 1:
            for rt in range(0, l+1):
                if len(uncommon_str_i[rt]) < len(shrink(" ".join(common)).split("#")) - 1:
                    uncommon_str_i[rt].append("")

        # N-gram distribution on the uncommon parts
        uncommon_str_i[0:l+1] = ngram_distribution(uncommon_str_i[0:l+1], shrink(" ".join(temp_sentence)))
        temp_sentence = shrink(" ".join(temp_sentence))

        # update the distribution of the uncommon parts based on the N-gram distribution
        for i in range(len(uncommon_str_i[0]), 0, -1):
            mask = "$ " * len(uncommon_str_i[0][i-1])
            temp_sentence = temp_sentence.replace("#", mask, 1)
        temp_sentence = temp_sentence.replace("$", "#")
        temp_sentence = temp_sentence.split(" ")
        temp_sentence = remove_all(temp_sentence, "")

    # join the common list into a sentence
    common_sentence = " ".join(temp_sentence)
    # replace the # character with the [MASK] token
    common_sentence = common_sentence.replace("#", "[MASK]")

    # reverse the order of the uncommon substring lists
    for i in range(len(uncommon_str_i)):
        uncommon_str_i[i].reverse()

    # return the common sentence and the lists of uncommon substrings
    return common_sentence, uncommon_str_i

def text_mining_algorithm(sentences):
    """Tokenize the raw sentences, then split them into common and uncommon parts.

    Returns the masked common sentence and the aligned uncommon-word lists.
    """
    tokens = preprocess(sentences, 1)
    return common_and_uncommon_extraction(tokens)

def similarity_analysis(masked_sentence, final_uncommon_str, nlp, fill_mask):
    """Resolve each [MASK] in turn by picking the candidate uncommon word most
    similar (via spaCy word vectors) to BERT's fill-mask predictions.

    Parameters:
        masked_sentence: sentence containing one or more "[MASK]" tokens.
        final_uncommon_str: flattened uncommon-word lists, one row per sentence.
        nlp: loaded spaCy language model used for similarity scoring.
        fill_mask: HuggingFace fill-mask pipeline.
    """
    slot = 0
    while "[MASK]" in masked_sentence:
        # MLM prediction for the (left-most) remaining mask
        predictions = fill_mask(masked_sentence)
        # with several masks the pipeline returns a list per mask — use the first
        first = predictions[0] if isinstance(predictions[0], list) else predictions
        candidates = pd.DataFrame(first)["token_str"].tolist()

        # the uncommon word each sentence proposes for this slot
        options = [row[slot] for row in final_uncommon_str]

        # average embedding similarity of each option against the predictions
        scores = [
            np.mean([nlp(word).similarity(nlp(option)) for word in candidates])
            for option in options
        ]

        # substitute the best-scoring option for the current mask
        best = options[int(np.argmax(scores))]
        masked_sentence = masked_sentence.replace("[MASK]", best, 1)
        slot += 1

    return masked_sentence

def text_combining(texts, nlp, fill_mask):
    """End-to-end pipeline: mine common/uncommon parts, then fill the masks."""
    masked, uncommon = text_mining_algorithm(texts)
    return similarity_analysis(masked, flatten(uncommon), nlp, fill_mask)

if __name__ == "__main__":
    # demo: combine three noisy variants of the same sentence
    spacy_model = spacy.load("en_core_web_md")
    mask_filler = pipeline("fill-mask", model="distilbert-base-uncased")
    examples = np.array([
        ["I love to pay my video games in my free time, especially retro video games."],
        ["I love to play oreo games in my free thyme, especially retro video games."],
        ["Ay live to slay video vames in my free time, especially utro video games."],
    ])
    print(text_combining(examples, spacy_model, mask_filler))