"""Combine several noisy variants of a sentence into one consensus sentence.

Pipeline:
  1. ``preprocess``                      -- lowercase + tokenize each input sentence.
  2. ``common_and_uncommon_extraction``  -- LCS-style alignment that yields a common
     skeleton (uncommon spots marked ``#`` / ``[MASK]``) plus, per input sentence,
     the list of uncommon fragments.
  3. ``ngram_distribution``              -- regroups uncommon words into bigrams (NLTK
     PMI collocations) so all sentences expose the same number of fragments per slot.
  4. ``similarity_analysis``             -- fills each ``[MASK]`` with the candidate
     fragment closest (spaCy vector similarity) to BERT's fill-mask predictions.

NOTE(review): this file was reconstructed from a whitespace-mangled source; the
nesting of ``common_and_uncommon_extraction`` was inferred from statement order
and data flow — confirm against the original project history if available.
"""

import re
import warnings

import numpy as np
import pandas as pd
import nltk
from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder
from nltk.util import ngrams
import transformers  # kept: imported by the original file even though unused directly
from transformers import pipeline
from transformers import logging
import spacy

logging.set_verbosity_error()
warnings.filterwarnings('ignore')


def preprocess(sentences, n):
    """Lowercase and clean each sentence.

    sentences : iterable of 1-element sequences, each holding one raw sentence.
    n         : 1 -> tokenize into word lists (punctuation dropped);
                otherwise -> strip punctuation but keep each sentence whole.

    Returns a list of token lists when n == 1, else a numpy array of
    single-sentence lists (mirrors the input shape).
    """
    new_sentences = []
    for sentence in sentences:
        sentence = sentence[0]
        sentence = sentence.lower()
        if n == 1:
            # \w+ tokenizer removes punctuation while splitting into words
            sentence = nltk.RegexpTokenizer(r'\w+').tokenize(sentence)
            new_sentences.append(sentence)
        else:
            sentence = re.sub(r'[^\w\s]', '', sentence)
            new_sentences.append([sentence])
    if n == 1:
        return new_sentences
    else:
        return np.array(new_sentences)


# NOTE(review): name keeps the original's "lentgh" typo so external callers,
# if any, are unaffected.
def get_gram_lentgh(uncommon_str_i):
    """For each uncommon slot, return the smallest fragment length across sentences.

    A fragment that is a list counts as len(list); a bare string counts as 1.
    """
    lens = []
    for i in range(len(uncommon_str_i[0])):
        temp = []
        for j in range(len(uncommon_str_i)):
            temp.append(len(uncommon_str_i[j][i]) if type(uncommon_str_i[j][i]) == list else 1)
        lens.append(min(temp))
    return lens


def get_og_sentence_vector(uncommon_str, common_sentence):
    """Rebuild one sentence as a flat word list.

    Walks ``common_sentence`` (space-separated, with ``#`` marking uncommon
    spots) and splices in the corresponding entries of ``uncommon_str``.
    """
    og_sentence_vector = []
    temp = common_sentence.split()
    i = 0
    for t in temp:
        if t == "#":
            if type(uncommon_str[i]) == list:
                og_sentence_vector.extend(uncommon_str[i])
            else:
                og_sentence_vector.append(uncommon_str[i])
            i += 1
        else:
            og_sentence_vector.append(t)
    return og_sentence_vector


# NOTE(review): the original file defined this function (and remove_all) twice
# with identical bodies; the duplicates were removed.
def init_list_of_lists(lenght):
    """Return a list of ``lenght`` independent empty lists."""
    list_of_lists = []
    for i in range(lenght):
        list_of_lists.append([])
    return list_of_lists


def remove_all(liste, value):
    """Remove every occurrence of ``value`` from ``liste`` in place and return it."""
    while value in liste:
        liste.remove(value)
    return liste


def ngram_distribution(uncommon_str_i, common_sentence):
    """Shrink each sentence's uncommon fragments down to a common slot count.

    For every slot ``i`` longer than the minimum length ``lens[i]``, repeatedly
    merges adjacent uncommon words into bigrams chosen by PMI collocation
    scoring until the fragment has ``lens[i]`` entries.

    Returns a new list-of-lists mirroring ``uncommon_str_i`` where every
    fragment is a list of (possibly multi-word) strings.
    """
    final_uncommon_str_i = init_list_of_lists(len(uncommon_str_i))
    nb_unc_str = 0
    # minimum fragment length per slot, across all sentences
    lens = get_gram_lentgh(uncommon_str_i)
    for uncommon_str in uncommon_str_i:
        for i in range(len(uncommon_str)):
            # working copy of the current uncommon fragment
            unc_str = uncommon_str[i].copy() if type(uncommon_str[i]) == list else [uncommon_str[i]]
            og_sentence = get_og_sentence_vector(uncommon_str, common_sentence)
            while len(unc_str) > lens[i]:
                bigram_measures = BigramAssocMeasures()
                # common words must not appear inside candidate bigrams
                common_words_str = list(set(og_sentence) - set(unc_str))
                n_grams_str = list(ngrams(og_sentence, 2))
                # best bigrams for the full sentence, ranked by PMI
                finder_str = BigramCollocationFinder.from_words(og_sentence)
                best_bigrams_str = finder_str.nbest(bigram_measures.pmi, len(n_grams_str))
                # drop bigrams touching any common word
                best_uncommon_ngrams_str = [ngram for ngram in best_bigrams_str
                                            if (not any(p_ngrams in ngram for p_ngrams in common_words_str))]
                uncommon_ngrams_str = [''] * len(unc_str)
                count1 = len(unc_str)  # uncommon words still unmerged
                count2 = 0             # bigrams produced so far
                for b in best_uncommon_ngrams_str:
                    if b[0] in unc_str and b[1] in unc_str:
                        # merge the pair into one space-joined fragment
                        uncommon_ngrams_str[unc_str.index(b[0])] = " ".join(list(b))
                        count2 += 1
                        # consume both words from the working list
                        unc_str[unc_str.index(b[0])] = ''
                        unc_str[unc_str.index(b[1])] = ''
                        count1 -= 2
                        if count1 + count2 == lens[i]:
                            # reached the target fragment count
                            break
                if unc_str != [""] * len(unc_str):
                    # carry over words that were not merged into a bigram
                    for j in range(len(unc_str)):
                        if unc_str[j] != '':
                            uncommon_ngrams_str[j] = unc_str[j]
                uncommon_ngrams_str = remove_all(uncommon_ngrams_str, '')
                unc_str = uncommon_ngrams_str.copy()
                og_sentence = unc_str.copy()
            final_uncommon_str_i[nb_unc_str].append(unc_str)
        nb_unc_str += 1
    return final_uncommon_str_i


def shrink(sentence):
    """Collapse every run of consecutive ``#`` tokens in ``sentence`` into one."""
    temp = sentence.split()
    b = False  # True while inside a run of '#'
    for i in range(len(temp)):
        if temp[i] == "#" and b:
            temp[i] = ""
        elif temp[i] == "#" and not b:
            b = True
        elif temp[i] != "#" and b:
            b = False
    while "" in temp:
        temp.remove("")
    return " ".join(temp)


def flatten(final_uncommon_str):
    """Flatten each sentence's list of fragment-lists into one flat list."""
    flatten_final_uncommon_str = []
    for i in range(len(final_uncommon_str)):
        flatten_final_uncommon_str.append([item for sublist in final_uncommon_str[i] for item in sublist])
    return flatten_final_uncommon_str


def init_matrix(temp_sentence, sentences, lenght, l):
    """Build the LCS dynamic-programming table between ``temp_sentence``
    and ``sentences[l]`` (``lenght`` == len(sentences[l]))."""
    L = [[0] * (lenght + 1) for _ in range(len(temp_sentence) + 1)]
    for i in range(len(temp_sentence) + 1):
        for j in range(lenght + 1):
            if i == 0 or j == 0:
                # an empty prefix shares nothing
                L[i][j] = 0
            elif temp_sentence[i - 1] == sentences[l][j - 1]:
                # matching tokens extend the common subsequence
                L[i][j] = L[i - 1][j - 1] + 1
            else:
                L[i][j] = max(L[i - 1][j], L[i][j - 1])
    return L


def get_last(liste, element):
    """Index of the last occurrence of ``element`` in ``liste``, or -1."""
    rev_list = liste.copy()
    rev_list.reverse()
    if element in rev_list:
        index = rev_list.index(element)
        return len(liste) - index - 1
    else:
        return -1


def common_and_uncommon_extraction(sentences):
    """Align all token lists and split them into a common skeleton + uncommon parts.

    Iteratively LCS-aligns the running common sentence against each further
    sentence, marking divergent spots with ``#`` and collecting each
    sentence's uncommon fragments in ``uncommon_str_i``.

    Returns ``(common_sentence, uncommon_str_i)`` where ``common_sentence``
    has ``[MASK]`` at every divergent position and ``uncommon_str_i[k]`` is
    sentence k's fragments (front-to-back order).
    """
    lens = [len(s) for s in sentences]
    uncommon_str_i = init_list_of_lists(len(sentences))
    temp_sentence = sentences[0]  # running common sentence (token list)
    for l in range(1, len(sentences)):
        L = init_matrix(temp_sentence, sentences, lens[l], l)
        # buffer large enough to hold both strings end to end
        index = len(temp_sentence) + lens[l]
        common = [""] * (index + 1)
        common[index] = ""
        # backtrack from the bottom-right corner of the LCS table
        i = len(temp_sentence)
        j = lens[l]
        limit = abs(i - j)
        # trackers follow the position of the current uncommon run
        tracker_str1 = -1
        tracker_str2 = -1
        # current runs of uncommon tokens (new sentence / common sentence side)
        sub_uncommon_str = []
        sub_uncommon = []
        sub_uncommon_str_i_temp = []
        sub_uncommon_str_temp = init_list_of_lists(len(sentences))
        while i > 0 and j > 0:
            dist = abs(i - j)
            if temp_sentence[i - 1] == sentences[l][j - 1] and dist <= limit:
                # tokens match: copy into the common buffer, move diagonally
                common[index - 1] = temp_sentence[i - 1]
                i -= 1
                j -= 1
                index -= 1
            elif L[i - 1][j] < L[i][j - 1]:
                # token unique to the new sentence (column move)
                if tracker_str1 == -1:
                    # first uncommon token seen
                    tracker_str1 = j - 1
                    sub_uncommon_str.append(sentences[l][j - 1])
                elif tracker_str1 == j:
                    # contiguous with the current run
                    sub_uncommon_str.append(sentences[l][j - 1])
                    tracker_str1 = j - 1
                else:
                    # gap: flush the finished run, then start a new one
                    sub_uncommon_str.reverse()
                    none_index = get_last(uncommon_str_i[l], "")
                    if none_index != -1:
                        # fill the placeholder slot reserved earlier
                        uncommon_str_i[l][none_index] = sub_uncommon_str if len(sub_uncommon_str) > 1 else sub_uncommon_str[0]
                    else:
                        uncommon_str_i[l].append(sub_uncommon_str if len(sub_uncommon_str) > 1 else sub_uncommon_str[0])
                    sub_uncommon_str = []
                    tracker_str1 = j - 1
                    sub_uncommon_str.append(sentences[l][j - 1])
                j -= 1
                common[index - 1] = "#"  # mark the divergence in the skeleton
                index -= 1
            else:
                # token unique to the running common sentence (row move)
                if tracker_str2 == -1:
                    tracker_str2 = i - 1
                    sub_uncommon.append(temp_sentence[i - 1])
                elif tracker_str2 == i:
                    sub_uncommon.append(temp_sentence[i - 1])
                    tracker_str2 = i - 1
                else:
                    # gap: flush the finished run
                    sub_uncommon.reverse()
                    if l == 1:
                        # first pass: the common sentence IS sentence 0
                        uncommon_str_i[0].append(sub_uncommon if len(sub_uncommon) > 1 else sub_uncommon[0])
                    else:
                        if '#' not in sub_uncommon:
                            # brand-new divergence: propagate to every earlier sentence
                            for k in range(l):
                                sub_uncommon_str_temp[k].append(sub_uncommon if len(sub_uncommon) > 1 else sub_uncommon[0])
                        else:
                            # run overlaps an earlier '#': splice in the older fragments
                            sub_uncommon_copy = sub_uncommon.copy()
                            sub_uncommon_str_i_temp.append(sub_uncommon_copy if len(sub_uncommon_copy) > 1 else sub_uncommon_copy[0])
                            for k in range(l):
                                sub_uncommon_copy = sub_uncommon.copy()
                                uwu = 1
                                while "#" in sub_uncommon_copy and len(sub_uncommon_str_i_temp) - uwu < len(uncommon_str_i[k]):
                                    # earlier fragment that the '#' stands for
                                    updated_uncommon_str = uncommon_str_i[k][len(sub_uncommon_str_i_temp) - uwu]
                                    if type(updated_uncommon_str) == list:
                                        # replace '#'s right-to-left with the list's words
                                        owo = len(updated_uncommon_str) - 1
                                        while owo >= 0:
                                            if '#' in sub_uncommon_copy:
                                                ind = max(loc for loc, val in enumerate(sub_uncommon_copy) if val == '#')
                                                sub_uncommon_copy[ind] = updated_uncommon_str[owo]
                                            owo -= 1
                                    else:
                                        ind = sub_uncommon_copy.index("#")
                                        sub_uncommon_copy[ind] = updated_uncommon_str
                                    uwu -= 1
                                if "#" in sub_uncommon_copy:
                                    sub_uncommon_copy = remove_all(sub_uncommon_copy, '#')
                                sub_uncommon_str_temp[k].append(sub_uncommon_copy if len(sub_uncommon) > 1 else sub_uncommon_copy[0])
                    sub_uncommon = []
                    tracker_str2 = i - 1
                    sub_uncommon.append(temp_sentence[i - 1])
                    # reserve a placeholder slot for the current sentence
                    uncommon_str_i[l].append("")
                common[index - 1] = "#"
                index -= 1
                i -= 1
        # ---- flush the run left over when the backtrack loop ends ----
        if l == 1:
            if len(sub_uncommon) > 0:
                sub_uncommon.reverse()
                uncommon_str_i[0].append(sub_uncommon if len(sub_uncommon) > 1 else sub_uncommon[0])
        else:
            if len(sub_uncommon) > 0:
                if '#' not in sub_uncommon:
                    sub_uncommon.reverse()
                    for k in range(l):
                        sub_uncommon_str_temp[k].append(sub_uncommon if len(sub_uncommon) > 1 else sub_uncommon[0])
                else:
                    # leftover run still references earlier fragments: splice them in
                    sub_uncommon.reverse()
                    for k in range(l):
                        sub_uncommon_copy = sub_uncommon.copy()
                        if len(sub_uncommon_copy) < 2:
                            # single '#': substitute the last fragment directly
                            sub_uncommon_copy = uncommon_str_i[k][len(uncommon_str_i[k]) - 1][0] if type(uncommon_str_i[k][len(uncommon_str_i[k]) - 1]) == list else uncommon_str_i[k][len(uncommon_str_i[k]) - 1]
                        else:
                            uwu = 1
                            while "#" in sub_uncommon_copy and len(uncommon_str_i[k]) - uwu >= 0:
                                if type(uncommon_str_i[k][len(uncommon_str_i[k]) - uwu]) == list:
                                    for term in uncommon_str_i[k][len(uncommon_str_i[k]) - uwu]:
                                        if '#' in sub_uncommon_copy:
                                            ind = sub_uncommon_copy.index("#")
                                            sub_uncommon_copy[ind] = term
                                else:
                                    ind = sub_uncommon_copy.index("#")
                                    sub_uncommon_copy[ind] = uncommon_str_i[k][len(uncommon_str_i[k]) - 1]
                                uwu += 1
                            if type(uncommon_str_i[k][len(uncommon_str_i[k]) - 1][0]) == list:
                                sub_uncommon_copy = remove_all(sub_uncommon_copy, "#")
                        sub_uncommon_str_temp[k].append(sub_uncommon_copy)
            # adopt the rebuilt fragment lists for every earlier sentence,
            # re-prepending any untouched leading fragments
            for k in range(l):
                checking = shrink(" ".join(common)).split("#")
                nu = len(checking) - 1
                if temp_sentence[0] == "#":
                    nu += 1
                if len(sub_uncommon_str_temp[k]) < nu:
                    for q in range(0, len(uncommon_str_i[k]) - len(sub_uncommon_str_temp[k])):
                        sub_uncommon_str_temp[k].insert(0, uncommon_str_i[k][q])
                uncommon_str_i[k] = sub_uncommon_str_temp[k]
        if i != 0:
            # unconsumed prefix of the common sentence: prepend it everywhere
            temp_i = i
            sub_uncommon_str2 = []
            while i > 0:
                sub_uncommon_str2.append(temp_sentence[i - 1])
                i -= 1
            sub_uncommon_str2.reverse()
            for k in range(l):
                if temp_i < len(temp_sentence):
                    if temp_sentence[temp_i] == "#":
                        # merge with the fragment the adjacent '#' stood for
                        f_unc = uncommon_str_i[k][len(uncommon_str_i[k]) - 1]
                        uncommon_str_i[k].remove(f_unc)
                        sub_uncommon_str2.extend(f_unc)
                uncommon_str_i[k].append(sub_uncommon_str2 if len(sub_uncommon_str2) > 1 else sub_uncommon_str2[0])
                uncommon_str_i[k] = remove_all(uncommon_str_i[k], "#")
            if common[0] != "#" and len(shrink(" ".join(common)).split("#")) < len(uncommon_str_i[0]):
                common.insert(0, "#")
        # flush any leftover run from the new sentence's side
        if len(sub_uncommon_str) > 0:
            sub_uncommon_str.reverse()
            none_index = get_last(uncommon_str_i[l], "")
            if none_index != -1:
                uncommon_str_i[l][none_index] = sub_uncommon_str if len(sub_uncommon_str) > 1 else sub_uncommon_str[0]
            else:
                uncommon_str_i[l].append(sub_uncommon_str if len(sub_uncommon_str) > 1 else sub_uncommon_str[0])
            if len(uncommon_str_i[l]) < len(uncommon_str_i[l - 1]):
                uncommon_str_i[l].append("")
        if j != 0:
            # unconsumed prefix of the new sentence
            sub_uncommon_str = []
            while len(uncommon_str_i[l]) + 1 > len(uncommon_str_i[l - 1]) and "" in uncommon_str_i[l]:
                uncommon_str_i[l].remove("")
            while j > 0:
                sub_uncommon_str.append(sentences[l][j - 1])
                j -= 1
            sub_uncommon_str.reverse()
            uncommon_str_i[l].append(sub_uncommon_str if len(sub_uncommon_str) > 1 else sub_uncommon_str[0])
            if common[0] != "#" and len(shrink(" ".join(common)).split("#")) < len(uncommon_str_i[0]):
                common.insert(0, "#")
        # new running common sentence for the next alignment round
        temp_sentence = remove_all(common.copy(), "")
        # pad all sentences' fragment lists to equal length
        for rt in range(0, l):
            while len(uncommon_str_i[l]) != len(uncommon_str_i[rt]):
                if len(uncommon_str_i[l]) < len(uncommon_str_i[rt]):
                    uncommon_str_i[l].append("")
                else:
                    uncommon_str_i[rt].append("")
        if len(uncommon_str_i[l]) != len(shrink(" ".join(common)).split("#")) - 1:
            for rt in range(0, l + 1):
                if len(uncommon_str_i[rt]) < len(shrink(" ".join(common)).split("#")) - 1:
                    uncommon_str_i[rt].append("")
        # redistribute uncommon words into equal-sized n-gram fragments
        uncommon_str_i[0:l + 1] = ngram_distribution(uncommon_str_i[0:l + 1], shrink(" ".join(temp_sentence)))
        temp_sentence = shrink(" ".join(temp_sentence))
        # expand each '#' into one marker per fragment entry
        for i in range(len(uncommon_str_i[0]), 0, -1):
            mask = "$ " * len(uncommon_str_i[0][i - 1])
            temp_sentence = temp_sentence.replace("#", mask, 1)
        temp_sentence = temp_sentence.replace("$", "#")
        temp_sentence = temp_sentence.split(" ")
        temp_sentence = remove_all(temp_sentence, "")
    common_sentence = " ".join(temp_sentence)
    # expose divergent spots as BERT [MASK] tokens
    common_sentence = common_sentence.replace("#", "[MASK]")
    # fragments were collected back-to-front during backtracking
    for i in range(len(uncommon_str_i)):
        uncommon_str_i[i].reverse()
    return common_sentence, uncommon_str_i


def text_mining_algorithm(sentences):
    """Tokenize the inputs and extract the common skeleton + uncommon words."""
    tokenized_sentences = preprocess(sentences, 1)
    common_words, uncommon_words = common_and_uncommon_extraction(tokenized_sentences)
    return common_words, uncommon_words


def similarity_analysis(masked_sentence, final_uncommon_str, nlp, fill_mask):
    """Resolve every ``[MASK]`` using BERT predictions + spaCy word similarity.

    masked_sentence   : common skeleton containing ``[MASK]`` tokens.
    final_uncommon_str: per-sentence flat candidate lists (same length each).
    nlp               : loaded spaCy model with word vectors.
    fill_mask         : transformers fill-mask pipeline.
    """
    i = 0
    while "[MASK]" in masked_sentence:
        # masked language modelling with BERT
        pred = fill_mask(masked_sentence)
        # with several masks the pipeline returns a list per mask
        if type(pred[0]) == list:
            df1 = pd.DataFrame(pred[0])
        else:
            df1 = pd.DataFrame(pred)
        word_list = df1["token_str"].tolist()
        # candidate words for the current mask, one per input sentence
        strings = []
        for fus in final_uncommon_str:
            strings.append(fus[i])
        # mean embedding similarity between each candidate and BERT's guesses
        similarity = []
        for s in strings:
            similarity.append(np.mean([nlp(w).similarity(nlp(s)) for w in word_list]))
        # keep the candidate most similar to the predictions
        selected_word = strings[np.argmax(similarity)]
        masked_sentence = masked_sentence.replace("[MASK]", selected_word, 1)
        i += 1
    return masked_sentence


def text_combining(texts, nlp, fill_mask):
    """Full pipeline: mine common/uncommon parts, then fill the masks."""
    masked_sentence, uncommon_words = text_mining_algorithm(texts)
    combined_sentence = similarity_analysis(masked_sentence, flatten(uncommon_words), nlp, fill_mask)
    return combined_sentence


if __name__ == "__main__":
    nlp = spacy.load("en_core_web_md")
    fill_mask = pipeline("fill-mask", model="distilbert-base-uncased")
    sentence1 = "I love to pay my video games in my free time, especially retro video games."
    sentence2 = "I love to play oreo games in my free thyme, especially retro video games."
    sentence3 = "Ay live to slay video vames in my free time, especially utro video games."
    sentences = np.array([[sentence1], [sentence2], [sentence3]])
    print(text_combining(sentences, nlp, fill_mask))