File size: 26,489 Bytes
086b7b6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cc2112a
086b7b6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
import numpy as np
import nltk
import re
import spacy
import pandas as pd
from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder
from nltk.util import ngrams

import transformers
from transformers import pipeline
from transformers import logging
logging.set_verbosity_error()

import warnings
warnings.filterwarnings('ignore')

def preprocess(sentences, n):
    """Lower-case each sentence and strip punctuation.

    Each element of *sentences* is a one-element sequence holding the raw text.
    When ``n == 1`` every sentence is word-tokenized and a list of token lists
    is returned; otherwise punctuation is removed with a regex and a numpy
    array of single-string rows is returned.
    """
    processed = []
    for entry in sentences:
        text = entry[0].lower()
        if n == 1:
            # word-level tokens, punctuation dropped by the tokenizer pattern
            processed.append(nltk.RegexpTokenizer(r'\w+').tokenize(text))
        else:
            # keep the sentence whole, just erase punctuation characters
            processed.append([re.sub(r'[^\w\s]', '', text)])
    return processed if n == 1 else np.array(processed)

# For each aligned slot, the size of the smallest n-gram across all rows.
def get_gram_lentgh(uncommon_str_i):
    """Return, per column, the minimum chunk length over all uncommon lists.

    A list entry counts as its length; a bare string counts as 1.
    """
    lens = []
    for col in range(len(uncommon_str_i[0])):
        sizes = [
            len(row[col]) if isinstance(row[col], list) else 1
            for row in uncommon_str_i
        ]
        lens.append(min(sizes))
    return lens

# Rebuild the original token sequence by filling each "#" placeholder.
def get_og_sentence_vector(uncommon_str, common_sentence):
    """Expand *common_sentence*'s ``#`` slots with the matching uncommon parts.

    ``uncommon_str[i]`` (a token list or single token) replaces the i-th ``#``.
    """
    vector = []
    slot = 0
    for token in common_sentence.split():
        if token != "#":
            vector.append(token)
            continue
        part = uncommon_str[slot]
        if isinstance(part, list):
            vector.extend(part)
        else:
            vector.append(part)
        slot += 1
    return vector

def init_list_of_lists(lenght):
    """Return a list of *lenght* independent (non-aliased) empty lists."""
    return [[] for _ in range(lenght)]

# Drop every occurrence of a value from a list, mutating it in place.
def remove_all(liste, value):
    """Remove all items equal to *value* from *liste* in place; return it."""
    liste[:] = [item for item in liste if item != value]
    return liste

def ngram_distribution(uncommon_str_i, common_sentence):
    """Regroup each row's uncommon chunks into PMI-ranked bigrams until every
    row has the same number of chunks per slot as the smallest row.

    Parameters:
        uncommon_str_i: list of rows, one per sentence; each row is a list of
            uncommon parts (token lists or single tokens) aligned with the
            ``#`` slots of *common_sentence*.
        common_sentence: the shared sentence with ``#`` placeholders.

    Returns:
        A new list of rows where each slot holds a list of merged chunks.

    NOTE(review): the merge loop mutates ``unc_str`` by index while scanning
    bigrams — statement order matters; do not reorder.
    """
    # Initialize the list of lists that will contain the n-grams
    final_uncommon_str_i = init_list_of_lists(len(uncommon_str_i))

    nb_unc_str = 0

    lens = get_gram_lentgh(uncommon_str_i) # get the length of the smallest n grams

    for uncommon_str in uncommon_str_i:
        for i in range(len(uncommon_str)):
            # Make a copy of the current list of the current uncommon part for string 1
            unc_str = uncommon_str[i].copy() if type(uncommon_str[i]) == list else [uncommon_str[i]]
            og_sentence = get_og_sentence_vector(uncommon_str, common_sentence)
            temp_uncommon = uncommon_str[i].copy() if type(uncommon_str[i]) == list else [uncommon_str[i]]
            while len(unc_str) > lens[i]:
                
                bigram_measures = BigramAssocMeasures()

                # Variable containing the common words that won't allowed in the bigrams
                common_words_str = list(set(og_sentence) - set(unc_str))

                # Generate a list of all n-grams of size n for the sentence
                n_grams_str = list(ngrams(og_sentence, 2))
                
                # Use the bigram collocation finder to get the best bigrams for the sentence
                finder_str = BigramCollocationFinder.from_words(og_sentence)
                best_bigrams_str = finder_str.nbest(bigram_measures.pmi, len(n_grams_str))
                
                # Filter out bigrams that contain common words from the current list of uncommon words
                best_uncommon_ngrams_str = [ngram for ngram in best_bigrams_str if (not any(p_ngrams in ngram for p_ngrams in common_words_str))]
                
                # Generate the final list of uncommon n-grams for string 1 by filtering the filtered bigrams and remaining uncommon words
                uncommon_ngrams_str = [''] * len(unc_str)
                count1 = len(unc_str)
                count2 = 0
                # We loop through the best uncommon n-grams and check if they are in the uncommon words list
                for b in best_uncommon_ngrams_str:
                    if b[0] in unc_str and b[1] in unc_str: # if both words are in the uncommon words list
                        uncommon_ngrams_str[unc_str.index(b[0])] = " ".join(list(b)) # we add the n-gram to the final list
                        count2 += 1 # we increment the number of uncommon n-grams in the final list
                        # we remove the words of the bi-gram from the uncommon words list
                        unc_str[unc_str.index(b[0])] = '' 
                        unc_str[unc_str.index(b[1])] = ''
                        count1 -= 2 # we decrement the number of uncommon words in the uncommon words list
                    if count1 + count2 == lens[i]: # if we have the number of uncommon n-grams we want
                        break
                if unc_str != [""] * len(unc_str): # if there are still uncommon words left
                    for j in range(len(unc_str)):
                        if unc_str[j] != '':
                            uncommon_ngrams_str[j] = unc_str[j] # we add the uncommon words left to the final list
                uncommon_ngrams_str = remove_all(uncommon_ngrams_str, '') # we remove the empty strings from the final list
                unc_str = uncommon_ngrams_str.copy() # we update the current list of uncommon words
                og_sentence = unc_str.copy() # next pass runs collocations over the merged chunks only
            
            final_uncommon_str_i[nb_unc_str].append(unc_str) # we add the final list of uncommon n-grams to the final list of lists
        nb_unc_str += 1 # we increment the number of uncommon parts
    return final_uncommon_str_i

# Collapse every run of consecutive "#" tokens into a single "#".
def shrink(sentence):
    """Return *sentence* with each sequence of "#" tokens reduced to one."""
    kept = []
    previous_was_hash = False
    for token in sentence.split():
        if token == "#" and previous_was_hash:
            continue  # skip repeated placeholder
        kept.append(token)
        previous_was_hash = token == "#"
    return " ".join(kept)

def flatten(final_uncommon_str):
    """Flatten each row one level: a list of chunk-lists becomes one flat list."""
    return [
        [item for chunk in row for item in chunk]
        for row in final_uncommon_str
    ]

# Build the longest-common-subsequence DP table.
def init_matrix(temp_sentence, sentences, lenght, l):
    """Return the (len(temp_sentence)+1) x (lenght+1) LCS table between
    *temp_sentence* and ``sentences[l]``, filled by dynamic programming.
    """
    other = sentences[l]
    rows = len(temp_sentence)
    # row 0 / column 0 stay zero: LCS against an empty prefix is empty
    L = [[0] * (lenght + 1) for _ in range(rows + 1)]
    for i in range(1, rows + 1):
        for j in range(1, lenght + 1):
            if temp_sentence[i - 1] == other[j - 1]:
                # matching tokens extend the diagonal subsequence
                L[i][j] = L[i - 1][j - 1] + 1
            else:
                # otherwise carry the best of dropping either token
                L[i][j] = max(L[i - 1][j], L[i][j - 1])
    return L

# init list of lists
# NOTE(review): duplicate of the definition earlier in this file; Python keeps
# the later binding. Kept in place to avoid changing module structure.
def init_list_of_lists(lenght):
    """Return *lenght* fresh, independent empty lists."""
    result = []
    for _ in range(lenght):
        result.append(list())
    return result

# remove all the occurrences of a value in a list
# NOTE(review): duplicate of the definition earlier in this file; Python keeps
# the later binding. Kept in place to avoid changing module structure.
def remove_all(liste, value):
    """Strip every occurrence of *value* from *liste* in place; return it."""
    try:
        while True:
            liste.remove(value)
    except ValueError:
        pass  # no occurrences left
    return liste

# index of the last occurrence of an element in a list, or -1
def get_last(liste, element):
    """Return the highest index where *element* appears in *liste*, else -1."""
    for idx in range(len(liste) - 1, -1, -1):
        if liste[idx] == element:
            return idx
    return -1


def common_and_uncommon_extraction(sentences):
    """Fold all tokenized sentences into one common sentence plus per-sentence
    uncommon substrings, via repeated LCS backtracking.

    Parameters:
        sentences: list of token lists (output of ``preprocess(..., 1)``).

    Returns:
        (common_sentence, uncommon_str_i): the shared sentence with each
        divergence replaced by ``[MASK]``, and one aligned list of uncommon
        substrings per input sentence.

    NOTE(review): the backtracking bookkeeping (trackers, temp lists, '#'
    re-substitution) is highly order-dependent; comments describe intent but
    statements must not be reordered.
    """
    lens = [len(s) for s in sentences]

    # initialize the uncommon substring lists
    uncommon_str_i = init_list_of_lists(len(sentences))

    temp_sentence = sentences[0]
    for l in range(1, len(sentences)):
        # initialize the L matrix
        L = init_matrix(temp_sentence, sentences, lens[l], l)

        # calculate the index based on the length of the longer string
        index = len(temp_sentence) + lens[l]

        # initialize the common list with empty strings
        common = [""] * (index + 1)
        common[index] = ""

        # set i and j to the end of each string
        i = len(temp_sentence)
        j = lens[l]
        limit = abs(i - j)

        # trackers to follow the uncommon substrings position
        tracker_str1 = -1 
        tracker_str2 = -1
        # lists that save a sequence of uncommon substrings
        sub_uncommon_str = []
        sub_uncommon = []
        # final list that contains all the uncommon substrings
        sub_uncommon_str_i_temp = []
        sub_uncommon_str_temp = init_list_of_lists(len(sentences))

        # loop through the L matrix to find the common and uncommon substrings
        while i > 0 and j > 0:
            
            # if the characters match, add the character to the common list and move to the previous diagonal cell
            dist = abs(i - j)
            if temp_sentence[i - 1] == sentences[l][j - 1] and dist <= limit:
                common[index - 1] = temp_sentence[i - 1]
                i -= 1
                j -= 1
                index -= 1
            # if the length of the substring from the previous column is greater, add the uncommon character to uncommon_str list and move to the previous column
            elif L[i - 1][j] < L[i][j - 1]:
                if tracker_str1 == -1: # if the tracker is -1, it means that the substring is the first one
                    tracker_str1 = j - 1
                    sub_uncommon_str.append(sentences[l][j - 1]) # add the uncommon character to the list
                elif tracker_str1 == j: # if the tracker is equal to the current index, it means that the substring is part of the same sequence
                    sub_uncommon_str.append(sentences[l][j - 1]) # add the uncommon character to the sequence list
                    tracker_str1 = j - 1
                else: # if the tracker is not equal to the current index, it means that the substring is part of a different sequence
                    sub_uncommon_str.reverse() 
                    # add the sequence to the final list
                    none_index = get_last(uncommon_str_i[l], "")
                    if none_index != -1:
                        uncommon_str_i[l][none_index] = sub_uncommon_str if len(sub_uncommon_str) > 1 else sub_uncommon_str[0] # add the uncommon string to the new sequence list
                    else : uncommon_str_i[l].append(sub_uncommon_str if len(sub_uncommon_str) > 1 else sub_uncommon_str[0])
                    sub_uncommon_str = [] # reset the sequence list
                    tracker_str1 = j - 1 # reset the tracker to the first uncommon string of the new sequence
                    sub_uncommon_str.append(sentences[l][j - 1]) # add the uncommon string to the new sequence list

                j -= 1 # move to the previous column
                common[index - 1] = "#"
                index -= 1
            # if the length of the substring from the previous row is greater, add the uncommon character to uncommon_str2 list and move to the previous row
            else:
                if tracker_str2 == -1: # if the tracker is -1, it means that the substring is the first one
                    tracker_str2 = i - 1
                    sub_uncommon.append(temp_sentence[i - 1]) # add the uncommon character to the list
                elif tracker_str2 == i: # if the tracker is equal to the current index, it means that the substring is part of the same sequence
                    sub_uncommon.append(temp_sentence[i - 1]) # add the uncommon character to the sequence list
                    tracker_str2 = i - 1
                else: # if the tracker is not equal to the current index, it means that the substring is part of a different sequence
                    sub_uncommon.reverse()
                    if l == 1: # if the index point to the second string, it means we are dealing with the first string so we add the sequence to the final list 
                        uncommon_str_i[0].append(sub_uncommon if len(sub_uncommon) > 1 else sub_uncommon[0])
                    else: # else it means that we are dealing with the common sentence 
                        if '#' not in sub_uncommon: # if the sequence doesn't contain the # character, it means it is a new sequence so we add it to the final list directly
                            #sub_uncommon.reverse()
                            # we add the uncommon substring to all the uncommon parts of all the previous strings
                            for k in range(l):
                                sub_uncommon_str_temp[k].append(sub_uncommon if len(sub_uncommon) > 1 else sub_uncommon[0])
                        else: # if the sequence contains the # character, it means that it is a sequence that is part of a previous sequence so we need to update it
                            sub_uncommon_copy = sub_uncommon.copy()
                            # we add the uncommon substring to a temp list to not mess up the order of the final list
                            sub_uncommon_str_i_temp.append(sub_uncommon_copy if len(sub_uncommon_copy) > 1 else sub_uncommon_copy[0])
                            for k in range(l):
                                sub_uncommon_copy = sub_uncommon.copy()
                                uwu = 1
                                while "#" in sub_uncommon_copy and len(sub_uncommon_str_i_temp) - uwu < len(uncommon_str_i[k]): # we loop through the uncommon substring and replace the # character with the uncommon substring
                                    # we get the last uncommon substring of the previous string
                                    updated_uncommon_str = uncommon_str_i[k][len(sub_uncommon_str_i_temp) - uwu]
                                    if type(updated_uncommon_str) == list: # if the last uncommon substring is a list, it means that it is a sequence so we need to update it
                                        owo = len(updated_uncommon_str) - 1
                                        while owo >= 0: # we loop through the sequence and replace the # character with the uncommon substring
                                            if '#' in sub_uncommon_copy:
                                                ind = max(loc for loc, val in enumerate(sub_uncommon_copy) if val == '#')
                                                sub_uncommon_copy[ind] = updated_uncommon_str[owo]
                                            owo -= 1
                                    else:
                                        ind = sub_uncommon_copy.index("#")
                                        sub_uncommon_copy[ind] = updated_uncommon_str
                                    uwu -= 1
                                if "#" in sub_uncommon_copy:
                                    sub_uncommon_copy = remove_all(sub_uncommon_copy, '#') # we remove all the # characters that are left
                                sub_uncommon_str_temp[k].append(sub_uncommon_copy if len(sub_uncommon) > 1 else sub_uncommon_copy[0]) # we add the updated uncommon substring to the final list                   
                    sub_uncommon = [] # reset the sequence list
                    tracker_str2 = i - 1 # reset the tracker to the first uncommon string of the new sequence
                    sub_uncommon.append(temp_sentence[i - 1]) # add the uncommon string to the new sequence list
                    uncommon_str_i[l].append("")

                common[index - 1] = "#" # add the # character to the common substring to indicate that an uncommon substring is there
                index -= 1 # move to the previous row
                i  -= 1 # move to the next string
        
        if l == 1: # if the index point to the second string, it means we are dealing with the first string 
            if len(sub_uncommon) > 0: # if the length of the substring is greater than 0, it means that there is an uncommon substring left
                sub_uncommon.reverse()
                uncommon_str_i[0].append(sub_uncommon if len(sub_uncommon) > 1 else sub_uncommon[0]) # add the uncommon substring to the final list
        else: # else it means that we are dealing with the common sentence
            if len(sub_uncommon) > 0: # if the length of the substring is greater than 0, it means that there is an uncommon substring left
                if '#' not in sub_uncommon: # if the sequence doesn't contain the # character, it means it is a new sequence so we add it to the final list directly
                    sub_uncommon.reverse()
                    for k in range(l):
                        sub_uncommon_str_temp[k].append(sub_uncommon if len(sub_uncommon) > 1 else sub_uncommon[0])
                else: # if the sequence contains the # character, it means that it is a sequence that is part of a previous sequence so we need to update it
                    sub_uncommon.reverse()
                    for k in range(l):
                        sub_uncommon_copy = sub_uncommon.copy()
                        if len(sub_uncommon_copy) < 2: # if the length of the uncommon substring is less than 2, it means that it is a sequence of a single string so we just replace the # character with the uncommon substring
                            sub_uncommon_copy = uncommon_str_i[k][len(uncommon_str_i[k]) - 1][0] if type(uncommon_str_i[k][len(uncommon_str_i[k]) - 1]) == list else uncommon_str_i[k][len(uncommon_str_i[k]) - 1]
                        else: # if the length of the uncommon substring is greater than 2, it means that it is a sequence so we need to update it
                            uwu = 1
                            while "#" in sub_uncommon_copy and len(uncommon_str_i[k]) - uwu >= 0: # we loop through the uncommon substring and replace the # character with the uncommon substring
                                if type(uncommon_str_i[k][len(uncommon_str_i[k]) - uwu]) == list :
                                    # we loop through the terms of the sequence that needs to be updated and replace the # character with the uncommon substring
                                    for term in uncommon_str_i[k][len(uncommon_str_i[k]) - uwu]:
                                        if '#' in sub_uncommon_copy:
                                            ind = sub_uncommon_copy.index("#")
                                        sub_uncommon_copy[ind] = term
                                else: # if the last uncommon substring is not a list, it means that it is a sequence of a single string so we just replace the # character with the uncommon substring
                                    ind = sub_uncommon_copy.index("#")
                                    sub_uncommon_copy[ind] = uncommon_str_i[k][len(uncommon_str_i[k]) - 1]
                                uwu += 1
                        
                        
                        if type(uncommon_str_i[k][len(uncommon_str_i[k]) - 1][0]) == list : sub_uncommon_copy = remove_all(sub_uncommon_copy, "#") # we remove all the # characters that are left
                        sub_uncommon_str_temp[k].append(sub_uncommon_copy) # we add the updated uncommon substring to the final list
            # we add the uncommon substring to all the uncommon parts of all the previous strings
            for k in range(l):
                checking = shrink(" ".join(common)).split("#")
                nu = len(checking) - 1
                if temp_sentence[0] == "#":
                    nu += 1
                if len(sub_uncommon_str_temp[k]) < nu:
                    for q in range(0, len(uncommon_str_i[k]) - len(sub_uncommon_str_temp[k])):
                        sub_uncommon_str_temp[k].insert(0, uncommon_str_i[k][q])
                uncommon_str_i[k] = sub_uncommon_str_temp[k]
        
        if i != 0:
            temp_i = i
            sub_uncommon_str2 = [] # reset the sequence list
            while i > 0:
                sub_uncommon_str2.append(temp_sentence[i - 1])
                i -= 1
            sub_uncommon_str2.reverse()
            # add the sequence to the final list
            for k in range(l):
                if temp_i < len(temp_sentence):
                    if temp_sentence[temp_i] == "#":
                        f_unc = uncommon_str_i[k][len(uncommon_str_i[k]) - 1]
                        uncommon_str_i[k].remove(f_unc)
                        sub_uncommon_str2.extend(f_unc)
                uncommon_str_i[k].append(sub_uncommon_str2 if len(sub_uncommon_str2) > 1 else sub_uncommon_str2[0])
                uncommon_str_i[k] = remove_all(uncommon_str_i[k], "#")
            if common[0] != "#" and len(shrink(" ".join(common)).split("#")) < len(uncommon_str_i[0]):
                common.insert(0, "#")

        # we add the uncommon substring left to the current string
        if len(sub_uncommon_str) > 0:
            sub_uncommon_str.reverse()
            none_index = get_last(uncommon_str_i[l], "")
            if none_index != -1:
                uncommon_str_i[l][none_index] = sub_uncommon_str if len(sub_uncommon_str) > 1 else sub_uncommon_str[0] # add the uncommon string to the new sequence list
            else : uncommon_str_i[l].append(sub_uncommon_str if len(sub_uncommon_str) > 1 else sub_uncommon_str[0])
            if len(uncommon_str_i[l]) < len(uncommon_str_i[l - 1]):
                uncommon_str_i[l].append("")

        if j != 0:
            sub_uncommon_str = [] # reset the sequence list
            while len(uncommon_str_i[l]) + 1 > len(uncommon_str_i[l - 1]) and "" in uncommon_str_i[l]:
                uncommon_str_i[l].remove("")
            while j > 0:
                sub_uncommon_str.append(sentences[l][j - 1])
                j -= 1
            sub_uncommon_str.reverse()
            # add the sequence to the final list
            uncommon_str_i[l].append(sub_uncommon_str if len(sub_uncommon_str) > 1 else sub_uncommon_str[0])
            if common[0] != "#" and len(shrink(" ".join(common)).split("#")) < len(uncommon_str_i[0]):
                common.insert(0, "#")

        temp_sentence = remove_all(common.copy(), "") # we update the common sentence

        # pad every row with "" until all rows have the same slot count
        for rt in range(0, l):
            while len(uncommon_str_i[l]) != len(uncommon_str_i[rt]):
                if len(uncommon_str_i[l]) < len(uncommon_str_i[rt]):
                    uncommon_str_i[l].append("")
                else:
                    uncommon_str_i[rt].append("")

        if len(uncommon_str_i[l]) != len(shrink(" ".join(common)).split("#")) - 1:
            for rt in range(0, l+1):
                if len(uncommon_str_i[rt]) < len(shrink(" ".join(common)).split("#")) - 1:
                    uncommon_str_i[rt].append("")

        # N-gram distribution on the uncommon parts
        uncommon_str_i[0:l+1] = ngram_distribution(uncommon_str_i[0:l+1], shrink(" ".join(temp_sentence)))
        temp_sentence = shrink(" ".join(temp_sentence))

        # update the distribution of the uncommon parts based on the N-gram distribution
        for i in range(len(uncommon_str_i[0]), 0, -1):
            mask = "$ " * len(uncommon_str_i[0][i-1])
            temp_sentence = temp_sentence.replace("#", mask, 1)
        temp_sentence = temp_sentence.replace("$", "#")
        temp_sentence = temp_sentence.split(" ")
        temp_sentence = remove_all(temp_sentence, "")

    # join the common list into a sentence
    common_sentence = " ".join(temp_sentence)
    # replace the # character with the [MASK] token
    common_sentence = common_sentence.replace("#", "[MASK]")

    # reverse the order of the uncommon substring lists
    for i in range(len(uncommon_str_i)):
        uncommon_str_i[i].reverse()

    # return the common sentence and the lists of uncommon substrings
    return common_sentence, uncommon_str_i

def text_mining_algorithm(sentences):
    """Tokenize the raw sentences, then split them into common and uncommon parts.

    Returns the masked common sentence and the aligned uncommon-word lists.
    """
    tokens = preprocess(sentences, 1)
    return common_and_uncommon_extraction(tokens)

def similarity_analysis(masked_sentence, final_uncommon_str, nlp, fill_mask):
    """Resolve each [MASK] in turn by picking the candidate uncommon word most
    similar (via spaCy word vectors) to BERT's fill-mask predictions.

    Parameters:
        masked_sentence: sentence containing one or more "[MASK]" tokens.
        final_uncommon_str: flattened uncommon-word lists, one row per sentence.
        nlp: loaded spaCy language model used for similarity scoring.
        fill_mask: HuggingFace fill-mask pipeline.
    """
    slot = 0
    while "[MASK]" in masked_sentence:
        # MLM prediction for the (left-most) remaining mask
        predictions = fill_mask(masked_sentence)
        # with several masks the pipeline returns a list per mask — use the first
        first = predictions[0] if isinstance(predictions[0], list) else predictions
        candidates = pd.DataFrame(first)["token_str"].tolist()

        # the uncommon word each sentence proposes for this slot
        options = [row[slot] for row in final_uncommon_str]

        # average embedding similarity of each option against the predictions
        scores = [
            np.mean([nlp(word).similarity(nlp(option)) for word in candidates])
            for option in options
        ]

        # substitute the best-scoring option for the current mask
        best = options[int(np.argmax(scores))]
        masked_sentence = masked_sentence.replace("[MASK]", best, 1)
        slot += 1

    return masked_sentence

def text_combining(texts, nlp, fill_mask):
    """End-to-end pipeline: mine common/uncommon parts, then fill the masks."""
    masked, uncommon = text_mining_algorithm(texts)
    return similarity_analysis(masked, flatten(uncommon), nlp, fill_mask)

if __name__ == "__main__":
    # demo: combine three noisy variants of the same sentence
    spacy_model = spacy.load("en_core_web_md")
    mask_filler = pipeline("fill-mask", model="distilbert-base-uncased")
    examples = np.array([
        ["I love to pay my video games in my free time, especially retro video games."],
        ["I love to play oreo games in my free thyme, especially retro video games."],
        ["Ay live to slay video vames in my free time, especially utro video games."],
    ])
    print(text_combining(examples, spacy_model, mask_filler))