Spaces:

Galiess
/

AdvancedGenius

Runtime error

App Files Files Community

AdvancedGenius / combining.py

Galiess

Update combining.py

cc2112a over 2 years ago

raw

history blame contribute delete

26.5 kB

	import numpy as np
	import nltk
	import re
	import spacy
	import pandas as pd
	from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder
	from nltk.util import ngrams

	import transformers
	from transformers import pipeline
	from transformers import logging
	logging.set_verbosity_error()

	import warnings
	warnings.filterwarnings('ignore')

	# Lower-case the sentences and remove punctuation
	def preprocess(sentences, n):
	    """Lower-case each sentence and strip its punctuation.

	    Args:
	        sentences: Iterable of one-element containers; only item ``[0]`` of
	            each entry is read, and it must be a string.
	        n: Mode selector. ``1`` tokenizes every sentence into a list of
	            runs of word characters; any other value keeps the sentence as
	            one string with every non-word, non-space character removed.

	    Returns:
	        For ``n == 1`` a plain list of token lists. Otherwise a NumPy array
	        built from one-element lists, each holding a cleaned sentence.
	    """
	    if n == 1:
	        # Build the tokenizer once instead of once per sentence.
	        tokenizer = nltk.RegexpTokenizer(r'\w+')
	        return [tokenizer.tokenize(entry[0].lower()) for entry in sentences]
	    cleaned = [[re.sub(r'[^\w\s]', '', entry[0].lower())] for entry in sentences]
	    return np.array(cleaned)

	# get the length of the smallest n gram
	def get_gram_lentgh(uncommon_str_i):
	    """Return, for every slot, the smallest entry size found across all rows.

	    A slot holding a list counts as the list's length; any other value
	    counts as 1. The number of slots is taken from the first row.
	    """
	    def _size(entry):
	        return len(entry) if type(entry) == list else 1

	    slot_count = len(uncommon_str_i[0])
	    return [
	        min([_size(row[slot]) for row in uncommon_str_i])
	        for slot in range(slot_count)
	    ]

	# get the original sentence in a vector form
	def get_og_sentence_vector(uncommon_str, common_sentence):
	    """Rebuild a sentence's token list from the masked common sentence.

	    Every ``#`` token of ``common_sentence`` is replaced by the next entry
	    of ``uncommon_str``: a list entry contributes all of its items, any
	    other entry contributes itself. All other tokens are copied unchanged.
	    """
	    rebuilt = []
	    slot = 0
	    for token in common_sentence.split():
	        if token != "#":
	            rebuilt.append(token)
	            continue
	        filler = uncommon_str[slot]
	        slot += 1
	        if type(filler) == list:
	            rebuilt += filler
	        else:
	            rebuilt.append(filler)
	    return rebuilt

	def init_list_of_lists(lenght):
	    """Return a list made of ``lenght`` distinct empty lists."""
	    return [[] for _ in range(lenght)]

	# remove all the occurrences of a value in a list
	def remove_all(liste, value):
	    """Remove every occurrence of ``value`` from ``liste`` in place.

	    Args:
	        liste: The list to clean; it is mutated.
	        value: The value whose occurrences are dropped.

	    Returns:
	        The same list object that was passed in, so callers can rely on
	        either the mutation or the return value.
	    """
	    # One pass with slice assignment instead of repeated membership test +
	    # remove(), which rescans the list for every occurrence. The
	    # identity-or-equality test mirrors how `in` and list.remove() match.
	    liste[:] = [item for item in liste if not (item is value or item == value)]
	    return liste

	def ngram_distribution(uncommon_str_i, common_sentence):
	"""Merge the words of oversized uncommon slots into n-grams.

	For every row of ``uncommon_str_i`` and every slot ``i`` of that row, the
	slot's words are paired into bigrams (ranked by PMI with NLTK's
	``BigramCollocationFinder``) for as long as the slot holds more entries
	than ``lens[i]``, the smallest size of that slot across all rows as
	computed by ``get_gram_lentgh``.

	Args:
	uncommon_str_i: List of rows; each slot of a row is either a list of
	words or a single value.
	common_sentence: Space-separated sentence whose ``#`` tokens mark the
	positions of the uncommon slots (consumed by
	``get_og_sentence_vector``).

	Returns:
	``final_uncommon_str_i``: a list with one list per row of
	``uncommon_str_i`` to which the reduced word lists are appended.
	"""
	# NOTE(review): the nesting of the statements below is inferred from their
	# order; confirm against a correctly indented copy before restructuring.
	# Initialize the list of lists that will contain the n-grams
	final_uncommon_str_i = init_list_of_lists(len(uncommon_str_i))

	# index of the row of final_uncommon_str_i currently being filled
	nb_unc_str = 0

	lens = get_gram_lentgh(uncommon_str_i) # get the length of the smallest n grams

	for uncommon_str in uncommon_str_i:
	for i in range(len(uncommon_str)):
	# Make a copy of the current slot as a list of words (a non-list value is wrapped in a list)
	unc_str = uncommon_str[i].copy() if type(uncommon_str[i]) == list else [uncommon_str[i]]
	# Full token list of this row's sentence, with the '#' markers replaced by the row's uncommon words
	og_sentence = get_og_sentence_vector(uncommon_str, common_sentence)
	# NOTE(review): temp_uncommon is assigned here but not read anywhere else in this function
	temp_uncommon = uncommon_str[i].copy() if type(uncommon_str[i]) == list else [uncommon_str[i]]
	# NOTE(review): the loop only ends once the slot has shrunk to lens[i] entries; confirm that a pass that merges nothing cannot occur
	while len(unc_str) > lens[i]:

	bigram_measures = BigramAssocMeasures()

	# Variable containing the common words that are not allowed in the bigrams
	common_words_str = list(set(og_sentence) - set(unc_str))

	# Generate the list of all bigrams of the sentence (only its length is used below)
	n_grams_str = list(ngrams(og_sentence, 2))

	# Use the bigram collocation finder to get the bigrams of the sentence ranked by PMI
	finder_str = BigramCollocationFinder.from_words(og_sentence)
	best_bigrams_str = finder_str.nbest(bigram_measures.pmi, len(n_grams_str))

	# Filter out bigrams that contain common words from the current list of uncommon words
	best_uncommon_ngrams_str = [ngram for ngram in best_bigrams_str if (not any(p_ngrams in ngram for p_ngrams in common_words_str))]

	# Generate the final list of uncommon n-grams for this slot by combining the filtered bigrams and the remaining uncommon words
	uncommon_ngrams_str = [''] * len(unc_str)
	count1 = len(unc_str) # number of words not yet merged into a bigram
	count2 = 0 # number of bigrams created in this pass
	# We loop through the best uncommon n-grams and check if they are in the uncommon words list
	for b in best_uncommon_ngrams_str:
	if b[0] in unc_str and b[1] in unc_str: # if both words are in the uncommon words list
	uncommon_ngrams_str[unc_str.index(b[0])] = " ".join(list(b)) # we add the n-gram to the final list
	count2 += 1 # we increment the number of uncommon n-grams in the final list
	# we remove the words of the bi-gram from the uncommon words list
	unc_str[unc_str.index(b[0])] = ''
	unc_str[unc_str.index(b[1])] = ''
	count1 -= 2 # we decrement the number of uncommon words in the uncommon words list
	if count1 + count2 == lens[i]: # if we have the number of uncommon n-grams we want
	break
	if unc_str != [""] * len(unc_str): # if there are still uncommon words left
	for j in range(len(unc_str)):
	if unc_str[j] != '':
	uncommon_ngrams_str[j] = unc_str[j] # we add the uncommon words left to the final list
	uncommon_ngrams_str = remove_all(uncommon_ngrams_str, '') # we remove the empty strings from the final list
	unc_str = uncommon_ngrams_str.copy() # we update the current list of uncommon words
	og_sentence = unc_str.copy() # the next pass ranks bigrams over the merged entries only

	final_uncommon_str_i[nb_unc_str].append(unc_str) # we add the final list of uncommon n-grams to the final list of lists
	nb_unc_str += 1 # we increment the number of uncommon parts
	return final_uncommon_str_i

	# Reduce the sequences of # into one #
	def shrink(sentence):
	    """Collapse every run of consecutive ``#`` tokens into a single ``#``.

	    The sentence is split on whitespace and re-joined with single spaces.
	    """
	    kept = []
	    previous_was_mask = False
	    for token in sentence.split():
	        is_mask = token == "#"
	        if not (is_mask and previous_was_mask):
	            kept.append(token)
	        previous_was_mask = is_mask
	    return " ".join(kept)

	def flatten(final_uncommon_str):
	    """Flatten each row one level: concatenate the items of its sub-sequences."""
	    return [
	        [item for part in row for item in part]
	        for row in final_uncommon_str
	    ]

	# Init the Dynamic matrix
	def init_matrix(temp_sentence, sentences, lenght, l):
	    """Build the longest-common-subsequence length table for two sequences.

	    Cell ``[i][j]`` holds the LCS length of the first ``i`` items of
	    ``temp_sentence`` and the first ``j`` items of ``sentences[l]``, where
	    ``j`` runs up to ``lenght``. Row 0 and column 0 stay at zero.
	    """
	    table = [[0] * (lenght + 1) for _ in range(len(temp_sentence) + 1)]
	    for row in range(1, len(temp_sentence) + 1):
	        current = table[row]
	        above = table[row - 1]
	        left_item = temp_sentence[row - 1]
	        for col in range(1, lenght + 1):
	            if left_item == sentences[l][col - 1]:
	                # items match: extend the diagonal
	                current[col] = above[col - 1] + 1
	            else:
	                # no match: carry over the better of "skip one item" on either side
	                current[col] = max(above[col], current[col - 1])
	    return table

	# init list of lists
	def init_list_of_lists(lenght):
	    """Create ``lenght`` independent empty lists and return them in a list."""
	    buckets = [[] for _unused in range(lenght)]
	    return buckets

	# remove all the occurrences of a value in a list
	def remove_all(liste, value):
	    """Drop every occurrence of ``value`` from ``liste``, mutating it.

	    Args:
	        liste: List to filter in place.
	        value: Value to eliminate from the list.

	    Returns:
	        ``liste`` itself (the same object), now without ``value``.
	    """
	    # Filter in a single pass and write the result back through a slice so
	    # the caller's list object is updated; the previous membership-test +
	    # remove() loop rescanned the list once per occurrence. Matching by
	    # identity or equality is what `in` and list.remove() do.
	    liste[:] = [item for item in liste if not (item is value or item == value)]
	    return liste

	# get the last occurrence of an element in a list
	def get_last(liste, element):
	    """Return the index of the last occurrence of ``element``, or -1 if absent."""
	    for position in range(len(liste) - 1, -1, -1):
	        candidate = liste[position]
	        # identity-or-equality, the same matching rule `in` / list.index use
	        if candidate is element or candidate == element:
	            return position
	    return -1


	def common_and_uncommon_extraction(sentences):
	"""Split tokenized sentences into one common sentence and per-sentence uncommon parts.

	Starting from ``sentences[0]``, every further sentence is compared with the
	running common sentence using the table built by ``init_matrix``. Tokens
	that match are written to ``common``; positions that differ are written as
	``#`` and the differing tokens are collected per sentence in
	``uncommon_str_i``. The uncommon parts are then passed through
	``ngram_distribution`` (presumably once per compared sentence, since the
	call slices with ``l``).

	Args:
	sentences: Sequence of token sequences; each one is measured with
	``len`` and indexed by position.

	Returns:
	A tuple ``(common_sentence, uncommon_str_i)``: the common sentence as a
	space-joined string in which every ``#`` has been replaced by
	``[MASK]``, and the lists of uncommon parts (one list per sentence),
	each reversed in place just before returning.
	"""
	# NOTE(review): the nesting of the statements below is inferred from their
	# order and comments; confirm against a correctly indented copy before
	# changing any of the logic.
	lens = [len(s) for s in sentences]

	# initialize the uncommon substring lists
	uncommon_str_i = init_list_of_lists(len(sentences))

	temp_sentence = sentences[0]
	for l in range(1, len(sentences)):
	# initialize the L matrix
	L = init_matrix(temp_sentence, sentences, lens[l], l)

	# calculate the index based on the combined length of both sequences
	index = len(temp_sentence) + lens[l]

	# initialize the common list with empty strings
	common = [""] * (index + 1)
	# NOTE(review): common[index] is already "" from the line above, so this assignment looks redundant
	common[index] = ""

	# set i and j to the end of each string
	i = len(temp_sentence)
	j = lens[l]
	# length difference of the two sequences; a match is only accepted below while abs(i - j) stays within it
	limit = abs(i - j)

	# trackers to follow the uncommon substrings position
	tracker_str1 = -1
	tracker_str2 = -1
	# lists that save a sequence of uncommon substrings
	sub_uncommon_str = []
	sub_uncommon = []
	# final list that contains all the uncommon substrings
	sub_uncommon_str_i_temp = []
	sub_uncommon_str_temp = init_list_of_lists(len(sentences))

	# loop through the L matrix to find the common and uncommon substrings
	while i > 0 and j > 0:

	# if the characters match, add the character to the common list and move to the previous diagonal cell
	dist = abs(i - j)
	if temp_sentence[i - 1] == sentences[l][j - 1] and dist <= limit:
	common[index - 1] = temp_sentence[i - 1]
	i -= 1
	j -= 1
	index -= 1
	# if the length of the substring from the previous column is greater, add the uncommon character to uncommon_str list and move to the previous column
	elif L[i - 1][j] < L[i][j - 1]:
	if tracker_str1 == -1: # if the tracker is -1, it means that the substring is the first one
	tracker_str1 = j - 1
	sub_uncommon_str.append(sentences[l][j - 1]) # add the uncommon character to the list
	elif tracker_str1 == j: # if the tracker is equal to the current index, it means that the substring is part of the same sequence
	sub_uncommon_str.append(sentences[l][j - 1]) # add the uncommon character to the sequence list
	tracker_str1 = j - 1
	else: # if the tracker is not equal to the current index, it means that the substring is part of a different sequence
	sub_uncommon_str.reverse()
	# add the sequence to the final list
	none_index = get_last(uncommon_str_i[l], "")
	if none_index != -1:
	uncommon_str_i[l][none_index] = sub_uncommon_str if len(sub_uncommon_str) > 1 else sub_uncommon_str[0] # add the uncommon string to the new sequence list
	else : uncommon_str_i[l].append(sub_uncommon_str if len(sub_uncommon_str) > 1 else sub_uncommon_str[0])
	sub_uncommon_str = [] # reset the sequence list
	tracker_str1 = j - 1 # reset the tracker to the first uncommon string of the new sequence
	sub_uncommon_str.append(sentences[l][j - 1]) # add the uncommon string to the new sequence list

	j -= 1 # move to the previous column
	common[index - 1] = "#"
	index -= 1
	# if the length of the substring from the previous row is greater, add the uncommon character to uncommon_str2 list and move to the previous row
	else:
	if tracker_str2 == -1: # if the tracker is -1, it means that the substring is the first one
	tracker_str2 = i - 1
	sub_uncommon.append(temp_sentence[i - 1]) # add the uncommon character to the list
	elif tracker_str2 == i: # if the tracker is equal to the current index, it means that the substring is part of the same sequence
	sub_uncommon.append(temp_sentence[i - 1]) # add the uncommon character to the sequence list
	tracker_str2 = i - 1
	else: # if the tracker is not equal to the current index, it means that the substring is part of a different sequence
	sub_uncommon.reverse()
	if l == 1: # if the index point to the second string, it means we are dealing with the first string so we add the sequence to the final list
	uncommon_str_i[0].append(sub_uncommon if len(sub_uncommon) > 1 else sub_uncommon[0])
	else: # else it means that we are dealing with the common sentence
	if '#' not in sub_uncommon: # if the sequence doesn't contain the # character, it means it is a new sequence so we add it to the final list directly
	#sub_uncommon.reverse()
	# we add the uncommon substring to all the uncommon parts of all the previous strings
	for k in range(l):
	sub_uncommon_str_temp[k].append(sub_uncommon if len(sub_uncommon) > 1 else sub_uncommon[0])
	else: # if the sequence contains the # character, it means that it is a sequence that is part of a previous sequence so we need to update it
	sub_uncommon_copy = sub_uncommon.copy()
	# we add the uncommon substring to a temp list to not mess up the order of the final list
	sub_uncommon_str_i_temp.append(sub_uncommon_copy if len(sub_uncommon_copy) > 1 else sub_uncommon_copy[0])
	for k in range(l):
	sub_uncommon_copy = sub_uncommon.copy()
	uwu = 1
	while "#" in sub_uncommon_copy and len(sub_uncommon_str_i_temp) - uwu < len(uncommon_str_i[k]): # we loop through the uncommon substring and replace the # character with the uncommon substring
	# we get the last uncommon substring of the previous string
	updated_uncommon_str = uncommon_str_i[k][len(sub_uncommon_str_i_temp) - uwu]
	if type(updated_uncommon_str) == list: # if the last uncommon substring is a list, it means that it is a sequence so we need to update it
	owo = len(updated_uncommon_str) - 1
	while owo >= 0: # we loop through the sequence and replace the # character with the uncommon substring
	if '#' in sub_uncommon_copy:
	ind = max(loc for loc, val in enumerate(sub_uncommon_copy) if val == '#')
	sub_uncommon_copy[ind] = updated_uncommon_str[owo]
	owo -= 1
	else:
	ind = sub_uncommon_copy.index("#")
	sub_uncommon_copy[ind] = updated_uncommon_str
	# NOTE(review): uwu is decremented here, whereas the similar replacement loop after the main while loop increments it; confirm this is intended
	uwu -= 1
	if "#" in sub_uncommon_copy:
	sub_uncommon_copy = remove_all(sub_uncommon_copy, '#') # we remove all the # characters that are left
	# NOTE(review): the length test reads sub_uncommon while the appended value comes from sub_uncommon_copy; confirm this is intended
	sub_uncommon_str_temp[k].append(sub_uncommon_copy if len(sub_uncommon) > 1 else sub_uncommon_copy[0]) # we add the updated uncommon substring to the final list
	sub_uncommon = [] # reset the sequence list
	tracker_str2 = i - 1 # reset the tracker to the first uncommon string of the new sequence
	sub_uncommon.append(temp_sentence[i - 1]) # add the uncommon string to the new sequence list
	uncommon_str_i[l].append("")

	common[index - 1] = "#" # add the # character to the common substring to indicate that an uncommon substring is there
	index -= 1 # move to the previous row
	i -= 1 # move to the next string

	if l == 1: # if the index point to the second string, it means we are dealing with the first string
	if len(sub_uncommon) > 0: # if the length of the substring is greater than 0, it means that there is an uncommon substring left
	sub_uncommon.reverse()
	uncommon_str_i[0].append(sub_uncommon if len(sub_uncommon) > 1 else sub_uncommon[0]) # add the uncommon substring to the final list
	else: # else it means that we are dealing with the common sentence
	if len(sub_uncommon) > 0: # if the length of the substring is greater than 0, it means that there is an uncommon substring left
	if '#' not in sub_uncommon: # if the sequence doesn't contain the # character, it means it is a new sequence so we add it to the final list directly
	sub_uncommon.reverse()
	for k in range(l):
	sub_uncommon_str_temp[k].append(sub_uncommon if len(sub_uncommon) > 1 else sub_uncommon[0])
	else: # if the sequence contains the # character, it means that it is a sequence that is part of a previous sequence so we need to update it
	sub_uncommon.reverse()
	for k in range(l):
	sub_uncommon_copy = sub_uncommon.copy()
	if len(sub_uncommon_copy) < 2: # if the length of the uncommon substring is less than 2, it means that it is a sequence of a single string so we just replace the # character with the uncommon substring
	sub_uncommon_copy = uncommon_str_i[k][len(uncommon_str_i[k]) - 1][0] if type(uncommon_str_i[k][len(uncommon_str_i[k]) - 1]) == list else uncommon_str_i[k][len(uncommon_str_i[k]) - 1]
	else: # if the length of the uncommon substring is greater than 2, it means that it is a sequence so we need to update it
	uwu = 1
	while "#" in sub_uncommon_copy and len(uncommon_str_i[k]) - uwu >= 0: # we loop through the uncommon substring and replace the # character with the uncommon substring
	if type(uncommon_str_i[k][len(uncommon_str_i[k]) - uwu]) == list :
	# we loop through the terms of the sequence that needs to be updated and replace the # character with the uncommon substring
	for term in uncommon_str_i[k][len(uncommon_str_i[k]) - uwu]:
	if '#' in sub_uncommon_copy:
	ind = sub_uncommon_copy.index("#")
	sub_uncommon_copy[ind] = term
	else: # if the last uncommon substring is not a list, it means that it is a sequence of a single string so we just replace the # character with the uncommon substring
	ind = sub_uncommon_copy.index("#")
	# NOTE(review): this branch indexes with "- 1" while the list branch above indexes with "- uwu"; confirm this is intended
	sub_uncommon_copy[ind] = uncommon_str_i[k][len(uncommon_str_i[k]) - 1]
	uwu += 1


	if type(uncommon_str_i[k][len(uncommon_str_i[k]) - 1][0]) == list : sub_uncommon_copy = remove_all(sub_uncommon_copy, "#") # we remove all the # characters that are left
	sub_uncommon_str_temp[k].append(sub_uncommon_copy) # we add the updated uncommon substring to the final list
	# we add the uncommon substring to all the uncommon parts of all the previous strings
	for k in range(l):
	checking = shrink(" ".join(common)).split("#")
	nu = len(checking) - 1
	if temp_sentence[0] == "#":
	nu += 1
	if len(sub_uncommon_str_temp[k]) < nu:
	for q in range(0, len(uncommon_str_i[k]) - len(sub_uncommon_str_temp[k])):
	sub_uncommon_str_temp[k].insert(0, uncommon_str_i[k][q])
	uncommon_str_i[k] = sub_uncommon_str_temp[k]

	# tokens of the running common sentence left unvisited when the backtracking loop stopped
	if i != 0:
	temp_i = i
	sub_uncommon_str2 = [] # reset the sequence list
	while i > 0:
	sub_uncommon_str2.append(temp_sentence[i - 1])
	i -= 1
	sub_uncommon_str2.reverse()
	# add the sequence to the final list
	for k in range(l):
	if temp_i < len(temp_sentence):
	if temp_sentence[temp_i] == "#":
	f_unc = uncommon_str_i[k][len(uncommon_str_i[k]) - 1]
	uncommon_str_i[k].remove(f_unc)
	sub_uncommon_str2.extend(f_unc)
	uncommon_str_i[k].append(sub_uncommon_str2 if len(sub_uncommon_str2) > 1 else sub_uncommon_str2[0])
	uncommon_str_i[k] = remove_all(uncommon_str_i[k], "#")
	if common[0] != "#" and len(shrink(" ".join(common)).split("#")) < len(uncommon_str_i[0]):
	common.insert(0, "#")

	# we add the uncommon substring left to the current string
	if len(sub_uncommon_str) > 0:
	sub_uncommon_str.reverse()
	none_index = get_last(uncommon_str_i[l], "")
	if none_index != -1:
	uncommon_str_i[l][none_index] = sub_uncommon_str if len(sub_uncommon_str) > 1 else sub_uncommon_str[0] # add the uncommon string to the new sequence list
	else : uncommon_str_i[l].append(sub_uncommon_str if len(sub_uncommon_str) > 1 else sub_uncommon_str[0])
	if len(uncommon_str_i[l]) < len(uncommon_str_i[l - 1]):
	uncommon_str_i[l].append("")

	# tokens of sentence l left unvisited when the backtracking loop stopped
	if j != 0:
	sub_uncommon_str = [] # reset the sequence list
	while len(uncommon_str_i[l]) + 1 > len(uncommon_str_i[l - 1]) and "" in uncommon_str_i[l]:
	uncommon_str_i[l].remove("")
	while j > 0:
	sub_uncommon_str.append(sentences[l][j - 1])
	j -= 1
	sub_uncommon_str.reverse()
	# add the sequence to the final list
	uncommon_str_i[l].append(sub_uncommon_str if len(sub_uncommon_str) > 1 else sub_uncommon_str[0])
	if common[0] != "#" and len(shrink(" ".join(common)).split("#")) < len(uncommon_str_i[0]):
	common.insert(0, "#")

	temp_sentence = remove_all(common.copy(), "") # we update the common sentence

	# pad with empty strings until the current row and every previous row have the same number of slots
	for rt in range(0, l):
	while len(uncommon_str_i[l]) != len(uncommon_str_i[rt]):
	if len(uncommon_str_i[l]) < len(uncommon_str_i[rt]):
	uncommon_str_i[l].append("")
	else:
	uncommon_str_i[rt].append("")

	if len(uncommon_str_i[l]) != len(shrink(" ".join(common)).split("#")) - 1:
	for rt in range(0, l+1):
	if len(uncommon_str_i[rt]) < len(shrink(" ".join(common)).split("#")) - 1:
	uncommon_str_i[rt].append("")

	# N-gram distribution on the uncommon parts
	uncommon_str_i[0:l+1] = ngram_distribution(uncommon_str_i[0:l+1], shrink(" ".join(temp_sentence)))
	temp_sentence = shrink(" ".join(temp_sentence))

	# update the distribution of the uncommon parts based on the N-gram distribution
	# ("$" is a temporary placeholder so that freshly inserted markers are not matched again by the next replace)
	for i in range(len(uncommon_str_i[0]), 0, -1):
	mask = "$ " * len(uncommon_str_i[0][i-1])
	temp_sentence = temp_sentence.replace("#", mask, 1)
	temp_sentence = temp_sentence.replace("$", "#")
	temp_sentence = temp_sentence.split(" ")
	temp_sentence = remove_all(temp_sentence, "")

	# join the common list into a sentence
	common_sentence = " ".join(temp_sentence)
	# replace the # character with the [MASK] token
	common_sentence = common_sentence.replace("#", "[MASK]")

	# reverse the order of the uncommon substring lists
	for i in range(len(uncommon_str_i)):
	uncommon_str_i[i].reverse()

	# return the common sentence and the lists of uncommon substrings
	return common_sentence, uncommon_str_i

	def text_mining_algorithm(sentences):
	    """Tokenize ``sentences`` and split them into common and uncommon parts.

	    Returns the two-item result of ``common_and_uncommon_extraction`` applied
	    to the output of ``preprocess(sentences, 1)``.
	    """
	    tokens_per_sentence = preprocess(sentences, 1)
	    masked, leftovers = common_and_uncommon_extraction(tokens_per_sentence)
	    return masked, leftovers

	def similarity_analysis(masked_sentence, final_uncommon_str, nlp, fill_mask):
	    """Replace every ``[MASK]`` with the best-matching uncommon word.

	    Masks are filled one at a time, left to right. For each mask the
	    fill-mask model is queried on the current sentence, and the candidate
	    with the highest mean similarity to the predicted tokens is inserted.

	    Args:
	        masked_sentence: Sentence containing zero or more ``[MASK]`` tokens.
	        final_uncommon_str: One list per source sentence; item ``i`` of each
	            list is that sentence's candidate for the ``i``-th mask.
	        nlp: Callable mapping a string to an object that exposes
	            ``similarity(other)``.
	        fill_mask: Callable returning predictions as dicts with a
	            ``"token_str"`` key, either as a flat list or as a list of
	            lists, of which only the first inner list is used.

	    Returns:
	        The sentence with every ``[MASK]`` replaced.
	    """
	    mask_index = 0
	    while "[MASK]" in masked_sentence:
	        predictions = fill_mask(masked_sentence)
	        if isinstance(predictions[0], list):
	            # several masks left: keep the predictions of the first group only
	            predictions = predictions[0]
	        # Embed each predicted token once, not once per candidate.
	        predicted_docs = [nlp(prediction["token_str"]) for prediction in predictions]
	        candidates = [parts[mask_index] for parts in final_uncommon_str]
	        scores = []
	        for candidate in candidates:
	            # Embed the candidate once, not once per predicted token.
	            candidate_doc = nlp(candidate)
	            scores.append(np.mean([doc.similarity(candidate_doc) for doc in predicted_docs]))
	        best = candidates[np.argmax(scores)]
	        masked_sentence = masked_sentence.replace("[MASK]", best, 1)
	        mask_index += 1
	    return masked_sentence

	def text_combining(texts, nlp, fill_mask):
	    """Combine several noisy variants of a sentence into a single sentence.

	    The texts are reduced to a masked common sentence plus uncommon parts
	    by ``text_mining_algorithm``; the flattened uncommon parts are then used
	    by ``similarity_analysis`` to fill the masks.
	    """
	    masked, leftovers = text_mining_algorithm(texts)
	    candidates = flatten(leftovers)
	    return similarity_analysis(masked, candidates, nlp, fill_mask)

	if __name__ == "__main__":
	    # Demo run: merge three noisy variants of the same sentence.
	    variants = (
	        "I love to pay my video games in my free time, especially retro video games.",
	        "I love to play oreo games in my free thyme, especially retro video games.",
	        "Ay live to slay video vames in my free time, especially utro video games.",
	    )
	    # One single-column row per sentence.
	    sentences = np.array([[text] for text in variants])
	    nlp = spacy.load("en_core_web_md")
	    fill_mask = pipeline("fill-mask", model="distilbert-base-uncased")
	    print(text_combining(sentences, nlp, fill_mask))