# app.py — semantic movie search demo (Hugging Face Space by Rakesh30, commit e071757)
import gradio as gr
import pickle
import os
from datasets import load_dataset
from gradio.components import Label
from InstructorEmbedding import INSTRUCTOR
import heapq
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer
import pandas as pd
# Load the movie corpus (titles + plot overviews) from the Hugging Face Hub,
# and the INSTRUCTOR-XL sentence-embedding model used for query encoding.
dataset = load_dataset("SandipPalit/Movie_Dataset")
model = INSTRUCTOR('hkunlp/instructor-xl')
def getSimilarity(sentences_a, sentences_b):
    """Return the cosine-similarity matrix between two sentence lists.

    Embeddings for ``sentences_a`` (the corpus side) are normally read from a
    precomputed cache at ``<cwd>/temp.pkl``; if that cache is missing, we fall
    back to encoding ``sentences_a`` directly so the app still runs on a fresh
    checkout (the original ignored ``sentences_a`` entirely and crashed without
    the cache).  ``sentences_b`` (the query side) is always encoded.

    Returns an array of shape (len(embeddings_a), len(sentences_b)).
    """
    cache_path = os.path.join(os.getcwd(), "temp.pkl")
    if os.path.exists(cache_path):
        # NOTE(review): pickle.load on an untrusted file is unsafe; this cache
        # is presumably produced locally by this project — verify provenance.
        with open(cache_path, 'rb') as fh:  # 'with' fixes the file-handle leak
            embeddings_a = pickle.load(fh)
    else:
        # Cache miss: encode the corpus sentences on the fly (slower, correct).
        embeddings_a = model.encode(sentences_a)
    embeddings_b = model.encode(sentences_b)
    return cosine_similarity(embeddings_a, embeddings_b)
# Fetch the NLTK resources that preprocess() depends on: the sentence
# tokenizer, the English stopword list, and WordNet data for the lemmatizer.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
def preprocess(idx, text, total_length):
    """Split *text* into sentences, drop stopwords, lemmatize each word, and
    tag every sentence with the zero-padded record index *idx*.

    The tag is appended as ``@<zero-padded idx>`` so downstream ranking code
    can recover which record a sentence came from.  Returns the list of
    tagged, cleaned sentences.
    """
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    # Zero-pad idx to the digit width of total_length so tags are fixed-width.
    tag = '@' + '0' * (len(str(total_length)) - len(str(idx))) + str(idx)
    cleaned = []
    for sentence in sent_tokenize(text):
        kept = [lemmatizer.lemmatize(w) for w in sentence.split() if w not in stop_words]
        cleaned.append(' '.join(kept) + tag)
    return cleaned
def get_pre_processed_data(size):
    """Preprocess the plots of the first *size* movies in the global ``df``.

    Returns a flat list of tagged sentences (see ``preprocess``).  The tag
    width is derived from the full dataframe length so record ids stay
    fixed-width regardless of *size*.
    """
    tagged = []
    total = df.shape[0]
    for row_id, plot in enumerate(df['Plot'].head(size).tolist()):
        tagged += preprocess(row_id, plot, total)
    return tagged
#building_the_max_heap
def heapsort(np_array, k):
    """Build a max-heap of (negated score, index) pairs over *np_array*.

    Scores are negated because ``heapq`` is a min-heap, so successive pops
    yield entries in descending score order (ties broken by ascending index).

    ``k`` is kept for interface compatibility but is intentionally unused:
    the caller may need to pop more than *k* entries to collect *k* distinct
    movie ids, so the entire array is heapified.
    """
    # heapify is O(n) — cheaper than n individual heappush calls (O(n log n)),
    # and yields the exact same pop order.
    h = [(-score, idx) for idx, score in enumerate(np_array)]
    heapq.heapify(h)
    return h
#return the id's of the movie
def get_top_k_matches(np_array, k, sentences):
    """Return the ids of the top-*k* distinct movies by sentence score.

    np_array  -- per-sentence similarity scores (parallel to *sentences*)
    k         -- number of distinct movie ids wanted
    sentences -- tagged sentences of the form "<text>@<zero-padded movie id>"

    Sentences are visited in descending score order; each sentence's movie id
    is parsed from the suffix after its last '@' marker, and duplicate ids are
    skipped, so the result lists distinct ids best-score-first.  (Fixes the
    dead ``indices = set()`` that was immediately overwritten, and replaces
    the hand-rolled reversed-digit parse with ``rsplit`` + ``int``.)
    """
    # Max-heap via negated scores; heapify is O(n).
    h = [(-score, idx) for idx, score in enumerate(np_array)]
    heapq.heapify(h)
    top_ids = []
    visited = set()
    while h and len(top_ids) != k:
        _, idx = heapq.heappop(h)
        # The id is the run of (zero-padded) digits after the last '@'.
        movie_id = int(sentences[idx].rsplit('@', 1)[1])
        if movie_id not in visited:  # keep first (highest-scoring) hit only
            top_ids.append(movie_id)
            visited.add(movie_id)
    return top_ids
# Flatten the HF dataset's train split into the two columns the app uses:
# the movie Title and its Plot (the dataset's "Overview" field).
df=pd.DataFrame({"Title":dataset['train']['Title'],"Plot":dataset['train']['Overview']})
def getOutput(text, size=1000):
    """Gradio callback: return the 5 best-matching movies for the query *text*.

    text -- free-text query from the UI textbox
    size -- how many movies from the head of the corpus to search (default 1000)

    Always returns exactly 5 strings — padded with "" when fewer distinct
    matches are found — so the return count always lines up with the
    interface's 5 Label output widgets (returning fewer would error).
    """
    sentences = get_pre_processed_data(int(size))
    scores = getSimilarity(sentences, [text])
    output = []
    for idx in get_top_k_matches(scores, 5, sentences):
        output.append("title = " + df.iloc[idx]['Title'] + " " * 5 + " Plot = " + df.iloc[idx]['Plot'])
    # Pad so the number of return values matches the 5 output components.
    output += [""] * (5 - len(output))
    return output
# Wire the search function into a simple Gradio UI: one query textbox in,
# five Label widgets out (one per top-5 match).
# NOTE(review): gr.inputs.Textbox is the legacy (pre-3.x) namespace; newer
# gradio versions expose gr.Textbox directly — confirm the pinned version.
iface = gr.Interface(fn=getOutput,
inputs=[gr.inputs.Textbox(label="Text")],
outputs=[Label() for i in range(5)],
examples=[['After doing the list of experiments A mad scientist declares himself as the god'],["Three men fight for the girl's love"]]
)
iface.launch(debug=True)