Spaces:
Runtime error
Runtime error
File size: 3,071 Bytes
8d3df09 7d3f085 d051ccf 8d3df09 23228a6 8d3df09 b63d1e9 8d3df09 23228a6 8d3df09 dc588d4 8d3df09 23228a6 8d3df09 bfca98b 8d3df09 bfca98b 55ab30c bfca98b 8d3df09 bfca98b 8d3df09 bfca98b 8d3df09 23228a6 8d3df09 d9d1320 8d3df09 493d46f ba3e6d5 e071757 493d46f 4b7d789 8d3df09 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 |
import gradio as gr
import pickle
import os
from datasets import load_dataset
from gradio.components import Label
from InstructorEmbedding import INSTRUCTOR
import heapq
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer
import pandas as pd
# Movie corpus (titles + plot overviews) fetched from the HF Hub at import time.
dataset = load_dataset("SandipPalit/Movie_Dataset")
# Sentence-embedding model used to encode query text.
# NOTE(review): getSimilarity loads the corpus embeddings from temp.pkl —
# confirm that pickle was produced with this same model/checkpoint.
model = INSTRUCTOR('hkunlp/instructor-xl')
def getSimilarity(sentences_a, sentences_b):
    """Return the cosine-similarity matrix between corpus and query embeddings.

    Parameters:
        sentences_a: corpus sentences; only encoded when the on-disk
            embedding cache (temp.pkl) is missing.
        sentences_b: query sentence(s) — always encoded fresh.

    Returns:
        numpy array of shape (len(corpus embeddings), len(sentences_b)).
    """
    cache_path = os.path.join(os.getcwd(), "temp.pkl")
    if os.path.exists(cache_path):
        # Pre-computed corpus embeddings; avoids re-encoding on every query.
        # NOTE(review): pickle.load is unsafe on untrusted files — confirm
        # temp.pkl is generated locally by this app.
        with open(cache_path, 'rb') as fh:
            embeddings_a = pickle.load(fh)
    else:
        # Fallback: the original code ignored sentences_a entirely and
        # crashed with FileNotFoundError when the cache was absent.
        embeddings_a = model.encode(sentences_a)
    embeddings_b = model.encode(sentences_b)
    return cosine_similarity(embeddings_a, embeddings_b)
# One-time NLTK resource downloads required by preprocess():
nltk.download('punkt')      # sentence tokenizer (sent_tokenize)
nltk.download('stopwords')  # English stopword list
nltk.download('wordnet')    # lemmatizer lexicon (WordNetLemmatizer)
def preprocess(idx, text, total_length):
    """Split `text` into sentences, remove English stopwords, lemmatize the
    remaining words, and tag each sentence with its zero-padded movie id.

    Parameters:
        idx: integer id of the movie this text belongs to.
        text: the movie's plot text.
        total_length: corpus size; its digit count fixes the zero-padding
            width so every '@<id>' suffix has the same length.

    Returns:
        list[str] — one cleaned sentence per element, each ending in
        '@' followed by the zero-padded id (parsed later by
        get_top_k_matches).
    """
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    # zfill reproduces the original manual '0'*(width-len) padding.
    tag = '@' + str(idx).zfill(len(str(total_length)))
    output = []
    for sentence in sent_tokenize(text):
        # Compare lowercased: NLTK stopwords are all lowercase, so the
        # original case-sensitive check let capitalized stopwords
        # ("The", "And", ...) slip through.
        cleaned = ' '.join(
            lemmatizer.lemmatize(word)
            for word in sentence.split()
            if word.lower() not in stop_words
        )
        output.append(cleaned + tag)
    return output
def get_pre_processed_data(size):
    """Preprocess the first `size` movie plots into tagged sentences.

    Each plot is split/cleaned by preprocess(); the per-plot sentence
    lists are flattened into a single list. The padding width passed to
    preprocess is based on the full dataset size (df.shape[0]).
    """
    corpus_size = df.shape[0]
    return [
        tagged_sentence
        for row_id, plot in enumerate(df['Plot'].head(size).tolist())
        for tagged_sentence in preprocess(row_id, plot, corpus_size)
    ]
#building_the_max_heap
def heapsort(np_array, k):
    """Build a max-heap of (negated score, index) pairs over `np_array`.

    Negating the scores turns Python's min-heap into a max-heap, so
    popping yields the highest-scoring indices first. `k` is accepted
    for interface compatibility but not used here — the caller pops as
    many entries as it needs.
    """
    negated = [(-value, position) for position, value in enumerate(np_array)]
    heapq.heapify(negated)
    return negated
#return the id's of the movie
def get_top_k_matches(np_array, k, sentences):
    """Return the ids of up to `k` distinct movies, best-scoring first.

    Parameters:
        np_array: per-sentence similarity scores, aligned with `sentences`.
        k: maximum number of distinct movie ids to return.
        sentences: tagged sentences; each ends with '@<zero-padded id>'
            as produced by preprocess().

    Returns:
        list[int] — movie ids in descending score order, duplicates
        (several sentences from the same movie) skipped.
    """
    # Max-heap of (negated score, sentence index) so pops come highest-first.
    heap = [(-score, idx) for idx, score in enumerate(np_array)]
    heapq.heapify(heap)
    seen = set()       # ids already emitted — duplicates are skipped
    ordered_ids = []   # separate list preserves the ranking order
    while heap and len(ordered_ids) != k:
        _, idx = heapq.heappop(heap)
        # int() handles the zero padding; replaces the original manual
        # reversed-digit reconstruction loop.
        movie_id = int(sentences[idx].rsplit('@', 1)[1])
        if movie_id not in seen:
            seen.add(movie_id)
            ordered_ids.append(movie_id)
    return ordered_ids
# Flat view of the training split: one row per movie (Title, Plot=Overview).
df=pd.DataFrame({"Title":dataset['train']['Title'],"Plot":dataset['train']['Overview']})
def getOutput(text, size=1000):
    """Return display strings for the 5 movies most similar to `text`.

    Builds the tagged sentence corpus over the first `size` plots, scores
    it against the query, and formats 'title = ... Plot = ...' lines for
    the top matches.
    """
    corpus = get_pre_processed_data(int(size))
    scores = getSimilarity(corpus, [text])
    return [
        "title = " + df.iloc[movie_id]['Title'] + " " * 5 + " Plot = " + df.iloc[movie_id]['Plot']
        for movie_id in get_top_k_matches(scores, 5, corpus)
    ]
# Gradio UI wiring. gr.inputs.* was removed in Gradio 3.x — using it raises
# AttributeError at import time (the likely cause of this Space's runtime
# error); components are now top-level (gr.Textbox).
iface = gr.Interface(
    fn=getOutput,
    inputs=[gr.Textbox(label="Text")],
    # Five Label outputs, one per returned match string.
    outputs=[Label() for _ in range(5)],
    examples=[['After doing the list of experiments A mad scientist declares himself as the god'], ["Three men fight for the girl's love"]],
)
iface.launch(debug=True)
|