Spaces:

Rakesh30
/

Sentence_Embedding-App

Runtime error

App Files Files Community

Rakesh30 committed on Apr 28, 2023

Commit

bfca98b

1 Parent(s): ebf5a46

Update app.py

Browse files

Files changed (1) hide show

app.py +21 -25

app.py CHANGED Viewed

@@ -3,7 +3,6 @@ import pickle
 import os
 from datasets import load_dataset
 embeddings_a = pickle.load(open(os.getcwd()+"/temp.pkl",'rb'))
-print(embeddings_a[:5])
 dataset = load_dataset("SandipPalit/Movie_Dataset")
 from InstructorEmbedding import INSTRUCTOR
 model = INSTRUCTOR('hkunlp/instructor-xl')
@@ -16,21 +15,6 @@ def getSimilarity(sentences_a,sentences_b):
   similarities = cosine_similarity(embeddings_a,embeddings_b)
   return similarities
-#get the indices of the np_array that has maximum score
-import heapq
-def get_top_k(h,k):
-  output=[]
-  for i in range(k):
-     output.append(heapq.heappop(h)[1])
-  return output
-def heapsort(np_array,k):
-  h=[]
-  for idx,score in enumerate(np_array):
-    heapq.heappush(h,(-score,idx))                    #max_heap
-  return get_top_k(h,k)
 import nltk
 from nltk.corpus import stopwords
 from nltk.tokenize import word_tokenize, sent_tokenize
@@ -57,22 +41,34 @@ def get_pre_processed_data(size):
      sentences.extend(preprocess(idx,x,df.shape[0]))
   return sentences
 def get_top_k_matches(np_array,k,sentences):
-   indices=[]
-   for idx in heapsort(np_array,k):
-     try:
-        i=len(sentences[idx])-1                  #based on the index find the sentence- reason for storing idx but not sentence
-     except:
-       print(idx)
      count=1
      number=0
-     while sentences[idx][i]!='@':                     #o(8-10 digits)- o(1)
        number=number+count*int(sentences[idx][i])
        count*=10
        i-=1
-     indices.append(number)
-   #print(indices)
    return indices

 import os
 from datasets import load_dataset
 embeddings_a = pickle.load(open(os.getcwd()+"/temp.pkl",'rb'))
 dataset = load_dataset("SandipPalit/Movie_Dataset")
 from InstructorEmbedding import INSTRUCTOR
 model = INSTRUCTOR('hkunlp/instructor-xl')
   similarities = cosine_similarity(embeddings_a,embeddings_b)
   return similarities
 import nltk
 from nltk.corpus import stopwords
 from nltk.tokenize import word_tokenize, sent_tokenize
      sentences.extend(preprocess(idx,x,df.shape[0]))
   return sentences
def heapsort(np_array,k):
    """Build a max-heap over the scores in ``np_array``.

    Python's ``heapq`` is a min-heap, so every score is negated; the entry at
    the top of the heap is therefore the highest-scoring one.

    Args:
        np_array: Iterable of comparable scores; the position of each score
            is stored next to it.
        k: Unused. Kept so existing callers that pass it keep working.

    Returns:
        A list of ``(-score, index)`` tuples satisfying the heap invariant,
        ready to be consumed with ``heapq.heappop``.
    """
    # Imported locally so the function does not depend on a module-level
    # ``import heapq`` being present.
    import heapq

    h = [(-score, idx) for idx, score in enumerate(np_array)]
    heapq.heapify(h)  # O(n) bulk build instead of n separate pushes
    return h
def _trailing_id(sentence):
    """Parse the decimal ID that follows the last '@' in ``sentence``."""
    _, marker, digits = sentence.rpartition('@')
    if not marker:
        raise ValueError(f"sentence has no '@<id>' suffix: {sentence!r}")
    if not digits:
        return 0  # a bare trailing '@' has always been read as ID 0
    if not digits.isdecimal():
        raise ValueError(f"non-numeric ID after '@' in sentence: {sentence!r}")
    return int(digits)


def get_top_k_matches(np_array,k,sentences):
    """Return the IDs of the ``k`` best-scoring sentences, best first.

    Each sentence carries its ID as a ``@<digits>`` suffix. Several sentences
    may share one ID, so candidates are taken in descending score order and an
    ID is reported only the first time it is seen.

    Args:
        np_array: Iterable of scores; ``np_array[i]`` is the score of
            ``sentences[i]``.
        k: Maximum number of distinct IDs to return.
        sentences: Sequence of strings, each ending in ``@<digits>``.

    Returns:
        A list of at most ``k`` distinct integer IDs ordered from highest to
        lowest score. Fewer are returned when the candidates run out; an
        empty list is returned when ``k`` is zero or negative.

    Raises:
        ValueError: If a visited sentence lacks a valid ``@<digits>`` suffix.
    """
    # Imported locally so the function does not depend on a module-level
    # ``import heapq`` being present.
    import heapq

    # heapq is a min-heap; negating the score makes the best match pop first.
    heap = [(-score, idx) for idx, score in enumerate(np_array)]
    heapq.heapify(heap)

    indices = []     # result, in rank order
    visited = set()  # same IDs, for O(1) duplicate checks
    while heap and len(indices) < k:
        _, idx = heapq.heappop(heap)
        number = _trailing_id(sentences[idx])
        if number not in visited:
            visited.add(number)
            indices.append(number)
    return indices