Rakesh30 committed on
Commit
bfca98b
·
1 Parent(s): ebf5a46

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +21 -25
app.py CHANGED
@@ -3,7 +3,6 @@ import pickle
3
  import os
4
  from datasets import load_dataset
5
  embeddings_a = pickle.load(open(os.getcwd()+"/temp.pkl",'rb'))
6
- print(embeddings_a[:5])
7
  dataset = load_dataset("SandipPalit/Movie_Dataset")
8
  from InstructorEmbedding import INSTRUCTOR
9
  model = INSTRUCTOR('hkunlp/instructor-xl')
@@ -16,21 +15,6 @@ def getSimilarity(sentences_a,sentences_b):
16
  similarities = cosine_similarity(embeddings_a,embeddings_b)
17
  return similarities
18
 
19
- #get the indices of the np_array that has maximum score
20
- import heapq
21
- def get_top_k(h,k):
22
- output=[]
23
- for i in range(k):
24
- output.append(heapq.heappop(h)[1])
25
- return output
26
-
27
- def heapsort(np_array,k):
28
- h=[]
29
- for idx,score in enumerate(np_array):
30
- heapq.heappush(h,(-score,idx)) #max_heap
31
- return get_top_k(h,k)
32
-
33
-
34
  import nltk
35
  from nltk.corpus import stopwords
36
  from nltk.tokenize import word_tokenize, sent_tokenize
@@ -57,22 +41,34 @@ def get_pre_processed_data(size):
57
  sentences.extend(preprocess(idx,x,df.shape[0]))
58
  return sentences
59
 
 
 
 
 
 
 
 
 
 
60
  def get_top_k_matches(np_array,k,sentences):
61
- indices=[]
 
62
 
63
- for idx in heapsort(np_array,k):
64
- try:
65
- i=len(sentences[idx])-1 #based on the index find the sentence- reason for storing idx but not sentence
66
- except:
67
- print(idx)
68
  count=1
69
  number=0
70
- while sentences[idx][i]!='@': #o(8-10 digits)- o(1)
71
  number=number+count*int(sentences[idx][i])
72
  count*=10
73
  i-=1
74
- indices.append(number)
75
- #print(indices)
 
 
76
  return indices
77
 
78
 
 
3
  import os
4
  from datasets import load_dataset
5
  embeddings_a = pickle.load(open(os.getcwd()+"/temp.pkl",'rb'))
 
6
  dataset = load_dataset("SandipPalit/Movie_Dataset")
7
  from InstructorEmbedding import INSTRUCTOR
8
  model = INSTRUCTOR('hkunlp/instructor-xl')
 
15
  similarities = cosine_similarity(embeddings_a,embeddings_b)
16
  return similarities
17
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
  import nltk
19
  from nltk.corpus import stopwords
20
  from nltk.tokenize import word_tokenize, sent_tokenize
 
41
  sentences.extend(preprocess(idx,x,df.shape[0]))
42
  return sentences
43
 
44
#building_the_max_heap
def heapsort(np_array, k):
    """Build a max-heap of (-score, index) pairs over ``np_array``.

    Scores are negated so Python's min-heap pops the LARGEST score first;
    score ties pop in ascending index order (tuple comparison).

    np_array : iterable of similarity scores, index-aligned with the sentences.
    k        : unused here; kept so existing callers keep working.
    Returns  : a heapified list suitable for ``heapq.heappop``.
    """
    import heapq  # NOTE(review): this commit removed the module-level `import heapq`; keep the name in scope

    # heapify is O(n), cheaper than n individual heappush calls (O(n log n)),
    # and yields the exact same pop sequence.
    heap = [(-score, idx) for idx, score in enumerate(np_array)]
    heapq.heapify(heap)
    return heap
50
+
51
+
52
#return the id's of the movie
def get_top_k_matches(np_array, k, sentences):
    """Return the ids of (up to) the k highest-scoring movies, best first.

    Each entry of ``sentences`` is expected to carry the movie id as the
    digits after its final ``'@'`` (e.g. ``"some plot text@123"``) —
    TODO confirm against the preprocessing step that builds ``sentences``.
    Duplicate ids are skipped, so the result holds at most k DISTINCT ids.

    np_array : iterable of similarity scores, index-aligned with sentences.
    k        : number of distinct ids to return.
    sentences: list of id-tagged sentence strings.
    Returns  : list of ints, ordered from highest to lowest score.
    """
    import heapq  # NOTE(review): this commit removed the module-level `import heapq`; keep the name in scope

    # Max-heap via negated scores; heapify is O(n) vs n pushes at O(n log n).
    heap = [(-score, idx) for idx, score in enumerate(np_array)]
    heapq.heapify(heap)

    indices = []    # ids in score order (original also had a dead `indices = set()`)
    visited = set() # ids already emitted — O(1) duplicate check, list keeps the order
    while heap and len(indices) != k:
        _, idx = heapq.heappop(heap)
        # Parse the id after the last '@' directly instead of the manual
        # right-to-left digit-accumulation loop — same result, clearer.
        sentence = sentences[idx]
        number = int(sentence[sentence.rindex('@') + 1:])
        if number not in visited:
            indices.append(number)
            visited.add(number)
    return indices
73
 
74