ritikaaA committed on
Commit
512cd48
·
verified ·
1 Parent(s): c4d80ba

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +25 -8
app.py CHANGED
@@ -135,13 +135,14 @@ chunk_embeddings = create_embeddings(brand_chunks)
135
 
136
  # Define a function to find the most relevant text chunks for a given query, chunk_embeddings, and text_chunks
137
  def get_top_chunks(query, chunk_embeddings, text_chunks):
 
 
138
  # Convert the query text into a vector embedding
139
  query_embedding = model.encode(query, convert_to_tensor=True) # Complete this line
140
 
141
  # Normalize the query embedding to unit length for accurate similarity comparison. Normalize = bring to a length of 1
142
  query_embedding_normalized = query_embedding / query_embedding.norm()
143
 
144
- # Normalize all chunk embeddings to unit length for consistent comparison
145
  # chunk_embeddings_normalized = chunk_embeddings / chunk_embeddings.norm(dim=1, keepdim=True)
146
  if chunk_embeddings.ndim == 1:
147
  chunk_embeddings_normalized = chunk_embeddings / chunk_embeddings.norm()
@@ -154,22 +155,38 @@ def get_top_chunks(query, chunk_embeddings, text_chunks):
154
  print(similarities)
155
 
156
  # Find the indices of the 3 chunks with highest similarity scores
157
- top_indices = torch.topk(similarities, k= min(7, len(text_chunks))).indices
158
-
159
  # Print the top indices
160
  print(top_indices)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
161
 
162
  # Create an empty list to store the most relevant chunks
163
- top_chunks = []
164
 
165
  # Loop through the top indices and retrieve the corresponding text chunks
166
- for i in top_indices:
167
- relevant_info = brand_chunks[i]
168
- top_chunks.append(relevant_info)
169
 
170
 
171
  # Return the list of most relevant chunks
172
- return top_chunks
173
 
174
  # theme
175
  custom_theme = gr.themes.Soft(
 
135
 
136
  # Define a function to find the most relevant text chunks for a given query, chunk_embeddings, and text_chunks
137
  def get_top_chunks(query, chunk_embeddings, text_chunks):
138
+ if not text_chunks or chunk_embeddings is None or chunk_embeddings.size(0) == 0:
139
+ return []
140
  # Convert the query text into a vector embedding
141
  query_embedding = model.encode(query, convert_to_tensor=True) # Complete this line
142
 
143
  # Normalize the query embedding to unit length for accurate similarity comparison. Normalize = bring to a length of 1
144
  query_embedding_normalized = query_embedding / query_embedding.norm()
145
 
 
146
  # chunk_embeddings_normalized = chunk_embeddings / chunk_embeddings.norm(dim=1, keepdim=True)
147
  if chunk_embeddings.ndim == 1:
148
  chunk_embeddings_normalized = chunk_embeddings / chunk_embeddings.norm()
 
155
  print(similarities)
156
 
157
  # Find the indices of the 3 chunks with highest similarity scores
158
+ top_indices = torch.topk(similarities, k= min(3, len(text_chunks))).indices
159
+ candidate_chunks = [(i.item(), similarities[i].item()) for i in top_indices]
160
  # Print the top indices
161
  print(top_indices)
162
+ filtered_chunks = [(idx, score) for idx, score in candidate_chunks if score >= similarity_threshold]
163
+
164
+ def keyword_score(chunk_text, query_text):
165
+ q_words = set(query_text.lower().split())
166
+ c_words = set(chunk_text.lower().split())
167
+ return len(q_words & c_words)
168
+
169
+ reranked = sorted(
170
+ filtered_chunks,
171
+ key=lambda x: keyword_score(text_chunks[x[0]], query),
172
+ reverse=True
173
+ )
174
+
175
+ final_chunks = [text_chunks[idx] for idx, _ in reranked]
176
+
177
+ return final_chunks
178
 
179
  # Create an empty list to store the most relevant chunks
180
+ # top_chunks = []
181
 
182
  # Loop through the top indices and retrieve the corresponding text chunks
183
+ # for i in top_indices:
184
+ # relevant_info = brand_chunks[i]
185
+ # top_chunks.append(relevant_info)
186
 
187
 
188
  # Return the list of most relevant chunks
189
+ # return top_chunks
190
 
191
  # theme
192
  custom_theme = gr.themes.Soft(