Hugging Face Spaces — build error.
Commit: Update utils/vector_store.py (utils/vector_store.py, +20 −35)
File changed: utils/vector_store.py
|
@@ -112,47 +112,32 @@ class VectorStore:
|
|
| 112 |
|
| 113 |
return chunks
|
| 114 |
|
| 115 |
-
def similarity_search(
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
"""Enhanced similarity search with filtering and re-ranking."""
|
| 123 |
-
# Encode query
|
| 124 |
query_vector = self.model.encode(query, convert_to_tensor=True)
|
| 125 |
-
|
| 126 |
-
# Calculate similarities and filter results
|
| 127 |
results = []
|
|
|
|
| 128 |
for doc in self.vectors:
|
| 129 |
-
# Apply filters if specified
|
| 130 |
-
if filter_criteria:
|
| 131 |
-
skip = False
|
| 132 |
-
for key, values in filter_criteria.items():
|
| 133 |
-
doc_value = self._get_nested_dict_value(doc["metadata"], key)
|
| 134 |
-
if doc_value not in values:
|
| 135 |
-
skip = True
|
| 136 |
-
break
|
| 137 |
-
if skip:
|
| 138 |
-
continue
|
| 139 |
-
|
| 140 |
-
# Calculate similarity
|
| 141 |
similarity = util.pytorch_cos_sim(query_vector, doc["vector"]).item()
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
# Sort by similarity
|
| 149 |
results.sort(key=lambda x: x["score"], reverse=True)
|
| 150 |
-
|
| 151 |
-
# Re-rank results based on chunk position and metadata
|
| 152 |
-
reranked_results = self._rerank_results(results[:k*2], query)
|
| 153 |
-
|
| 154 |
-
return reranked_results[:k]
|
| 155 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 156 |
def _rerank_results(self, results: List[Dict], query: str) -> List[Dict]:
|
| 157 |
"""Re-rank results considering chunk position and metadata relevance."""
|
| 158 |
for result in results:
|
|
|
|
| 112 |
|
| 113 |
return chunks
|
| 114 |
|
| 115 |
+
def similarity_search(self, query: str, k: int = 3) -> List[Dict]:
    """Perform similarity search with error handling.

    Encodes *query* with the store's sentence-transformer model, scores it
    against every stored vector with cosine similarity, and returns the top
    ``k`` matches (highest score first) as dicts with ``text``, ``metadata``
    and ``score`` keys. Returns ``[]`` when the store is empty or when any
    step fails (the failure is surfaced to the UI via ``st.error``).
    """
    try:
        # Nothing indexed yet — nothing to search.
        if not self.vectors:
            return []

        query_vector = self.model.encode(query, convert_to_tensor=True)

        # Score every stored chunk against the encoded query.
        scored = [
            {
                "text": entry["text"],
                "metadata": entry["metadata"],
                "score": util.pytorch_cos_sim(query_vector, entry["vector"]).item(),
            }
            for entry in self.vectors
        ]

        # Highest similarity first; keep only the top k hits.
        scored.sort(key=lambda item: item["score"], reverse=True)
        return scored[:k]
    except Exception as e:
        # Boundary handler: report to the Streamlit UI and degrade to "no results".
        st.error(f"Error in similarity search: {str(e)}")
        return []
|
| 140 |
+
|
| 141 |
def _rerank_results(self, results: List[Dict], query: str) -> List[Dict]:
|
| 142 |
"""Re-rank results considering chunk position and metadata relevance."""
|
| 143 |
for result in results:
|