Spaces:
Sleeping
Sleeping
Update app/policy_vector_db.py
Browse files- app/policy_vector_db.py +11 -12
app/policy_vector_db.py
CHANGED
|
@@ -23,12 +23,11 @@ class PolicyVectorDB:
|
|
| 23 |
self.collection_name = "neepco_dop_policies"
|
| 24 |
|
| 25 |
# Using a powerful open-source embedding model.
|
| 26 |
-
# Change 'cpu' to 'cuda' if a GPU is available for significantly faster embedding.
|
| 27 |
logger.info("Loading embedding model 'BAAI/bge-large-en-v1.5'. This may take a moment...")
|
| 28 |
self.embedding_model = SentenceTransformer('BAAI/bge-large-en-v1.5', device='cpu')
|
| 29 |
logger.info("Embedding model loaded successfully.")
|
| 30 |
|
| 31 |
-
self.collection = None
|
| 32 |
self.top_k_default = top_k_default
|
| 33 |
self.relevance_threshold = relevance_threshold
|
| 34 |
|
|
@@ -82,7 +81,7 @@ class PolicyVectorDB:
|
|
| 82 |
logger.info(f"Adding {len(new_chunks)} new chunks to the vector database...")
|
| 83 |
|
| 84 |
# Process in batches for efficiency
|
| 85 |
-
batch_size = 32
|
| 86 |
for i in range(0, len(new_chunks), batch_size):
|
| 87 |
batch = new_chunks[i:i + batch_size]
|
| 88 |
ids = [str(chunk['id']) for chunk in batch]
|
|
@@ -115,19 +114,19 @@ class PolicyVectorDB:
|
|
| 115 |
# Retrieve more results initially to allow for filtering
|
| 116 |
results = collection.query(
|
| 117 |
query_embeddings=query_embedding,
|
| 118 |
-
n_results=k * 2,
|
| 119 |
include=["documents", "metadatas", "distances"]
|
| 120 |
)
|
| 121 |
|
| 122 |
search_results = []
|
| 123 |
-
if results and results.get('documents') and results['documents']
|
| 124 |
-
for i, doc in enumerate(results['documents'][0]):
|
| 125 |
# The distance for normalized embeddings is often interpreted as 1 - cosine_similarity
|
| 126 |
-
relevance_score = 1 - results['distances'][
|
| 127 |
if relevance_score >= self.relevance_threshold:
|
| 128 |
search_results.append({
|
| 129 |
'text': doc,
|
| 130 |
-
'metadata': results['metadatas'][
|
| 131 |
'relevance_score': relevance_score
|
| 132 |
})
|
| 133 |
|
|
@@ -168,13 +167,13 @@ class PolicyVectorDB:
|
|
| 168 |
)
|
| 169 |
|
| 170 |
search_results = []
|
| 171 |
-
if results and results.get('documents') and results['documents']
|
| 172 |
-
for i, doc in enumerate(results['documents']):
|
| 173 |
-
relevance_score = 1 - results['distances'][i]
|
| 174 |
if relevance_score >= self.relevance_threshold:
|
| 175 |
search_results.append({
|
| 176 |
'text': doc,
|
| 177 |
-
'metadata': results['metadatas'][
|
| 178 |
'relevance_score': relevance_score
|
| 179 |
})
|
| 180 |
|
|
|
|
| 23 |
self.collection_name = "neepco_dop_policies"
|
| 24 |
|
| 25 |
# Using a powerful open-source embedding model.
|
|
|
|
| 26 |
logger.info("Loading embedding model 'BAAI/bge-large-en-v1.5'. This may take a moment...")
|
| 27 |
self.embedding_model = SentenceTransformer('BAAI/bge-large-en-v1.5', device='cpu')
|
| 28 |
logger.info("Embedding model loaded successfully.")
|
| 29 |
|
| 30 |
+
self.collection = None # Initialize collection as None for lazy loading
|
| 31 |
self.top_k_default = top_k_default
|
| 32 |
self.relevance_threshold = relevance_threshold
|
| 33 |
|
|
|
|
| 81 |
logger.info(f"Adding {len(new_chunks)} new chunks to the vector database...")
|
| 82 |
|
| 83 |
# Process in batches for efficiency
|
| 84 |
+
batch_size = 32 # Reduced batch size for potentially large embeddings
|
| 85 |
for i in range(0, len(new_chunks), batch_size):
|
| 86 |
batch = new_chunks[i:i + batch_size]
|
| 87 |
ids = [str(chunk['id']) for chunk in batch]
|
|
|
|
| 114 |
# Retrieve more results initially to allow for filtering
|
| 115 |
results = collection.query(
|
| 116 |
query_embeddings=query_embedding,
|
| 117 |
+
n_results=k * 2, # Retrieve more to filter by threshold
|
| 118 |
include=["documents", "metadatas", "distances"]
|
| 119 |
)
|
| 120 |
|
| 121 |
search_results = []
|
| 122 |
+
if results and results.get('documents') and results['documents']:
|
| 123 |
+
for i, doc in enumerate(results['documents'][0]): # ← Fixed indexing
|
| 124 |
# The distance for normalized embeddings is often interpreted as 1 - cosine_similarity
|
| 125 |
+
relevance_score = 1 - results['distances'][i] # ← Fixed indexing
|
| 126 |
if relevance_score >= self.relevance_threshold:
|
| 127 |
search_results.append({
|
| 128 |
'text': doc,
|
| 129 |
+
'metadata': results['metadatas'][i], # ← Fixed indexing
|
| 130 |
'relevance_score': relevance_score
|
| 131 |
})
|
| 132 |
|
|
|
|
| 167 |
)
|
| 168 |
|
| 169 |
search_results = []
|
| 170 |
+
if results and results.get('documents') and results['documents']:
|
| 171 |
+
for i, doc in enumerate(results['documents'][0]): # ← Fixed indexing
|
| 172 |
+
relevance_score = 1 - results['distances'][i] # ← Fixed indexing
|
| 173 |
if relevance_score >= self.relevance_threshold:
|
| 174 |
search_results.append({
|
| 175 |
'text': doc,
|
| 176 |
+
'metadata': results['metadatas'][i], # ← Fixed indexing
|
| 177 |
'relevance_score': relevance_score
|
| 178 |
})
|
| 179 |
|