Kalpokoch committed on
Commit
0c1cd5d
·
verified ·
1 Parent(s): 7118166

Update app/policy_vector_db.py

Browse files
Files changed (1) hide show
  1. app/policy_vector_db.py +11 -12
app/policy_vector_db.py CHANGED
@@ -23,12 +23,11 @@ class PolicyVectorDB:
23
  self.collection_name = "neepco_dop_policies"
24
 
25
  # Using a powerful open-source embedding model.
26
- # Change 'cpu' to 'cuda' if a GPU is available for significantly faster embedding.
27
  logger.info("Loading embedding model 'BAAI/bge-large-en-v1.5'. This may take a moment...")
28
  self.embedding_model = SentenceTransformer('BAAI/bge-large-en-v1.5', device='cpu')
29
  logger.info("Embedding model loaded successfully.")
30
 
31
- self.collection = None # Initialize collection as None for lazy loading
32
  self.top_k_default = top_k_default
33
  self.relevance_threshold = relevance_threshold
34
 
@@ -82,7 +81,7 @@ class PolicyVectorDB:
82
  logger.info(f"Adding {len(new_chunks)} new chunks to the vector database...")
83
 
84
  # Process in batches for efficiency
85
- batch_size = 32 # Reduced batch size for potentially large embeddings
86
  for i in range(0, len(new_chunks), batch_size):
87
  batch = new_chunks[i:i + batch_size]
88
  ids = [str(chunk['id']) for chunk in batch]
@@ -115,19 +114,19 @@ class PolicyVectorDB:
115
  # Retrieve more results initially to allow for filtering
116
  results = collection.query(
117
  query_embeddings=query_embedding,
118
- n_results=k * 2, # Retrieve more to filter by threshold
119
  include=["documents", "metadatas", "distances"]
120
  )
121
 
122
  search_results = []
123
- if results and results.get('documents') and results['documents'][0]:
124
- for i, doc in enumerate(results['documents'][0]):
125
  # The distance for normalized embeddings is often interpreted as 1 - cosine_similarity
126
- relevance_score = 1 - results['distances'][0][i]
127
  if relevance_score >= self.relevance_threshold:
128
  search_results.append({
129
  'text': doc,
130
- 'metadata': results['metadatas'][0][i],
131
  'relevance_score': relevance_score
132
  })
133
 
@@ -168,13 +167,13 @@ class PolicyVectorDB:
168
  )
169
 
170
  search_results = []
171
- if results and results.get('documents') and results['documents'][0]:
172
- for i, doc in enumerate(results['documents']):
173
- relevance_score = 1 - results['distances'][i]
174
  if relevance_score >= self.relevance_threshold:
175
  search_results.append({
176
  'text': doc,
177
- 'metadata': results['metadatas'][0][i],
178
  'relevance_score': relevance_score
179
  })
180
 
 
23
  self.collection_name = "neepco_dop_policies"
24
 
25
  # Using a powerful open-source embedding model.
 
26
  logger.info("Loading embedding model 'BAAI/bge-large-en-v1.5'. This may take a moment...")
27
  self.embedding_model = SentenceTransformer('BAAI/bge-large-en-v1.5', device='cpu')
28
  logger.info("Embedding model loaded successfully.")
29
 
30
+ self.collection = None # Initialize collection as None for lazy loading
31
  self.top_k_default = top_k_default
32
  self.relevance_threshold = relevance_threshold
33
 
 
81
  logger.info(f"Adding {len(new_chunks)} new chunks to the vector database...")
82
 
83
  # Process in batches for efficiency
84
+ batch_size = 32 # Reduced batch size for potentially large embeddings
85
  for i in range(0, len(new_chunks), batch_size):
86
  batch = new_chunks[i:i + batch_size]
87
  ids = [str(chunk['id']) for chunk in batch]
 
114
  # Retrieve more results initially to allow for filtering
115
  results = collection.query(
116
  query_embeddings=query_embedding,
117
+ n_results=k * 2, # Retrieve more to filter by threshold
118
  include=["documents", "metadatas", "distances"]
119
  )
120
 
121
  search_results = []
122
+ if results and results.get('documents') and results['documents']:
123
+ for i, doc in enumerate(results['documents'][0]): # ← Fixed indexing
124
  # The distance for normalized embeddings is often interpreted as 1 - cosine_similarity
125
+ relevance_score = 1 - results['distances'][i] # ← Fixed indexing
126
  if relevance_score >= self.relevance_threshold:
127
  search_results.append({
128
  'text': doc,
129
+ 'metadata': results['metadatas'][i], # ← Fixed indexing
130
  'relevance_score': relevance_score
131
  })
132
 
 
167
  )
168
 
169
  search_results = []
170
+ if results and results.get('documents') and results['documents']:
171
+ for i, doc in enumerate(results['documents'][0]): # ← Fixed indexing
172
+ relevance_score = 1 - results['distances'][i] # ← Fixed indexing
173
  if relevance_score >= self.relevance_threshold:
174
  search_results.append({
175
  'text': doc,
176
+ 'metadata': results['metadatas'][i], # ← Fixed indexing
177
  'relevance_score': relevance_score
178
  })
179