TilanB commited on
Commit
8c63c58
·
verified ·
1 Parent(s): c956e36

Update search_engine/indexer.py

Browse files
Files changed (1) hide show
  1. search_engine/indexer.py +46 -12
search_engine/indexer.py CHANGED
@@ -138,9 +138,12 @@ class RetrieverBuilder:
138
  self.embeddings = GoogleGenerativeAIEmbeddings(
139
  model="models/text-embedding-004",
140
  google_api_key=parameters.GOOGLE_API_KEY,
141
- batch_size=32, # Enable batching for faster embedding computation
142
  )
143
  self._retriever_cache = {} # {docset_hash: retriever}
 
 
 
144
 
145
  def _hash_docs(self, docs):
146
  # Create a hash of all document contents and metadata
@@ -167,6 +170,16 @@ class RetrieverBuilder:
167
  if not docs:
168
  raise ValueError("No documents provided")
169
 
 
 
 
 
 
 
 
 
 
 
170
  # Use session-specific directory if provided (for multi-user isolation)
171
  if session_id:
172
  chroma_dir = os.path.join(parameters.CHROMA_DB_PATH, f"session_{session_id}")
@@ -181,11 +194,19 @@ class RetrieverBuilder:
181
  manifest = load_manifest(manifest_path)
182
 
183
  t_vector_start = time.time()
184
- vector_store = Chroma(
185
- embedding_function=self.embeddings,
186
- persist_directory=chroma_dir,
187
- )
188
 
 
 
 
 
 
 
 
 
 
 
 
 
189
  to_add = []
190
  ids_to_add = []
191
  to_delete_ids = []
@@ -246,19 +267,28 @@ class RetrieverBuilder:
246
 
247
  # Create BM25 retriever
248
  t_bm25_start = time.time()
249
- texts = [doc.page_content for doc in docs]
250
- metadatas = [doc.metadata for doc in docs]
251
- bm25_retriever = BM25Retriever.from_texts(texts=texts, metadatas=metadatas)
252
- bm25_retriever.k = parameters.BM25_SEARCH_K
 
 
 
 
 
 
 
 
 
253
  t_bm25_end = time.time()
254
  logger.info(f"[PROFILE] BM25 retriever creation: {t_bm25_end - t_bm25_start:.2f}s")
255
- logger.debug(f"BM25 indexed {len(texts)} texts, k={bm25_retriever.k}")
256
 
257
  t_vec_retr_start = time.time()
258
  vector_retriever = vector_store.as_retriever(
259
  search_type="mmr",
260
  search_kwargs={
261
- "k": parameters.VECTOR_Search_K_CHROMA,
262
  "fetch_k": parameters.VECTOR_FETCH_K,
263
  "lambda_mult": 0.7,
264
  },
@@ -270,7 +300,7 @@ class RetrieverBuilder:
270
  t_ensemble_start = time.time()
271
  hybrid_retriever = EnsembleRetriever(
272
  retrievers=[bm25_retriever, vector_retriever],
273
- weights=parameters.HYBRID_RETRIEVER_WEIGHTS,
274
  k=parameters.VECTOR_SEARCH_K,
275
  )
276
  t_ensemble_end = time.time()
@@ -278,4 +308,8 @@ class RetrieverBuilder:
278
  logger.info(f"Hybrid retriever created (k={parameters.VECTOR_SEARCH_K})")
279
  logger.info(f"[PROFILE] Total hybrid retriever build: {t_ensemble_end - t_vector_start:.2f}s")
280
 
 
 
 
 
281
  return hybrid_retriever
 
138
  self.embeddings = GoogleGenerativeAIEmbeddings(
139
  model="models/text-embedding-004",
140
  google_api_key=parameters.GOOGLE_API_KEY,
141
+ batch_size=100, # Increased from 32 to 100 for faster embedding (Google supports up to 100)
142
  )
143
  self._retriever_cache = {} # {docset_hash: retriever}
144
+ self._bm25_cache = {} # {docset_hash: bm25_retriever} - NEW: Cache BM25 retrievers
145
+ self._vector_store_cache = {} # {chroma_dir: vector_store} - NEW: Reuse ChromaDB connections
146
+ logger.debug("RetrieverBuilder initialized with caching enabled")
147
 
148
  def _hash_docs(self, docs):
149
  # Create a hash of all document contents and metadata
 
170
  if not docs:
171
  raise ValueError("No documents provided")
172
 
173
+ # Generate cache key from document content hashes
174
+ cache_key = self._hash_docs(docs)
175
+
176
+ # Check retriever cache first (10-200× speedup for repeat queries)
177
+ if cache_key in self._retriever_cache:
178
+ logger.info(f"✅ Using cached retriever for docset {cache_key[:8]}... (CACHE HIT)")
179
+ return self._retriever_cache[cache_key]
180
+
181
+ logger.debug(f"Cache miss for docset {cache_key[:8]}..., building new retriever")
182
+
183
  # Use session-specific directory if provided (for multi-user isolation)
184
  if session_id:
185
  chroma_dir = os.path.join(parameters.CHROMA_DB_PATH, f"session_{session_id}")
 
194
  manifest = load_manifest(manifest_path)
195
 
196
  t_vector_start = time.time()
 
 
 
 
197
 
198
+ # Check vector store cache (reuse ChromaDB connections)
199
+ if chroma_dir in self._vector_store_cache:
200
+ logger.debug(f"Reusing cached vector store connection for {chroma_dir}")
201
+ vector_store = self._vector_store_cache[chroma_dir]
202
+ else:
203
+ vector_store = Chroma(
204
+ embedding_function=self.embeddings,
205
+ persist_directory=chroma_dir,
206
+ )
207
+ self._vector_store_cache[chroma_dir] = vector_store
208
+ logger.debug(f"Created new vector store connection for {chroma_dir}")
209
+
210
  to_add = []
211
  ids_to_add = []
212
  to_delete_ids = []
 
267
 
268
  # Create BM25 retriever
269
  t_bm25_start = time.time()
270
+
271
+ # Check BM25 cache (avoid rebuilding for same documents)
272
+ if cache_key in self._bm25_cache:
273
+ logger.debug(f"Reusing cached BM25 retriever for docset {cache_key[:8]}...")
274
+ bm25_retriever = self._bm25_cache[cache_key]
275
+ else:
276
+ texts = [doc.page_content for doc in docs]
277
+ metadatas = [doc.metadata for doc in docs]
278
+ bm25_retriever = BM25Retriever.from_texts(texts=texts, metadatas=metadatas)
279
+ bm25_retriever.k = parameters.BM25_SEARCH_K
280
+ self._bm25_cache[cache_key] = bm25_retriever
281
+ logger.debug(f"Created new BM25 retriever for docset {cache_key[:8]}...")
282
+
283
  t_bm25_end = time.time()
284
  logger.info(f"[PROFILE] BM25 retriever creation: {t_bm25_end - t_bm25_start:.2f}s")
285
+ logger.debug(f"BM25 indexed {len(docs)} texts, k={bm25_retriever.k}")
286
 
287
  t_vec_retr_start = time.time()
288
  vector_retriever = vector_store.as_retriever(
289
  search_type="mmr",
290
  search_kwargs={
291
+ "k": parameters.VECTOR_SEARCH_K_CHROMA,
292
  "fetch_k": parameters.VECTOR_FETCH_K,
293
  "lambda_mult": 0.7,
294
  },
 
300
  t_ensemble_start = time.time()
301
  hybrid_retriever = EnsembleRetriever(
302
  retrievers=[bm25_retriever, vector_retriever],
303
+ weights=parameters.HYBRID_RETRIEVER_WEIGHTS,
304
  k=parameters.VECTOR_SEARCH_K,
305
  )
306
  t_ensemble_end = time.time()
 
308
  logger.info(f"Hybrid retriever created (k={parameters.VECTOR_SEARCH_K})")
309
  logger.info(f"[PROFILE] Total hybrid retriever build: {t_ensemble_end - t_vector_start:.2f}s")
310
 
311
+ # Cache the complete retriever for future use
312
+ self._retriever_cache[cache_key] = hybrid_retriever
313
+ logger.debug(f"Cached retriever for docset {cache_key[:8]}... (future requests will be instant)")
314
+
315
  return hybrid_retriever