vikramvasudevan committed on
Commit
4f9f712
·
verified ·
1 Parent(s): 7f4024a

Upload folder using huggingface_hub

Browse files
Files changed (2) hide show
  1. db.py +71 -0
  2. server.py +37 -26
db.py CHANGED
@@ -188,6 +188,77 @@ class SanatanDatabase:
188
  "metadatas": [all_data["metadatas"][min_index]],
189
  }
190
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
191
  def fetch_all_matches(
192
  self,
193
  collection_name: str,
 
188
  "metadatas": [all_data["metadatas"][min_index]],
189
  }
190
 
191
def count_where(
    self,
    collection_name: str,
    metadata_where_clause: MetadataWhereClause = None,
) -> int:
    """
    Count the records in a collection whose metadata matches a filter.

    Only metadata is requested from the collection (documents are not
    included). The strict ``where`` filter is tried first. If it matches
    nothing, a fallback pass scans every record's metadata and applies an
    accent-insensitive, case-insensitive substring match for each
    string-valued ``$eq`` filter; a record counts only if all of those
    filters match.

    Args:
        collection_name: Name passed to ``get_or_create_collection``.
        metadata_where_clause: Optional filter. When it is falsy, or when
            ``to_chroma_where()`` yields an empty dict, no ``where`` filter
            is sent with the strict query.

    Returns:
        The number of matching records. 0 when the strict query is empty
        and the fallback either has no usable filters or matches nothing.
    """

    def normalize_for_match(s: str) -> str:
        # Decompose to NFD and drop combining marks so that accented and
        # unaccented spellings compare equal.
        decomposed = unicodedata.normalize("NFD", s)
        return "".join(ch for ch in decomposed if not unicodedata.combining(ch))

    logger.info(
        "count_where: counting matches in [%s] | filters=%s",
        collection_name,
        metadata_where_clause,
    )

    collection = self.chroma_client.get_or_create_collection(name=collection_name)
    where_clause = (
        metadata_where_clause.to_chroma_where() if metadata_where_clause else None
    )

    # If conversion returns an empty dict, treat it as "no filter".
    if isinstance(where_clause, dict) and not where_clause:
        where_clause = None

    # Strict filter first; a non-empty result is the answer.
    data = collection.get(include=["metadatas"], where=where_clause)
    if data["metadatas"]:
        return len(data["metadatas"])

    logger.warning("count_where: No matches found with strict filter. Trying regex fallback.")

    if not metadata_where_clause or not metadata_where_clause.filters:
        return 0

    # Build each filter's pattern once: the normalised, escaped query depends
    # only on the filter, not on the record being tested.
    field_patterns = [
        (
            f.metadata_field,
            re.compile(
                re.escape(normalize_for_match(f.metadata_value)),
                flags=re.IGNORECASE,
            ),
        )
        for f in metadata_where_clause.filters
        if f.metadata_search_operator == "$eq"
        and isinstance(f.metadata_value, str)
    ]
    if not field_patterns:
        return 0

    all_data = collection.get(include=["metadatas"])

    # `meta or {}` is a defensive guard: an entry without metadata (None)
    # is treated as having no fields instead of raising AttributeError.
    return sum(
        1
        for meta in all_data["metadatas"]
        if all(
            pattern.search(normalize_for_match(str((meta or {}).get(field, ""))))
            for field, pattern in field_patterns
        )
    )
261
+
262
  def fetch_all_matches(
263
  self,
264
  collection_name: str,
server.py CHANGED
@@ -443,22 +443,23 @@ async def search_scripture_find_all_matches(
443
  - `filter_obj`: MetadataWhereClause (filters, groups, operator)
444
  - `page`: 1-based page number
445
  - `page_size`: Number of results per page
446
- - `has_audio` : optional. can take values any|none|recitation|virutham|upanyasam
447
  """
448
  filter_obj = req.filter_obj
449
- page = req.page
450
- page_size = req.page_size
451
  has_audio = req.has_audio
452
- try:
453
- logger.info(
454
- "search_scripture_find_all_matches: searching for %s with filters %s | page=%s, page_size=%s, has_audio=%s",
455
- scripture_name,
456
- filter_obj,
457
- page,
458
- page_size,
459
- has_audio,
460
- )
461
 
 
 
 
 
 
 
 
 
 
 
462
  db = SanatanDatabase()
463
  config = next(
464
  (s for s in SanatanConfig().scriptures if s["name"] == scripture_name),
@@ -467,16 +468,17 @@ async def search_scripture_find_all_matches(
467
  if not config:
468
  return {"error": f"Scripture '{scripture_name}' not found"}
469
 
470
- # 1️⃣ Fetch all matching metadata WITHOUT pagination yet
 
471
  results = db.fetch_all_matches(
472
  collection_name=config["collection_name"],
473
  metadata_where_clause=filter_obj,
474
- page=None, # Fetch all to apply audio filter
475
- page_size=None,
476
  )
477
 
478
  formatted_results = []
479
- all_indices = [] # Keep track of all _global_index
480
  for i in range(len(results["metadatas"])):
481
  doc_id = results["ids"][i]
482
  metadata_doc = results["metadatas"][i]
@@ -491,10 +493,10 @@ async def search_scripture_find_all_matches(
491
  formatted_results.append(canonical_doc)
492
  all_indices.append(canonical_doc["_global_index"])
493
 
494
- # 2️⃣ Apply has_audio filter
495
  if has_audio:
496
  if has_audio == AudioType.none:
497
- # Fetch all indices that have any audio type
498
  all_audio_indices = set()
499
  for atype in [
500
  AudioType.recitation,
@@ -505,15 +507,14 @@ async def search_scripture_find_all_matches(
505
  indices = await svc_get_indices_with_audio(scripture_name, atype)
506
  all_audio_indices.update(indices)
507
 
508
- # Keep only indices that are NOT in all_audio_indices
509
  formatted_results = [
510
  r
511
  for r in formatted_results
512
  if r["_global_index"] not in all_audio_indices
513
  ]
514
  else:
 
515
  if has_audio == AudioType.any:
516
- # Combine indices for all audio types
517
  audio_indices = set()
518
  for atype in [
519
  AudioType.recitation,
@@ -530,16 +531,26 @@ async def search_scripture_find_all_matches(
530
  await svc_get_indices_with_audio(scripture_name, has_audio)
531
  )
532
 
533
- # Keep only indices that match
534
  formatted_results = [
535
  r for r in formatted_results if r["_global_index"] in audio_indices
536
  ]
537
 
538
- # 3️⃣ Apply pagination on filtered results
539
- total_matches = len(formatted_results)
540
- start_idx = (page - 1) * page_size
541
- end_idx = start_idx + page_size
542
- paginated_results = formatted_results[start_idx:end_idx]
 
 
 
 
 
 
 
 
 
 
 
543
 
544
  return {
545
  "results": paginated_results,
 
443
  - `filter_obj`: MetadataWhereClause (filters, groups, operator)
444
  - `page`: 1-based page number
445
  - `page_size`: Number of results per page
446
+ - `has_audio`: optional. can take values any|none|recitation|virutham|upanyasam
447
  """
448
  filter_obj = req.filter_obj
449
+ page = req.page or 1
450
+ page_size = req.page_size or 20
451
  has_audio = req.has_audio
 
 
 
 
 
 
 
 
 
452
 
453
+ logger.info(
454
+ "search_scripture_find_all_matches: searching for %s | filters=%s | page=%s | page_size=%s | has_audio=%s",
455
+ scripture_name,
456
+ filter_obj,
457
+ page,
458
+ page_size,
459
+ has_audio,
460
+ )
461
+
462
+ try:
463
  db = SanatanDatabase()
464
  config = next(
465
  (s for s in SanatanConfig().scriptures if s["name"] == scripture_name),
 
468
  if not config:
469
  return {"error": f"Scripture '{scripture_name}' not found"}
470
 
471
+ # 1️⃣ Decide how much to fetch
472
+ fetch_all = has_audio is not None
473
  results = db.fetch_all_matches(
474
  collection_name=config["collection_name"],
475
  metadata_where_clause=filter_obj,
476
+ page=None if fetch_all else page,
477
+ page_size=None if fetch_all else page_size,
478
  )
479
 
480
  formatted_results = []
481
+ all_indices = []
482
  for i in range(len(results["metadatas"])):
483
  doc_id = results["ids"][i]
484
  metadata_doc = results["metadatas"][i]
 
493
  formatted_results.append(canonical_doc)
494
  all_indices.append(canonical_doc["_global_index"])
495
 
496
+ # 2️⃣ Apply audio filter only if requested
497
  if has_audio:
498
  if has_audio == AudioType.none:
499
+ # Remove anything that *has* any audio
500
  all_audio_indices = set()
501
  for atype in [
502
  AudioType.recitation,
 
507
  indices = await svc_get_indices_with_audio(scripture_name, atype)
508
  all_audio_indices.update(indices)
509
 
 
510
  formatted_results = [
511
  r
512
  for r in formatted_results
513
  if r["_global_index"] not in all_audio_indices
514
  ]
515
  else:
516
+ # Filter for specific or 'any'
517
  if has_audio == AudioType.any:
 
518
  audio_indices = set()
519
  for atype in [
520
  AudioType.recitation,
 
531
  await svc_get_indices_with_audio(scripture_name, has_audio)
532
  )
533
 
 
534
  formatted_results = [
535
  r for r in formatted_results if r["_global_index"] in audio_indices
536
  ]
537
 
538
+ # 3️⃣ Paginate *after* filtering if needed
539
+ total_matches = None
540
+ if fetch_all:
541
+ total_matches = len(formatted_results)
542
+ else:
543
+ total_matches = db.count_where(
544
+ collection_name=config["collection_name"],
545
+ metadata_where_clause=filter_obj,
546
+ )
547
+
548
+ if fetch_all:
549
+ start_idx = (page - 1) * page_size
550
+ end_idx = start_idx + page_size
551
+ paginated_results = formatted_results[start_idx:end_idx]
552
+ else:
553
+ paginated_results = formatted_results
554
 
555
  return {
556
  "results": paginated_results,