vikramvasudevan committed on
Commit
6ad8f62
·
verified ·
1 Parent(s): 258ca71

Upload folder using huggingface_hub

Browse files
db.py CHANGED
@@ -170,6 +170,97 @@ class SanatanDatabase:
170
  "metadatas": [all_data["metadatas"][min_index]],
171
  }
172
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
173
  def search(
174
  self,
175
  collection_name: str,
 
170
  "metadatas": [all_data["metadatas"][min_index]],
171
  }
172
 
173
def fetch_all_matches(
    self,
    collection_name: str,
    metadata_where_clause: MetadataWhereClause = None,
    page: int = 1,
    page_size: int = 20,
):
    """
    Fetch every record of a collection that matches a metadata filter,
    sorted by ``_global_index`` ascending, and return one page of them.

    Matching is attempted in two stages:

    1. Strict: the where-clause produced by
       ``metadata_where_clause.to_chroma_where()`` is passed to the
       collection's ``get``.
    2. Fallback (only when the strict stage returns nothing): the whole
       collection is loaded and filtered in Python. Only entries of
       ``metadata_where_clause.filters`` whose operator is ``"$eq"`` and
       whose value is a string take part; every one of them must be found
       as an accent-insensitive, case-insensitive substring of the
       corresponding metadata field. Other filters are not consulted in
       this stage.

    Args:
        collection_name: Name of the collection to read (created if it
            does not exist, because ``get_or_create_collection`` is used).
        metadata_where_clause: Optional filter; ``None`` selects all records.
        page: 1-based page number. Values below 1 are treated as 1.
        page_size: Maximum number of records per page. Negative values
            are treated as 0 (an empty page).

    Returns:
        A dict with the keys ``ids``, ``documents`` and ``metadatas``
        (lists holding the requested page) and ``total_matches`` (number
        of matches before pagination).
    """

    def normalize_for_match(s: str) -> str:
        # Decompose to NFD and drop combining marks so that accented and
        # unaccented spellings compare equal.
        decomposed = unicodedata.normalize("NFD", s)
        return "".join(ch for ch in decomposed if not unicodedata.combining(ch))

    logger.info(
        "fetching all matches from [%s] | filters=%s | page=%s | page_size=%s",
        collection_name,
        metadata_where_clause,
        page,
        page_size,
    )

    # Guard against non-positive pagination values: a negative slice bound
    # would otherwise silently return records counted from the end.
    page = max(1, page)
    page_size = max(0, page_size)

    collection = self.chroma_client.get_or_create_collection(name=collection_name)
    where_clause = (
        metadata_where_clause.to_chroma_where() if metadata_where_clause else None
    )

    # First, try strict filter
    data = collection.get(include=["metadatas", "documents"], where=where_clause)

    if not data["metadatas"]:
        # fallback regex
        logger.warning("No data found using strict filter. Trying regex fallback.")

        if not metadata_where_clause or not metadata_where_clause.filters:
            return {"ids": [], "documents": [], "metadatas": [], "total_matches": 0}

        regex_filters = [
            f
            for f in metadata_where_clause.filters
            if f.metadata_search_operator == "$eq" and isinstance(f.metadata_value, str)
        ]

        if regex_filters:
            # The normalised query and its pattern do not depend on the
            # record being tested, so build them once per filter.
            field_patterns = [
                (
                    f.metadata_field,
                    re.compile(
                        re.escape(normalize_for_match(f.metadata_value)),
                        flags=re.IGNORECASE,
                    ),
                )
                for f in regex_filters
            ]

            all_data = collection.get(include=["metadatas", "documents"])
            matched_indices = [
                i
                for i, meta in enumerate(all_data["metadatas"])
                if all(
                    # `meta or {}` tolerates records stored without metadata.
                    pattern.search(
                        normalize_for_match(str((meta or {}).get(field, "")))
                    )
                    for field, pattern in field_patterns
                )
            ]

            data = {
                "ids": [all_data["ids"][i] for i in matched_indices],
                "documents": [all_data["documents"][i] for i in matched_indices],
                "metadatas": [all_data["metadatas"][i] for i in matched_indices],
            }

    total_matches = len(data["ids"])
    if total_matches == 0:
        return {"ids": [], "documents": [], "metadatas": [], "total_matches": 0}

    # --- Sort by _global_index ascending ---
    # Records without a _global_index sort last (key defaults to +inf).
    rows = sorted(
        zip(data["ids"], data["documents"], data["metadatas"]),
        key=lambda row: (row[2] or {}).get("_global_index", float("inf")),
    )

    # Apply pagination
    start = (page - 1) * page_size
    page_rows = rows[start : start + page_size]

    return {
        "ids": [row[0] for row in page_rows],
        "documents": [row[1] for row in page_rows],
        "metadatas": [row[2] for row in page_rows],
        "total_matches": total_matches,
    }
262
+
263
+
264
  def search(
265
  self,
266
  collection_name: str,
modules/config/divya_prabandham.py CHANGED
@@ -235,7 +235,7 @@ divya_prabandham_config = {
235
  "url": "https://www.youtube.com/@jagadacharya2405",
236
  "role": "Upanyasam video provider",
237
  "context": [{"start": 474, "end": 503, "description": "Thiruppaavai"}],
238
- "photo_url": "https://media.licdn.com/dms/image/v2/D5603AQEUF-JSpuLc0g/profile-displayphoto-crop_800_800/B56ZhAx3eeH0AQ-/0/1753433471696?e=1762992000&v=beta&t=tt715wrcPzRPDbpFJFOjcR89lPb7H66pMmWw6zsUPiI",
239
  },
240
  {
241
  "name": "Srirangam Vikram Vasudevan",
 
235
  "url": "https://www.youtube.com/@jagadacharya2405",
236
  "role": "Upanyasam video provider",
237
  "context": [{"start": 474, "end": 503, "description": "Thiruppaavai"}],
238
+ "photo_url": "https://drive.google.com/uc?export=download&id=1sM9-BiYRbjABJeihk5q6E2Lc5gc0_72V",
239
  },
240
  {
241
  "name": "Srirangam Vikram Vasudevan",
modules/config/sri_stavam.py CHANGED
@@ -98,7 +98,7 @@ sri_stavam_config = {
98
  "url": "https://www.youtube.com/@jagadacharya2405",
99
  "role": "Upanyasam video provider",
100
  "context": [{"start": 1, "end": 10, "description": "Sri Stavam"}],
101
- "photo_url": "https://media.licdn.com/dms/image/v2/D5603AQEUF-JSpuLc0g/profile-displayphoto-crop_800_800/B56ZhAx3eeH0AQ-/0/1753433471696?e=1762992000&v=beta&t=tt715wrcPzRPDbpFJFOjcR89lPb7H66pMmWw6zsUPiI",
102
  },
103
  ],
104
  },
 
98
  "url": "https://www.youtube.com/@jagadacharya2405",
99
  "role": "Upanyasam video provider",
100
  "context": [{"start": 1, "end": 10, "description": "Sri Stavam"}],
101
+ "photo_url": "https://drive.google.com/uc?export=download&id=1sM9-BiYRbjABJeihk5q6E2Lc5gc0_72V",
102
  },
103
  ],
104
  },
server.py CHANGED
@@ -2,9 +2,9 @@
2
  import json
3
  import random
4
  import traceback
5
- from typing import Optional
6
  import uuid
7
- from fastapi import APIRouter, Request
8
  from fastapi.responses import JSONResponse
9
  import pycountry
10
  from pydantic import BaseModel
@@ -286,7 +286,7 @@ async def get_scripture_configs():
286
 
287
 
288
  @router.post("/scripture/{scripture_name}/search")
289
- async def search_scripture(
290
  scripture_name: str,
291
  filter_obj: Optional[MetadataWhereClause] = None,
292
  ):
@@ -336,6 +336,74 @@ async def search_scripture(
336
  logger.error("Error while searching %s", e, exc_info=True)
337
  return {"error": str(e)}
338
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
339
 
340
  @router.post("/audio")
341
  async def generate_audio_urls(req: AudioRequest):
 
2
  import json
3
  import random
4
  import traceback
5
+ from typing import List, Optional
6
  import uuid
7
+ from fastapi import APIRouter, Request, Query
8
  from fastapi.responses import JSONResponse
9
  import pycountry
10
  from pydantic import BaseModel
 
286
 
287
 
288
  @router.post("/scripture/{scripture_name}/search")
289
+ async def search_scripture_find_first_match(
290
  scripture_name: str,
291
  filter_obj: Optional[MetadataWhereClause] = None,
292
  ):
 
336
  logger.error("Error while searching %s", e, exc_info=True)
337
  return {"error": str(e)}
338
 
339
class ScriptureMultiSearchRequest(BaseModel):
    """Request body for the paginated "find all matches" scripture search."""

    # Metadata filter to apply; when omitted, no filter is sent.
    filter_obj: Optional[MetadataWhereClause] = None
    # 1-based page number to return.
    page: int = 1
    # Number of results per page.
    page_size: int = 20
343
+
344
@router.post("/scripture/{scripture_name}/search/all")
async def search_scripture_find_all_matches(
    scripture_name: str,
    req: ScriptureMultiSearchRequest,
):
    """
    Search scripture collection and return all matching results with pagination.
    - `scripture_name`: Name of the scripture whose collection is searched
    - `req.filter_obj`: MetadataWhereClause (filters, groups, operator)
    - `req.page`: 1-based page number (values below 1 are treated as 1)
    - `req.page_size`: Number of results per page (negative values are treated as 0)

    Returns a dict with `results` (canonicalized documents for the page),
    `total_matches`, and the effective `page` / `page_size`. On failure,
    or when the scripture is unknown, returns `{"error": <message>}`.
    """
    filter_obj = req.filter_obj
    # Normalise pagination up front so that negative values cannot become
    # negative slice bounds downstream, and so the response echoes the
    # values that were actually applied.
    page = max(1, req.page)
    page_size = max(0, req.page_size)
    try:
        logger.info(
            "search_scripture_find_all_matches: searching for %s with filters %s | page=%s, page_size=%s",
            scripture_name,
            filter_obj,
            page,
            page_size,
        )

        db = SanatanDatabase()
        # One config instance serves both the lookup and the per-row
        # canonicalization instead of constructing a new one per result.
        sanatan_config = SanatanConfig()
        config = next(
            (s for s in sanatan_config.scriptures if s["name"] == scripture_name),
            None,
        )
        if not config:
            return {"error": f"Scripture '{scripture_name}' not found"}

        results = db.fetch_all_matches(
            collection_name=config["collection_name"],
            metadata_where_clause=filter_obj,
            page=page,
            page_size=page_size,
        )

        # Flatten + canonicalize results
        metadatas = results["metadatas"]
        # When no documents were returned, pair every row with None.
        documents = results.get("documents") or [None] * len(metadatas)

        formatted_results = []
        for doc_id, document_text, metadata in zip(results["ids"], documents, metadatas):
            # Copy (and tolerate a missing metadata entry) before attaching the id.
            metadata_doc = dict(metadata or {})
            metadata_doc["id"] = doc_id
            formatted_results.append(
                sanatan_config.canonicalize_document(
                    scripture_name, document_text, metadata_doc
                )
            )

        return {
            "results": formatted_results,
            "total_matches": results.get("total_matches", 0),
            "page": page,
            "page_size": page_size,
        }

    except Exception as e:
        # Top-level endpoint boundary: log with traceback and report the
        # message to the client in the same shape as the other error paths.
        logger.error("Error while searching %s", e, exc_info=True)
        return {"error": str(e)}
407
 
408
  @router.post("/audio")
409
  async def generate_audio_urls(req: AudioRequest):