Spaces:

vikramvasudevan
/

sanatan_ai

Running

App Files Files Community

vikramvasudevan committed on Oct 9, 2025

Commit

6ad8f62

verified ·

1 Parent(s): 258ca71

Upload folder using huggingface_hub

Browse files

Files changed (4) hide show

db.py +91 -0
modules/config/divya_prabandham.py +1 -1
modules/config/sri_stavam.py +1 -1
server.py +71 -3

db.py CHANGED Viewed

@@ -170,6 +170,97 @@ class SanatanDatabase:
             "metadatas": [all_data["metadatas"][min_index]],
         }
     def search(
         self,
         collection_name: str,

             "metadatas": [all_data["metadatas"][min_index]],
         }
+    def fetch_all_matches(
+        self,
+        collection_name: str,
+        metadata_where_clause: MetadataWhereClause = None,
+        page: int = 1,
+        page_size: int = 20,
+    ):
+        """
+        Fetch one page of matching verses, sorted by _global_index ascending.
+
+        The strict Chroma ``where`` filter is tried first. If it matches nothing,
+        every string-valued ``$eq`` filter is retried as a case-insensitive,
+        diacritic-insensitive substring match over the whole collection.
+
+        Args:
+            collection_name: Collection to read (fetched via get_or_create_collection).
+            metadata_where_clause: Optional metadata filters; None applies no filter.
+            page: 1-based page number; values below 1 are treated as 1.
+            page_size: Results per page; negative values are treated as 0.
+
+        Returns:
+            Dict with "ids", "documents" and "metadatas" for the requested page,
+            plus "total_matches" counting every match before pagination.
+        """
+        def normalize_for_match(s: str) -> str:
+            # Decompose, then drop combining marks so accented and plain forms match.
+            s = unicodedata.normalize("NFD", s)
+            return "".join(ch for ch in s if not unicodedata.combining(ch))
+        def global_index_of(row):
+            # Rows with no metadata, or a missing/None _global_index, sort last.
+            value = (row[2] or {}).get("_global_index")
+            return float("inf") if value is None else value
+        logger.info(
+            "fetching all matches from [%s] | filters=%s | page=%s | page_size=%s",
+            collection_name,
+            metadata_where_clause,
+            page,
+            page_size,
+        )
+        # Negative slice bounds would silently return rows counted from the end
+        # of the result set, so clamp both values before paginating.
+        page = max(1, page)
+        page_size = max(0, page_size)
+        collection = self.chroma_client.get_or_create_collection(name=collection_name)
+        where_clause = (
+            metadata_where_clause.to_chroma_where() if metadata_where_clause else None
+        )
+        # First, try strict filter
+        data = collection.get(include=["metadatas", "documents"], where=where_clause)
+        if not data["metadatas"]:
+            # fallback regex
+            logger.warning("No data found using strict filter. Trying regex fallback.")
+            if not metadata_where_clause or not metadata_where_clause.filters:
+                return {"ids": [], "documents": [], "metadatas": [], "total_matches": 0}
+            regex_filters = [
+                f
+                for f in metadata_where_clause.filters
+                if f.metadata_search_operator == "$eq" and isinstance(f.metadata_value, str)
+            ]
+            if regex_filters:
+                # Compile each pattern once instead of re-normalizing the query per row.
+                patterns = [
+                    (
+                        f.metadata_field,
+                        re.compile(
+                            re.escape(normalize_for_match(f.metadata_value)),
+                            flags=re.IGNORECASE,
+                        ),
+                    )
+                    for f in regex_filters
+                ]
+                all_data = collection.get(include=["metadatas", "documents"])
+                # A row matches only when every filter's pattern is found in its field.
+                matched_indices = [
+                    i
+                    for i, meta in enumerate(all_data["metadatas"])
+                    if all(
+                        pattern.search(
+                            normalize_for_match(str((meta or {}).get(field, "")))
+                        )
+                        for field, pattern in patterns
+                    )
+                ]
+                data = {
+                    "ids": [all_data["ids"][i] for i in matched_indices],
+                    "documents": [all_data["documents"][i] for i in matched_indices],
+                    "metadatas": [all_data["metadatas"][i] for i in matched_indices],
+                }
+        total_matches = len(data["ids"])
+        if total_matches == 0:
+            return {"ids": [], "documents": [], "metadatas": [], "total_matches": 0}
+        # --- Sort by _global_index ascending (stable for equal keys) ---
+        combined = sorted(
+            zip(data["ids"], data["documents"], data["metadatas"]),
+            key=global_index_of,
+        )
+        # Apply pagination
+        start = (page - 1) * page_size
+        page_rows = combined[start : start + page_size]
+        return {
+            "ids": [row[0] for row in page_rows],
+            "documents": [row[1] for row in page_rows],
+            "metadatas": [row[2] for row in page_rows],
+            "total_matches": total_matches,
+        }
     def search(
         self,
         collection_name: str,

modules/config/divya_prabandham.py CHANGED Viewed

@@ -235,7 +235,7 @@ divya_prabandham_config = {
                 "url": "https://www.youtube.com/@jagadacharya2405",
                 "role": "Upanyasam video provider",
                 "context": [{"start": 474, "end": 503, "description": "Thiruppaavai"}],
-                "photo_url": "https://media.licdn.com/dms/image/v2/D5603AQEUF-JSpuLc0g/profile-displayphoto-crop_800_800/B56ZhAx3eeH0AQ-/0/1753433471696?e=1762992000&v=beta&t=tt715wrcPzRPDbpFJFOjcR89lPb7H66pMmWw6zsUPiI",
             },
             {
                 "name": "Srirangam Vikram Vasudevan",

                 "url": "https://www.youtube.com/@jagadacharya2405",
                 "role": "Upanyasam video provider",
                 "context": [{"start": 474, "end": 503, "description": "Thiruppaavai"}],
+                "photo_url": "https://drive.google.com/uc?export=download&id=1sM9-BiYRbjABJeihk5q6E2Lc5gc0_72V",
             },
             {
                 "name": "Srirangam Vikram Vasudevan",

modules/config/sri_stavam.py CHANGED Viewed

@@ -98,7 +98,7 @@ sri_stavam_config = {
                 "url": "https://www.youtube.com/@jagadacharya2405",
                 "role": "Upanyasam video provider",
                 "context": [{"start": 1, "end": 10, "description": "Sri Stavam"}],
-                "photo_url": "https://media.licdn.com/dms/image/v2/D5603AQEUF-JSpuLc0g/profile-displayphoto-crop_800_800/B56ZhAx3eeH0AQ-/0/1753433471696?e=1762992000&v=beta&t=tt715wrcPzRPDbpFJFOjcR89lPb7H66pMmWw6zsUPiI",
             },
         ],
     },

                 "url": "https://www.youtube.com/@jagadacharya2405",
                 "role": "Upanyasam video provider",
                 "context": [{"start": 1, "end": 10, "description": "Sri Stavam"}],
+                "photo_url": "https://drive.google.com/uc?export=download&id=1sM9-BiYRbjABJeihk5q6E2Lc5gc0_72V",
             },
         ],
     },

server.py CHANGED Viewed

@@ -2,9 +2,9 @@
 import json
 import random
 import traceback
-from typing import Optional
 import uuid
-from fastapi import APIRouter, Request
 from fastapi.responses import JSONResponse
 import pycountry
 from pydantic import BaseModel
@@ -286,7 +286,7 @@ async def get_scripture_configs():
 @router.post("/scripture/{scripture_name}/search")
-async def search_scripture(
     scripture_name: str,
     filter_obj: Optional[MetadataWhereClause] = None,
 ):
@@ -336,6 +336,74 @@ async def search_scripture(
         logger.error("Error while searching %s", e, exc_info=True)
         return {"error": str(e)}
 @router.post("/audio")
 async def generate_audio_urls(req: AudioRequest):

 import json
 import random
 import traceback
+from typing import List, Optional
 import uuid
+from fastapi import APIRouter, Request, Query
 from fastapi.responses import JSONResponse
 import pycountry
 from pydantic import BaseModel
 @router.post("/scripture/{scripture_name}/search")
+async def search_scripture_find_first_match(
     scripture_name: str,
     filter_obj: Optional[MetadataWhereClause] = None,
 ):
         logger.error("Error while searching %s", e, exc_info=True)
         return {"error": str(e)}
+class ScriptureMultiSearchRequest(BaseModel):
+    """Request body for the paginated `/scripture/{scripture_name}/search/all` endpoint."""
+    # Optional metadata filters; when omitted, None is forwarded to the database layer.
+    filter_obj: Optional[MetadataWhereClause] = None
+    # 1-based page number.
+    page: int = 1
+    # Number of results per page.
+    page_size: int = 20
+@router.post("/scripture/{scripture_name}/search/all")
+async def search_scripture_find_all_matches(
+    scripture_name: str,
+    req: ScriptureMultiSearchRequest
+):
+    """
+    Search scripture collection and return all matching results with pagination.
+    - `scripture_name`: Name of the scripture (looked up in SanatanConfig().scriptures)
+    - `req.filter_obj`: MetadataWhereClause (filters, groups, operator)
+    - `req.page`: 1-based page number; values below 1 are treated as 1
+    - `req.page_size`: Number of results per page; negative values are treated as 0
+
+    Returns a dict with `results`, `total_matches`, and the effective `page` and
+    `page_size`, or `{"error": ...}` when the scripture is unknown or the search fails.
+    """
+    filter_obj = req.filter_obj
+    # Clamp here so a non-positive page cannot reach the slice arithmetic in the
+    # database layer, where negative bounds would return rows from the wrong end.
+    page = max(1, req.page)
+    page_size = max(0, req.page_size)
+    try:
+        logger.info(
+            "search_scripture_find_all_matches: searching for %s with filters %s | page=%s, page_size=%s",
+            scripture_name,
+            filter_obj,
+            page,
+            page_size,
+        )
+        db = SanatanDatabase()
+        # One config instance serves both the lookup and per-row canonicalization.
+        # NOTE(review): assumes SanatanConfig() is stateless — confirm.
+        sanatan_config = SanatanConfig()
+        config = next(
+            (s for s in sanatan_config.scriptures if s["name"] == scripture_name),
+            None,
+        )
+        if not config:
+            return {"error": f"Scripture '{scripture_name}' not found"}
+        results = db.fetch_all_matches(
+            collection_name=config["collection_name"],
+            metadata_where_clause=filter_obj,
+            page=page,
+            page_size=page_size,
+        )
+        # Flatten + canonicalize results
+        metadatas = results["metadatas"]
+        # If no documents came back, pair every row with None.
+        documents = results.get("documents") or [None] * len(metadatas)
+        formatted_results = []
+        for doc_id, document_text, metadata in zip(results["ids"], documents, metadatas):
+            # Copy so the id can be attached even when the stored metadata is None.
+            metadata_doc = {**(metadata or {}), "id": doc_id}
+            formatted_results.append(
+                sanatan_config.canonicalize_document(
+                    scripture_name, document_text, metadata_doc
+                )
+            )
+        return {
+            "results": formatted_results,
+            "total_matches": results.get("total_matches", 0),
+            "page": page,
+            "page_size": page_size,
+        }
+    except Exception as e:
+        logger.error("Error while searching %s", e, exc_info=True)
+        return {"error": str(e)}
 @router.post("/audio")
 async def generate_audio_urls(req: AudioRequest):