vikramvasudevan committed on
Commit
3e6639b
·
verified ·
1 Parent(s): d3f9591

Upload folder using huggingface_hub

Browse files
Files changed (5) hide show
  1. .gitignore +2 -1
  2. config.py +40 -0
  3. db.py +25 -0
  4. modules/config/divya_prabandham.py +39 -1
  5. server.py +89 -1
.gitignore CHANGED
@@ -21,4 +21,5 @@ chromadb-store_BLOATED.zip
21
  chromadb-store_OLD.zip
22
  chromadb-store-20251205.zip
23
  chromadb-store-20250612.zip
24
- chromadb-store-bkp-20251212/
 
 
21
  chromadb-store_OLD.zip
22
  chromadb-store-20251205.zip
23
  chromadb-store-20250612.zip
24
+ chromadb-store-bkp-20251212/
25
+ cache/
config.py CHANGED
@@ -3,7 +3,11 @@ from typing import List, Dict
3
 
4
  from modules.config import scripture_configurations
5
  from modules.languages.transliterator import fn_transliterate
 
6
 
 
 
 
7
 
8
  class SanatanConfig:
9
  dbStorePath: str = "./chromadb-store"
@@ -97,6 +101,7 @@ class SanatanConfig:
97
  "chapter_name",
98
  "relative_path",
99
  "location",
 
100
  }
101
 
102
  config = next((s for s in self.scriptures if s["name"] == scripture_name), None)
@@ -135,6 +140,13 @@ class SanatanConfig:
135
  if key in allowed_keys:
136
  canonical_doc[key] = resolve_field(field)
137
 
 
 
 
 
 
 
 
138
  # Add standard fields from config
139
  canonical_doc["scripture_name"] = config.get("name")
140
  canonical_doc["scripture_title"] = config.get("title")
@@ -168,6 +180,34 @@ class SanatanConfig:
168
  collection_name = config.get("collection_name")
169
  return collection_name
170
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
171
 
172
  if __name__ == "__main__":
173
  print(SanatanConfig.scriptures)
 
3
 
4
  from modules.config import scripture_configurations
5
  from modules.languages.transliterator import fn_transliterate
6
+ import logging
7
 
8
+ logging.basicConfig()
9
+ logger = logging.getLogger(__name__)
10
+ logger.setLevel(logging.INFO)
11
 
12
  class SanatanConfig:
13
  dbStorePath: str = "./chromadb-store"
 
101
  "chapter_name",
102
  "relative_path",
103
  "location",
104
+ "hierarchy"
105
  }
106
 
107
  config = next((s for s in self.scriptures if s["name"] == scripture_name), None)
 
140
  if key in allowed_keys:
141
  canonical_doc[key] = resolve_field(field)
142
 
143
+ # Ensure hierarchy is always a list for uniform processing in the script
144
+ if "hierarchy" not in canonical_doc or not canonical_doc["hierarchy"]:
145
+ # Default fallback to just chapter_name if no hierarchy defined
146
+ canonical_doc["hierarchy"] = [
147
+ {"type": "chapter", "name": canonical_doc.get("chapter_name", "General")}
148
+ ]
149
+
150
  # Add standard fields from config
151
  canonical_doc["scripture_name"] = config.get("name")
152
  canonical_doc["scripture_title"] = config.get("title")
 
180
  collection_name = config.get("collection_name")
181
  return collection_name
182
 
183
def get_hierarchy_structure(self, scripture_name: str) -> List[str]:
    """
    Dynamically determine the hierarchy level types for a scripture by
    dry-running its ``hierarchy`` field-mapping lambda against a mock document.

    Args:
        scripture_name: Name of the scripture whose config to inspect.

    Returns:
        A list of level type names (e.g. ``["prabandham", "decade", "padigam"]``);
        falls back to ``["chapter"]`` when no usable hierarchy mapping exists
        or when inspection fails for any reason.
    """
    try:
        config = self.get_scripture_by_name(scripture_name)
        # Guard against a missing scripture: keep the None-config case inside
        # the try so it also resolves to the ["chapter"] fallback.
        hierarchy_fn = (config or {}).get("field_mapping", {}).get("hierarchy")

        if not callable(hierarchy_fn):
            # No hierarchy mapping defined for this scripture.
            return ["chapter"]

        # Mock document: any key lookup yields a harmless string so the
        # lambda's .get() calls never crash or hit falsy branches. __missing__
        # covers direct doc["key"] subscript access as well.
        class MockDoc(dict):
            def get(self, key, default=None):
                return "mock_val"

            def __missing__(self, key):
                return "mock_val"

        sample_hierarchy = hierarchy_fn(MockDoc())
        # Extract 'type' from each level, ignoring None placeholder entries.
        types = [
            level["type"] for level in sample_hierarchy if level and "type" in level
        ]
        # An all-placeholder result is as useless as no mapping at all.
        return types or ["chapter"]
    except Exception as e:
        logger.error(f"Error detecting hierarchy for {scripture_name}: {e}")
        return ["chapter"]
210
+
211
 
212
  if __name__ == "__main__":
213
  print(SanatanConfig.scriptures)
db.py CHANGED
@@ -1037,3 +1037,28 @@ class SanatanDatabase:
1037
  nalayiram_helper.delete_taniyan(
1038
  self.get_collection_cached("divya_prabandham")
1039
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1037
  nalayiram_helper.delete_taniyan(
1038
  self.get_collection_cached("divya_prabandham")
1039
  )
1040
+
1041
def fetch_all_metadata(self, collection_name: str):
    """
    Fetch every metadata entry for a given collection.

    Used to build the Table of Contents hierarchy. Only ``metadatas`` are
    requested (no documents or embeddings) to keep the payload small and fast.

    Args:
        collection_name: Name of the Chroma collection to read.

    Returns:
        A list of metadata dicts, or ``[]`` if anything goes wrong.
    """
    logger.info("fetch_all_metadata: Fetching all metadata for [%s]", collection_name)
    try:
        # Inside the try so a failed collection lookup also degrades to [],
        # matching this method's "never raise" contract.
        collection = self.get_collection_cached(name=collection_name)

        # Note: Chroma's .get() without ids/limit fetches the whole
        # collection; include=["metadatas"] skips documents and embeddings.
        results = collection.get(include=["metadatas"])

        # `or []` guards against Chroma returning None for excluded fields.
        metadatas = results.get("metadatas") or []
        logger.info(
            "fetch_all_metadata: Successfully retrieved %d metadata records",
            len(metadatas),
        )
        return metadatas

    except Exception as e:
        logger.error(
            "Error in fetch_all_metadata for %s: %s", collection_name, e, exc_info=True
        )
        return []
modules/config/divya_prabandham.py CHANGED
@@ -16,7 +16,32 @@ divya_prabandham_config = {
16
  "unit_field": "verse",
17
  "field_mapping": {
18
  "text": "pasuram_ta",
19
- "title": lambda doc: f"{doc.get('prabandham_name','')} {doc.get('chapter','')}-{doc.get('decade','')}:{doc.get('position_in_chapter','')}",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
  "location": "divya_desams",
21
  "word_by_word_native": "wbw_ta",
22
  "unit_index": "verse",
@@ -55,6 +80,19 @@ divya_prabandham_config = {
55
  ],
56
  )
57
  ),
 
 
 
 
 
 
 
 
 
 
 
 
 
58
  },
59
  "metadata_fields": [
60
  {
 
16
  "unit_field": "verse",
17
  "field_mapping": {
18
  "text": "pasuram_ta",
19
+ "chapter_name": "prabandham_name",
20
+ "title": lambda doc: (
21
+ f"{doc.get('prabandham_name', '')}:"
22
+ + ".".join(
23
+ filter(
24
+ None,
25
+ [
26
+ (
27
+ str(doc.get("decade"))
28
+ if doc.get("decade") not in [None, -1, "-1"]
29
+ else None
30
+ ),
31
+ (
32
+ str(doc.get("chapter"))
33
+ if doc.get("chapter") not in [None, -1, "-1"]
34
+ else None
35
+ ),
36
+ (
37
+ str(doc.get("position_in_chapter"))
38
+ if doc.get("position_in_chapter") not in [None, -1, "-1"]
39
+ else None
40
+ ),
41
+ ],
42
+ )
43
+ )
44
+ ).strip(":"),
45
  "location": "divya_desams",
46
  "word_by_word_native": "wbw_ta",
47
  "unit_index": "verse",
 
80
  ],
81
  )
82
  ),
83
+ "hierarchy": lambda doc: [
84
+ {"type": "prabandham", "name": doc.get("prabandham_name")},
85
+ (
86
+ {"type": "decade", "name": f"Decade {doc.get('decade')}"}
87
+ if doc.get("decade") not in [None, -1, "-1"]
88
+ else None
89
+ ),
90
+ (
91
+ {"type": "padigam", "name": f"Padigam {doc.get('chapter')}"}
92
+ if doc.get("chapter") not in [None, -1, "-1"]
93
+ else None
94
+ ),
95
+ ],
96
  },
97
  "metadata_fields": [
98
  {
server.py CHANGED
@@ -3,6 +3,7 @@ import json
3
  import os
4
  from pathlib import Path
5
  import random
 
6
  import traceback
7
  from typing import Optional
8
  import uuid
@@ -700,4 +701,91 @@ async def get_translations():
700
  except FileNotFoundError:
701
  raise HTTPException(status_code=404, detail="translations.json not found")
702
  except json.JSONDecodeError:
703
- raise HTTPException(status_code=500, detail="Invalid JSON format")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  import os
4
  from pathlib import Path
5
  import random
6
+ import re
7
  import traceback
8
  from typing import Optional
9
  import uuid
 
701
  except FileNotFoundError:
702
  raise HTTPException(status_code=404, detail="translations.json not found")
703
  except json.JSONDecodeError:
704
+ raise HTTPException(status_code=500, detail="Invalid JSON format")
705
+
706
TOC_CACHE_DIR = Path("./cache/toc")
TOC_CACHE_DIR.mkdir(parents=True, exist_ok=True)


@router.get("/scripture/{scripture_name}/toc")
async def get_scripture_toc(scripture_name: str, force_refresh: bool = False):
    """
    Build (and cache to disk) the Table of Contents tree for a scripture.

    The tree is derived purely from collection metadata via the scripture's
    ``hierarchy`` field-mapping lambda — the whole document is never
    canonicalized, which keeps this endpoint fast.

    Args:
        scripture_name: Path parameter naming the scripture.
        force_refresh: When True, bypass the on-disk cache and rebuild.

    Raises:
        HTTPException 404 if the scripture is unknown.
    """
    # Sanitize the user-controlled path parameter before using it in a file
    # path, so a crafted name cannot escape the cache directory.
    safe_name = re.sub(r"[^A-Za-z0-9_\-]", "_", scripture_name)
    cache_file = TOC_CACHE_DIR / f"{safe_name}.json"

    # 1. Serve from cache when present and a refresh is not forced.
    #    (Optional: check file age here if the DB is updated frequently.)
    if not force_refresh and cache_file.exists():
        with open(cache_file, "r", encoding="utf-8") as f:
            return json.load(f)

    config_service = SanatanConfig()
    config = config_service.get_scripture_by_name(scripture_name)
    if not config:
        raise HTTPException(
            status_code=404, detail=f"Unknown scripture: {scripture_name}"
        )

    # Hoisted out of the per-record loop: the hierarchy lambda is a property
    # of the scripture, not of each record. Bail out early when absent.
    hierarchy_fn = config.get("field_mapping", {}).get("hierarchy")
    if not callable(hierarchy_fn):
        return {"toc": []}

    db = SanatanDatabase()

    # 2. Fetch all metadata (fast: no documents/embeddings).
    records = db.fetch_all_metadata(config["collection_name"])

    # 3. Build the tree using nested dicts for O(1) node lookups, instead of
    #    slow linear next(...) scans over child lists.
    root_nodes = {}
    for meta in records:
        hierarchy_data = hierarchy_fn(meta)
        global_index = meta.get("_global_index")

        curr_level = root_nodes
        for entry in hierarchy_data:
            if entry is None:
                continue

            node_name = entry.get("name")
            if not node_name:
                continue

            node = curr_level.get(node_name)
            if node is None:
                node = {
                    "name": node_name,
                    "type": entry.get("type"),
                    "first_index": global_index,
                    "children": {},
                }
                curr_level[node_name] = node

            # Track the earliest global index observed for this node.
            if global_index is not None and (
                node["first_index"] is None or global_index < node["first_index"]
            ):
                node["first_index"] = global_index

            curr_level = node["children"]

    # 4. Convert nested dicts back to lists and sort.
    def natural_sort_key(s):
        """Sort strings with embedded numbers naturally: 'Chapter 2' < 'Chapter 10'."""
        return [
            int(text) if text.isdigit() else text.lower()
            for text in re.split(r"([0-9]+)", str(s))
        ]

    def dict_to_sorted_list(node_dict):
        nodes = []
        for node in node_dict.values():
            node["children"] = dict_to_sorted_list(node["children"])
            nodes.append(node)
        # Siblings are sorted by name (natural order), not by index.
        nodes.sort(key=lambda x: natural_sort_key(x["name"]))
        return nodes

    response_data = {"toc": dict_to_sorted_list(root_nodes)}

    # 5. Persist to the on-disk cache for subsequent requests.
    with open(cache_file, "w", encoding="utf-8") as f:
        json.dump(response_data, f, ensure_ascii=False)

    return response_data