vikramvasudevan committed on
Commit
3e6639b
·
verified ·
1 Parent(s): d3f9591

Upload folder using huggingface_hub

Browse files
Files changed (5) hide show
  1. .gitignore +2 -1
  2. config.py +40 -0
  3. db.py +25 -0
  4. modules/config/divya_prabandham.py +39 -1
  5. server.py +89 -1
.gitignore CHANGED
@@ -21,4 +21,5 @@ chromadb-store_BLOATED.zip
21
  chromadb-store_OLD.zip
22
  chromadb-store-20251205.zip
23
  chromadb-store-20250612.zip
24
- chromadb-store-bkp-20251212/
 
 
21
  chromadb-store_OLD.zip
22
  chromadb-store-20251205.zip
23
  chromadb-store-20250612.zip
24
+ chromadb-store-bkp-20251212/
25
+ cache/
config.py CHANGED
@@ -3,7 +3,11 @@ from typing import List, Dict
3
 
4
  from modules.config import scripture_configurations
5
  from modules.languages.transliterator import fn_transliterate
 
6
 
 
 
 
7
 
8
  class SanatanConfig:
9
  dbStorePath: str = "./chromadb-store"
@@ -97,6 +101,7 @@ class SanatanConfig:
97
  "chapter_name",
98
  "relative_path",
99
  "location",
 
100
  }
101
 
102
  config = next((s for s in self.scriptures if s["name"] == scripture_name), None)
@@ -135,6 +140,13 @@ class SanatanConfig:
135
  if key in allowed_keys:
136
  canonical_doc[key] = resolve_field(field)
137
 
 
 
 
 
 
 
 
138
  # Add standard fields from config
139
  canonical_doc["scripture_name"] = config.get("name")
140
  canonical_doc["scripture_title"] = config.get("title")
@@ -168,6 +180,34 @@ class SanatanConfig:
168
  collection_name = config.get("collection_name")
169
  return collection_name
170
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
171
 
172
  if __name__ == "__main__":
173
  print(SanatanConfig.scriptures)
 
3
 
4
  from modules.config import scripture_configurations
5
  from modules.languages.transliterator import fn_transliterate
6
+ import logging
7
 
8
+ logging.basicConfig()
9
+ logger = logging.getLogger(__name__)
10
+ logger.setLevel(logging.INFO)
11
 
12
  class SanatanConfig:
13
  dbStorePath: str = "./chromadb-store"
 
101
  "chapter_name",
102
  "relative_path",
103
  "location",
104
+ "hierarchy"
105
  }
106
 
107
  config = next((s for s in self.scriptures if s["name"] == scripture_name), None)
 
140
  if key in allowed_keys:
141
  canonical_doc[key] = resolve_field(field)
142
 
143
+ # Ensure hierarchy is always a list for uniform processing in the script
144
+ if "hierarchy" not in canonical_doc or not canonical_doc["hierarchy"]:
145
+ # Default fallback to just chapter_name if no hierarchy defined
146
+ canonical_doc["hierarchy"] = [
147
+ {"type": "chapter", "name": canonical_doc.get("chapter_name", "General")}
148
+ ]
149
+
150
  # Add standard fields from config
151
  canonical_doc["scripture_name"] = config.get("name")
152
  canonical_doc["scripture_title"] = config.get("title")
 
180
  collection_name = config.get("collection_name")
181
  return collection_name
182
 
183
def get_hierarchy_structure(self, scripture_name: str) -> List[str]:
    """
    Dynamically determine the hierarchy level types for a scripture by
    dry-running its ``hierarchy`` field-mapping lambda against a mock document.

    Args:
        scripture_name: Name of the scripture whose config to inspect.

    Returns:
        A list of level type names (e.g. ``["prabandham", "decade", "padigam"]``);
        falls back to ``["chapter"]`` when no usable hierarchy mapping exists
        or when inspection fails for any reason.
    """
    try:
        config = self.get_scripture_by_name(scripture_name)
        # Guard against a missing scripture: keep the None-config case inside
        # the try so it also resolves to the ["chapter"] fallback.
        hierarchy_fn = (config or {}).get("field_mapping", {}).get("hierarchy")

        if not callable(hierarchy_fn):
            # No hierarchy mapping defined for this scripture.
            return ["chapter"]

        # Mock document: any key lookup yields a harmless string so the
        # lambda's .get() calls never crash or hit falsy branches. __missing__
        # covers direct doc["key"] subscript access as well.
        class MockDoc(dict):
            def get(self, key, default=None):
                return "mock_val"

            def __missing__(self, key):
                return "mock_val"

        sample_hierarchy = hierarchy_fn(MockDoc())
        # Extract 'type' from each level, ignoring None placeholder entries.
        types = [
            level["type"] for level in sample_hierarchy if level and "type" in level
        ]
        # An all-placeholder result is as useless as no mapping at all.
        return types or ["chapter"]
    except Exception as e:
        logger.error(f"Error detecting hierarchy for {scripture_name}: {e}")
        return ["chapter"]
210
+
211
 
212
  if __name__ == "__main__":
213
  print(SanatanConfig.scriptures)
db.py CHANGED
@@ -1037,3 +1037,28 @@ class SanatanDatabase:
1037
  nalayiram_helper.delete_taniyan(
1038
  self.get_collection_cached("divya_prabandham")
1039
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1037
  nalayiram_helper.delete_taniyan(
1038
  self.get_collection_cached("divya_prabandham")
1039
  )
1040
+
1041
def fetch_all_metadata(self, collection_name: str):
    """
    Fetch every metadata entry for a given collection.

    Used to build the Table of Contents hierarchy. Only ``metadatas`` are
    requested (no documents or embeddings) to keep the payload small and fast.

    Args:
        collection_name: Name of the Chroma collection to read.

    Returns:
        A list of metadata dicts, or ``[]`` if anything goes wrong.
    """
    logger.info("fetch_all_metadata: Fetching all metadata for [%s]", collection_name)
    try:
        # Inside the try so a failed collection lookup also degrades to [],
        # matching this method's "never raise" contract.
        collection = self.get_collection_cached(name=collection_name)

        # Note: Chroma's .get() without ids/limit fetches the whole
        # collection; include=["metadatas"] skips documents and embeddings.
        results = collection.get(include=["metadatas"])

        # `or []` guards against Chroma returning None for excluded fields.
        metadatas = results.get("metadatas") or []
        logger.info(
            "fetch_all_metadata: Successfully retrieved %d metadata records",
            len(metadatas),
        )
        return metadatas

    except Exception as e:
        logger.error(
            "Error in fetch_all_metadata for %s: %s", collection_name, e, exc_info=True
        )
        return []
modules/config/divya_prabandham.py CHANGED
@@ -16,7 +16,32 @@ divya_prabandham_config = {
16
  "unit_field": "verse",
17
  "field_mapping": {
18
  "text": "pasuram_ta",
19
- "title": lambda doc: f"{doc.get('prabandham_name','')} {doc.get('chapter','')}-{doc.get('decade','')}:{doc.get('position_in_chapter','')}",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
  "location": "divya_desams",
21
  "word_by_word_native": "wbw_ta",
22
  "unit_index": "verse",
@@ -55,6 +80,19 @@ divya_prabandham_config = {
55
  ],
56
  )
57
  ),
 
 
 
 
 
 
 
 
 
 
 
 
 
58
  },
59
  "metadata_fields": [
60
  {
 
16
  "unit_field": "verse",
17
  "field_mapping": {
18
  "text": "pasuram_ta",
19
+ "chapter_name": "prabandham_name",
20
+ "title": lambda doc: (
21
+ f"{doc.get('prabandham_name', '')}:"
22
+ + ".".join(
23
+ filter(
24
+ None,
25
+ [
26
+ (
27
+ str(doc.get("decade"))
28
+ if doc.get("decade") not in [None, -1, "-1"]
29
+ else None
30
+ ),
31
+ (
32
+ str(doc.get("chapter"))
33
+ if doc.get("chapter") not in [None, -1, "-1"]
34
+ else None
35
+ ),
36
+ (
37
+ str(doc.get("position_in_chapter"))
38
+ if doc.get("position_in_chapter") not in [None, -1, "-1"]
39
+ else None
40
+ ),
41
+ ],
42
+ )
43
+ )
44
+ ).strip(":"),
45
  "location": "divya_desams",
46
  "word_by_word_native": "wbw_ta",
47
  "unit_index": "verse",
 
80
  ],
81
  )
82
  ),
83
+ "hierarchy": lambda doc: [
84
+ {"type": "prabandham", "name": doc.get("prabandham_name")},
85
+ (
86
+ {"type": "decade", "name": f"Decade {doc.get('decade')}"}
87
+ if doc.get("decade") not in [None, -1, "-1"]
88
+ else None
89
+ ),
90
+ (
91
+ {"type": "padigam", "name": f"Padigam {doc.get('chapter')}"}
92
+ if doc.get("chapter") not in [None, -1, "-1"]
93
+ else None
94
+ ),
95
+ ],
96
  },
97
  "metadata_fields": [
98
  {
server.py CHANGED
@@ -3,6 +3,7 @@ import json
3
  import os
4
  from pathlib import Path
5
  import random
 
6
  import traceback
7
  from typing import Optional
8
  import uuid
@@ -700,4 +701,91 @@ async def get_translations():
700
  except FileNotFoundError:
701
  raise HTTPException(status_code=404, detail="translations.json not found")
702
  except json.JSONDecodeError:
703
- raise HTTPException(status_code=500, detail="Invalid JSON format")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  import os
4
  from pathlib import Path
5
  import random
6
+ import re
7
  import traceback
8
  from typing import Optional
9
  import uuid
 
701
  except FileNotFoundError:
702
  raise HTTPException(status_code=404, detail="translations.json not found")
703
  except json.JSONDecodeError:
704
+ raise HTTPException(status_code=500, detail="Invalid JSON format")
705
+
706
TOC_CACHE_DIR = Path("./cache/toc")
TOC_CACHE_DIR.mkdir(parents=True, exist_ok=True)


@router.get("/scripture/{scripture_name}/toc")
async def get_scripture_toc(scripture_name: str, force_refresh: bool = False):
    """
    Build (and cache to disk) the Table of Contents tree for a scripture.

    The tree is derived purely from collection metadata via the scripture's
    ``hierarchy`` field-mapping lambda — the whole document is never
    canonicalized, which keeps this endpoint fast.

    Args:
        scripture_name: Path parameter naming the scripture.
        force_refresh: When True, bypass the on-disk cache and rebuild.

    Raises:
        HTTPException 404 if the scripture is unknown.
    """
    # Sanitize the user-controlled path parameter before using it in a file
    # path, so a crafted name cannot escape the cache directory.
    safe_name = re.sub(r"[^A-Za-z0-9_\-]", "_", scripture_name)
    cache_file = TOC_CACHE_DIR / f"{safe_name}.json"

    # 1. Serve from cache when present and a refresh is not forced.
    #    (Optional: check file age here if the DB is updated frequently.)
    if not force_refresh and cache_file.exists():
        with open(cache_file, "r", encoding="utf-8") as f:
            return json.load(f)

    config_service = SanatanConfig()
    config = config_service.get_scripture_by_name(scripture_name)
    if not config:
        raise HTTPException(
            status_code=404, detail=f"Unknown scripture: {scripture_name}"
        )

    # Hoisted out of the per-record loop: the hierarchy lambda is a property
    # of the scripture, not of each record. Bail out early when absent.
    hierarchy_fn = config.get("field_mapping", {}).get("hierarchy")
    if not callable(hierarchy_fn):
        return {"toc": []}

    db = SanatanDatabase()

    # 2. Fetch all metadata (fast: no documents/embeddings).
    records = db.fetch_all_metadata(config["collection_name"])

    # 3. Build the tree using nested dicts for O(1) node lookups, instead of
    #    slow linear next(...) scans over child lists.
    root_nodes = {}
    for meta in records:
        hierarchy_data = hierarchy_fn(meta)
        global_index = meta.get("_global_index")

        curr_level = root_nodes
        for entry in hierarchy_data:
            if entry is None:
                continue

            node_name = entry.get("name")
            if not node_name:
                continue

            node = curr_level.get(node_name)
            if node is None:
                node = {
                    "name": node_name,
                    "type": entry.get("type"),
                    "first_index": global_index,
                    "children": {},
                }
                curr_level[node_name] = node

            # Track the earliest global index observed for this node.
            if global_index is not None and (
                node["first_index"] is None or global_index < node["first_index"]
            ):
                node["first_index"] = global_index

            curr_level = node["children"]

    # 4. Convert nested dicts back to lists and sort.
    def natural_sort_key(s):
        """Sort strings with embedded numbers naturally: 'Chapter 2' < 'Chapter 10'."""
        return [
            int(text) if text.isdigit() else text.lower()
            for text in re.split(r"([0-9]+)", str(s))
        ]

    def dict_to_sorted_list(node_dict):
        nodes = []
        for node in node_dict.values():
            node["children"] = dict_to_sorted_list(node["children"])
            nodes.append(node)
        # Siblings are sorted by name (natural order), not by index.
        nodes.sort(key=lambda x: natural_sort_key(x["name"]))
        return nodes

    response_data = {"toc": dict_to_sorted_list(root_nodes)}

    # 5. Persist to the on-disk cache for subsequent requests.
    with open(cache_file, "w", encoding="utf-8") as f:
        json.dump(response_data, f, ensure_ascii=False)

    return response_data