# sanatan_ai / nalayiram_helper.py
# Uploaded by vikramvasudevan via huggingface_hub (commit c3c8276, verified; 5.73 kB)
import json
from dataclasses import dataclass
from collections import defaultdict
import logging
# Module-level logger setup. basicConfig() attaches a default stream handler
# to the root logger (no-op if logging was already configured elsewhere).
logging.basicConfig()
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
@dataclass
class Pasuram:
    """One row of data/azhwars.json: a prabandham and the azhwar who authored it.

    NOTE(review): despite the class name, the fields model a prabandham entry,
    not an individual pasuram (verse) -- confirm the intended naming.
    """

    prabandham_code: str  # short identifier for the prabandham (1st column)
    azhwar_name: str  # author (azhwar) of the prabandham (2nd column)
    prabandham_name: str  # display name (3rd column); used as the sort key
def get_standardized_prabandham_names() -> list[Pasuram]:
    """Return one Pasuram record per row of data/azhwars.json.

    Records pair each prabandham of the divya_prabandham with the azhwar who
    authored it, ordered by the prabandham name (3rd column, index 2).
    """
    with open("./data/azhwars.json", "r", encoding="utf-8") as f:
        table = json.load(f)
    # First row is the header (field names); the rest are data rows.
    columns, records = table[0], table[1:]
    ordered = sorted(records, key=lambda record: record[2])
    return [Pasuram(**dict(zip(columns, record))) for record in ordered]
def get_standardized_azhwar_names() -> list[str]:
    """Return the sorted, de-duplicated azhwar names from data/azhwars.json."""
    with open("./data/azhwars.json", "r", encoding="utf-8") as f:
        table = json.load(f)
    # Skip the header row; the 2nd column (index 1) holds the azhwar name.
    unique_names = {record[1] for record in table[1:]}
    return sorted(unique_names)
def get_standardized_divya_desam_names() -> list[str]:
    """Return the sorted, de-duplicated divya desam titles.

    Reads data/divya_desams.json (a Next.js-style payload with hits under
    pageProps.hits) and collects each hit's "title" field.
    """
    with open("./data/divya_desams.json", "r", encoding="utf-8") as f:
        payload = json.load(f)
    # Fields of interest per hit; absent keys are simply skipped.
    wanted = (
        "title",
        "other_names",
        "name_ta",
        "alwars",
        "area",
        "state",
        "thirukolam",
        "direction",
        "sampradayam",
        "divya_desam",
    )
    titles = set()
    for hit in payload["pageProps"]["hits"]:
        record = {field: hit[field] for field in wanted if field in hit}
        titles.add(record["title"])
    return sorted(titles)
def reorder_taniyan(collection):
    """Renumber records so taniyans come first within each prabandham.

    For every ``prabandham_code`` group, taniyan sections (``section_type``
    starting with "taniyan") are re-versed 1..k, and a fresh ``_global_index``
    is assigned sequentially across the whole collection: taniyans first in
    each group, then the remaining sections in their original order.

    Args:
        collection: a vector-store collection exposing
            ``get(include=["metadatas"])`` returning
            ``{"ids": [...], "metadatas": [...]}`` and
            ``update(ids=..., metadatas=...)``.
    """
    logger.info("reorder_taniyan: started")
    # Fetch all docs with ids + metadatas
    data = collection.get(include=["metadatas"])
    ids = data.get("ids", [])
    metas = data.get("metadatas", [])
    if not ids or not metas:
        logger.warning("reorder_taniyan: no data found in collection")
        return

    # Shared sort key: records missing _global_index sort last instead of
    # raising. FIX: the per-group sorts below previously indexed
    # m["_global_index"] directly and raised KeyError on data the global
    # sort tolerated via .get(..., inf).
    def _gidx(meta):
        return meta.get("_global_index", float("inf"))

    # sort globally by current _global_index
    records = sorted(enumerate(metas), key=lambda x: _gidx(x[1]))
    # group by prabandham_code; records without one are skipped entirely
    grouped = defaultdict(list)
    for i, meta in records:
        prabandham = meta.get("prabandham_code")
        if prabandham:
            grouped[prabandham].append((i, meta))
    updates = []
    global_counter = 1  # running _global_index across the collection
    for prabandham, items in grouped.items():
        taniyan_items = [
            (i, m) for i, m in items if m.get("section_type", "").startswith("taniyan")
        ]
        non_taniyan_items = [
            (i, m)
            for i, m in items
            if not m.get("section_type", "").startswith("taniyan")
        ]
        if not taniyan_items and not non_taniyan_items:
            continue
        # sort both groups by original _global_index (missing -> last)
        taniyan_items.sort(key=lambda x: _gidx(x[1]))
        non_taniyan_items.sort(key=lambda x: _gidx(x[1]))
        # --- taniyans first (verse starts from 1) ---
        for verse_no, (i, meta) in enumerate(taniyan_items, start=1):
            updates.append(
                {
                    "id": ids[i],
                    "metadata": {
                        **meta,
                        "_global_index": global_counter,
                        "verse": verse_no,
                    },
                }
            )
            global_counter += 1
        # --- non-taniyans continue from their base verse ---
        if non_taniyan_items:
            # NOTE(review): assumes every non-taniyan meta carries a "verse"
            # key (raises KeyError otherwise) -- confirm upstream guarantees.
            base_verse = min(m["verse"] for _, m in non_taniyan_items)
            for offset, (i, meta) in enumerate(non_taniyan_items):
                updates.append(
                    {
                        "id": ids[i],
                        "metadata": {
                            **meta,
                            "_global_index": global_counter,
                            "verse": base_verse + offset,
                        },
                    }
                )
                global_counter += 1
    if updates:
        logger.info("reorder_taniyan: updating %d records...", len(updates))
        collection.update(
            ids=[u["id"] for u in updates],
            metadatas=[u["metadata"] for u in updates],
        )
        logger.info("reorder_taniyan: update complete.")
    else:
        logger.info("reorder_taniyan: nothing to update")
    logger.info("reorder_taniyan: finished")
def delete_taniyan(collection):
    """Delete every record whose section_type starts with "taniyan".

    Args:
        collection: a vector-store collection exposing
            ``get(include=["metadatas"])`` and ``delete(ids=...)``.
    """
    logger.info("delete_taniyan: started")
    # Fetch all docs (only ids + metadata needed)
    data = collection.get(include=["metadatas"])
    # FIX: use .get defaults (consistent with reorder_taniyan) so an empty
    # collection result is a logged no-op rather than a KeyError.
    ids = data.get("ids", [])
    metas = data.get("metadatas", [])
    # Collect ids where section_type starts with "taniyan"
    taniyan_ids = [
        ids[i]
        for i, meta in enumerate(metas)
        if meta.get("section_type", "").startswith("taniyan")
    ]
    if taniyan_ids:
        logger.info("delete_taniyan: Deleting %d taniyan records...", len(taniyan_ids))
        collection.delete(ids=taniyan_ids)
        logger.info("delete_taniyan: Deleted %d taniyan records", len(taniyan_ids))
    else:
        logger.info("delete_taniyan: No taniyan records found")
    logger.info("delete_taniyan: finished")
if __name__ == "__main__":
    # Smoke check: log the standardized azhwar list when run as a script.
    azhwar_names = get_standardized_azhwar_names()
    logger.info(azhwar_names)