# sanatan_ai / nalayiram_helper.py
# Uploaded by vikramvasudevan via huggingface_hub (commit c3c8276, verified; 5.73 kB)
import json
from dataclasses import dataclass
from collections import defaultdict
import logging
# Module-level logger setup. basicConfig() attaches a default stream handler
# to the root logger (no-op if logging was already configured elsewhere).
logging.basicConfig()
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
@dataclass
class Pasuram:
    """One row of data/azhwars.json: a prabandham and the azhwar who authored it.

    NOTE(review): despite the class name, the fields model a prabandham entry,
    not an individual pasuram (verse) -- confirm the intended naming.
    """

    prabandham_code: str  # short identifier for the prabandham (1st column)
    azhwar_name: str  # author (azhwar) of the prabandham (2nd column)
    prabandham_name: str  # display name (3rd column); used as the sort key
def get_standardized_prabandham_names() -> list[Pasuram]:
    """Return one Pasuram record per row of data/azhwars.json.

    Records pair each prabandham of the divya_prabandham with the azhwar who
    authored it, ordered by the prabandham name (3rd column, index 2).
    """
    with open("./data/azhwars.json", "r", encoding="utf-8") as f:
        table = json.load(f)
    # First row is the header (field names); the rest are data rows.
    columns, records = table[0], table[1:]
    ordered = sorted(records, key=lambda record: record[2])
    return [Pasuram(**dict(zip(columns, record))) for record in ordered]
def get_standardized_azhwar_names() -> list[str]:
    """Return the sorted, de-duplicated azhwar names from data/azhwars.json."""
    with open("./data/azhwars.json", "r", encoding="utf-8") as f:
        table = json.load(f)
    # Skip the header row; the 2nd column (index 1) holds the azhwar name.
    unique_names = {record[1] for record in table[1:]}
    return sorted(unique_names)
def get_standardized_divya_desam_names() -> list[str]:
    """Return the sorted, de-duplicated divya desam titles.

    Reads data/divya_desams.json (a Next.js-style payload with hits under
    pageProps.hits) and collects each hit's "title" field.
    """
    with open("./data/divya_desams.json", "r", encoding="utf-8") as f:
        payload = json.load(f)
    # Fields of interest per hit; absent keys are simply skipped.
    wanted = (
        "title",
        "other_names",
        "name_ta",
        "alwars",
        "area",
        "state",
        "thirukolam",
        "direction",
        "sampradayam",
        "divya_desam",
    )
    titles = set()
    for hit in payload["pageProps"]["hits"]:
        record = {field: hit[field] for field in wanted if field in hit}
        titles.add(record["title"])
    return sorted(titles)
def reorder_taniyan(collection):
    """Renumber records so taniyans come first within each prabandham.

    For every ``prabandham_code`` group, taniyan sections (``section_type``
    starting with "taniyan") are re-versed 1..k, and a fresh ``_global_index``
    is assigned sequentially across the whole collection: taniyans first in
    each group, then the remaining sections in their original order.

    Args:
        collection: a vector-store collection exposing
            ``get(include=["metadatas"])`` returning
            ``{"ids": [...], "metadatas": [...]}`` and
            ``update(ids=..., metadatas=...)``.
    """
    logger.info("reorder_taniyan: started")
    # Fetch all docs with ids + metadatas
    data = collection.get(include=["metadatas"])
    ids = data.get("ids", [])
    metas = data.get("metadatas", [])
    if not ids or not metas:
        logger.warning("reorder_taniyan: no data found in collection")
        return

    # Shared sort key: records missing _global_index sort last instead of
    # raising. FIX: the per-group sorts below previously indexed
    # m["_global_index"] directly and raised KeyError on data the global
    # sort tolerated via .get(..., inf).
    def _gidx(meta):
        return meta.get("_global_index", float("inf"))

    # sort globally by current _global_index
    records = sorted(enumerate(metas), key=lambda x: _gidx(x[1]))
    # group by prabandham_code; records without one are skipped entirely
    grouped = defaultdict(list)
    for i, meta in records:
        prabandham = meta.get("prabandham_code")
        if prabandham:
            grouped[prabandham].append((i, meta))
    updates = []
    global_counter = 1  # running _global_index across the collection
    for prabandham, items in grouped.items():
        taniyan_items = [
            (i, m) for i, m in items if m.get("section_type", "").startswith("taniyan")
        ]
        non_taniyan_items = [
            (i, m)
            for i, m in items
            if not m.get("section_type", "").startswith("taniyan")
        ]
        if not taniyan_items and not non_taniyan_items:
            continue
        # sort both groups by original _global_index (missing -> last)
        taniyan_items.sort(key=lambda x: _gidx(x[1]))
        non_taniyan_items.sort(key=lambda x: _gidx(x[1]))
        # --- taniyans first (verse starts from 1) ---
        for verse_no, (i, meta) in enumerate(taniyan_items, start=1):
            updates.append(
                {
                    "id": ids[i],
                    "metadata": {
                        **meta,
                        "_global_index": global_counter,
                        "verse": verse_no,
                    },
                }
            )
            global_counter += 1
        # --- non-taniyans continue from their base verse ---
        if non_taniyan_items:
            # NOTE(review): assumes every non-taniyan meta carries a "verse"
            # key (raises KeyError otherwise) -- confirm upstream guarantees.
            base_verse = min(m["verse"] for _, m in non_taniyan_items)
            for offset, (i, meta) in enumerate(non_taniyan_items):
                updates.append(
                    {
                        "id": ids[i],
                        "metadata": {
                            **meta,
                            "_global_index": global_counter,
                            "verse": base_verse + offset,
                        },
                    }
                )
                global_counter += 1
    if updates:
        logger.info("reorder_taniyan: updating %d records...", len(updates))
        collection.update(
            ids=[u["id"] for u in updates],
            metadatas=[u["metadata"] for u in updates],
        )
        logger.info("reorder_taniyan: update complete.")
    else:
        logger.info("reorder_taniyan: nothing to update")
    logger.info("reorder_taniyan: finished")
def delete_taniyan(collection):
    """Delete every record whose section_type starts with "taniyan".

    Args:
        collection: a vector-store collection exposing
            ``get(include=["metadatas"])`` and ``delete(ids=...)``.
    """
    logger.info("delete_taniyan: started")
    # Fetch all docs (only ids + metadata needed)
    data = collection.get(include=["metadatas"])
    # FIX: use .get defaults (consistent with reorder_taniyan) so an empty
    # collection result is a logged no-op rather than a KeyError.
    ids = data.get("ids", [])
    metas = data.get("metadatas", [])
    # Collect ids where section_type starts with "taniyan"
    taniyan_ids = [
        ids[i]
        for i, meta in enumerate(metas)
        if meta.get("section_type", "").startswith("taniyan")
    ]
    if taniyan_ids:
        logger.info("delete_taniyan: Deleting %d taniyan records...", len(taniyan_ids))
        collection.delete(ids=taniyan_ids)
        logger.info("delete_taniyan: Deleted %d taniyan records", len(taniyan_ids))
    else:
        logger.info("delete_taniyan: No taniyan records found")
    logger.info("delete_taniyan: finished")
if __name__ == "__main__":
    # Smoke check: log the standardized azhwar list when run as a script.
    azhwar_names = get_standardized_azhwar_names()
    logger.info(azhwar_names)