# vedam_ai/json_loader.py
# Uploaded via huggingface_hub by vikramvasudevan (commit 49d794f, verified)
import json
import chromadb
from tqdm import tqdm
from embeddings import get_embedding
# ===== SETTINGS =====
JSON_FILE = "./output/sri_stavam/sri_stavam_detailed.json" # your JSON file path
COLLECTION_NAME = "sri_stavam"  # Chroma collection name (rebuilt from scratch below)
# Load the JSON data
# NOTE(review): assumes the file contains a list of sloka dicts, each with
# "translation", "commentary", "chapter", "sanskrit", "transliteration" and
# optionally "verse" keys (read in the loop below) — confirm against the
# JSON producer.
with open(JSON_FILE, "r", encoding="utf-8") as f:
    slokas = json.load(f)
# Start Chroma DB client (can persist to disk or run in-memory)
client = chromadb.PersistentClient(path="./chromadb-store")  # persistent
# OR: client = chromadb.Client()  # in-memory only

# Drop any stale copy of the collection so it is rebuilt from scratch.
# delete_collection raises when the collection does not exist yet (e.g. on a
# first run), which is fine to ignore — but a bare `except:` would also
# swallow KeyboardInterrupt/SystemExit, so catch Exception instead.
try:
    client.delete_collection(name=COLLECTION_NAME)
except Exception:
    pass
collection = client.get_or_create_collection(name=COLLECTION_NAME)
# Prepare and insert each sloka.
# Parallel lists accumulated here are passed to collection.add() below.
ids = []
documents = []
embeddings = []
metadatas = []
# `idx` avoids shadowing the builtin `id`; passing total= lets tqdm render a
# real progress bar (a bare enumerate() has no length, so tqdm would only
# show a counter with no percentage/ETA).
for idx, sloka in tqdm(enumerate(slokas), total=len(slokas)):
    sloka_num = sloka.get("verse", 0)
    # Combine fields into one searchable text blob for embedding/retrieval.
    text_blob = (
        f"Sloka {sloka_num}\n\n"
        f"Translation:\n{sloka['translation']}\n\n"
        f"Commentary:\n{sloka['commentary']}\n\n"
    )
    ids.append(f"sloka-{idx}")
    documents.append(text_blob)
    embeddings.append(get_embedding(text=text_blob))
    metadatas.append(
        {
            # 1-based position in the file, independent of the "verse" field
            "_global_index": idx + 1,
            "sloka_number": sloka_num,
            "meaning_short": sloka["translation"],
            "chapter": sloka["chapter"],
            "sanskrit": sloka["sanskrit"],
            "transliteration": sloka["transliteration"],
            "commentary": sloka["commentary"],
        }
    )
# Bulk-insert everything accumulated above in a single Chroma call.
collection.add(
    ids=ids,
    documents=documents,
    embeddings=embeddings,
    metadatas=metadatas,
)
# Report how many records made it into the collection.
print(f"Inserted {len(documents)} slokas into collection '{COLLECTION_NAME}'.")