File size: 1,740 Bytes
4ce73a8
 
49d794f
4ce73a8
 
 
5f93db1
 
4ce73a8
 
 
 
 
 
 
 
 
 
5f93db1
 
 
 
4ce73a8
 
 
 
 
 
 
49d794f
 
4ce73a8
 
 
 
49d794f
 
4ce73a8
 
5f93db1
4ce73a8
 
 
 
49d794f
4ce73a8
49d794f
 
 
 
 
4ce73a8
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
import json
import chromadb
from tqdm import tqdm
from embeddings import get_embedding

# ===== SETTINGS =====
JSON_FILE = "./output/sri_stavam/sri_stavam_detailed.json"  # your JSON file path
COLLECTION_NAME = "sri_stavam"

# Load the JSON data (iterated further down, one entry per sloka).
with open(JSON_FILE, "r", encoding="utf-8") as f:
    slokas = json.load(f)

# Start Chroma DB client (can persist to disk or run in-memory)
client = chromadb.PersistentClient(path="./chromadb-store")  # persistent
# OR: client = chromadb.Client()  # in-memory only

# Reset the collection: drop any existing copy, then create it again so every
# run of this script starts from an empty collection.
try:
    client.delete_collection(name=COLLECTION_NAME)
except Exception:
    # Best-effort delete: on a first run there is nothing to remove and the
    # client raises. Catching Exception (rather than a bare `except:`) keeps
    # Ctrl-C / SystemExit working while still ignoring that failure.
    pass
collection = client.get_or_create_collection(name=COLLECTION_NAME)

# Prepare and insert each sloka.
# Four parallel lists are built; position i in each describes the same sloka.
ids = []
documents = []
embeddings = []
metadatas = []

# tqdm wraps the sequence itself (not the enumerate object) so it can read the
# length and display a total / ETA instead of a bare counter.
for index, sloka in enumerate(tqdm(slokas)):
    sloka_num = sloka.get("verse", 0)  # falls back to 0 when "verse" is absent
    translation = sloka["translation"]
    commentary = sloka["commentary"]

    # Combine fields into one searchable text blob; this is both the stored
    # document and the text that gets embedded.
    text_blob = (
        f"Sloka {sloka_num}\n\n"
        f"Translation:\n{translation}\n\n"
        f"Commentary:\n{commentary}\n\n"
    )

    ids.append(f"sloka-{index}")  # ids are 0-based positions in the JSON file
    documents.append(text_blob)
    embeddings.append(get_embedding(text=text_blob))
    metadatas.append(
        {
            "_global_index": index + 1,  # 1-based counterpart of the id suffix
            "sloka_number": sloka_num,
            "meaning_short": translation,
            "chapter": sloka["chapter"],
            "sanskrit": sloka["sanskrit"],
            "transliteration": sloka["transliteration"],
            "commentary": commentary,
        }
    )

# Add to Chroma collection in a single call. Skip the call entirely when the
# JSON held no entries rather than sending an empty batch.
if ids:
    collection.add(
        ids=ids,
        documents=documents,
        embeddings=embeddings,
        metadatas=metadatas,
    )

print(f"Inserted {len(documents)} slokas into collection '{COLLECTION_NAME}'.")