"""Ingest the first N verses of a Bible dataset into LanceDB.

Streams the HuggingFace dataset 'odunola/bible-reference-sentence-pair',
embeds each verse's text via the project's LLM service, and writes
reference/text/vector rows into a LanceDB table (overwriting any prior
table of the same name).
"""

import asyncio
import os
import sys

import lancedb
from datasets import load_dataset
from lancedb.pydantic import LanceModel, Vector

# Add project root to path BEFORE importing app modules
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from app.services.llm import llm_service


class BibleVerse(LanceModel):
    # NOTE(review): this schema is never passed to create_table() below —
    # LanceDB infers the table schema from the inserted rows — and the
    # hard-coded 128 may disagree with the dimension detected at runtime.
    # Kept for reference / future explicit-schema use.
    reference: str
    text: str
    vector: Vector(128)  # Default mock dimension


DB_PATH = "data/lancedb_storage"
TABLE_NAME = "bible_verses"
MAX_VERSES = 500  # MVP cap on how many verses to ingest


async def ingest():
    """Stream verses, embed them one by one, and (over)write the table.

    Side effects: creates the ``data`` directory if missing, connects to
    the LanceDB store at DB_PATH, and replaces TABLE_NAME with fresh rows.
    """
    print("--- Starting Bible Ingestion ---")

    # 1. Connect to DB. exist_ok avoids the racy exists()/makedirs() pair.
    os.makedirs("data", exist_ok=True)
    db = lancedb.connect(DB_PATH)

    # 2. Load dataset lazily; streaming keeps memory bounded to MAX_VERSES.
    print("Loading 'odunola/bible-reference-sentence-pair' (streaming)...")
    ds = load_dataset("odunola/bible-reference-sentence-pair", split="train", streaming=True)

    print(f"Fetching first {MAX_VERSES} items...")
    verses = []
    for item in ds:
        verses.append(item)
        if len(verses) >= MAX_VERSES:
            break

    if not verses:
        # Guard: verses[0] below would raise IndexError on an empty stream.
        print("No verses fetched; aborting ingestion.")
        return

    print(f"Processing {len(verses)} verses...")

    # Correct keys found via debug:
    ref_key = 'references'
    text_key = 'headers'
    print(f"Using Ref Key: {ref_key}, Text Key: {text_key}")

    # Detect embedding dimension from the first verse (informational —
    # LanceDB infers the vector width from the inserted rows themselves).
    first_vec = await llm_service.get_embedding(verses[0][text_key])
    print(f"Detected embedding dimension: {len(first_vec)}")

    data_to_insert = []
    for i, item in enumerate(verses):
        ref = item.get(ref_key, "Unknown")
        text = item.get(text_key, "")
        if not text:
            continue  # skip rows with no embeddable text
        vec = await llm_service.get_embedding(text)
        data_to_insert.append({
            "reference": ref,
            "text": text,
            "vector": vec
        })
        if (i + 1) % 50 == 0:
            print(f"Processed {i+1}...")

    if not data_to_insert:
        # create_table cannot infer a schema from an empty data list.
        print("No rows to insert; aborting ingestion.")
        return

    print("Writing to LanceDB...")
    db.create_table(TABLE_NAME, data=data_to_insert, mode="overwrite")
    print("Ingestion Complete!")


if __name__ == "__main__":
    asyncio.run(ingest())