Spaces:
Sleeping
Sleeping
| import asyncio | |
| import lancedb | |
| from lancedb.pydantic import LanceModel, Vector | |
| from datasets import load_dataset | |
| import sys | |
| import os | |
| # Add project root to path BEFORE importing app modules | |
| sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) | |
| from app.services.llm import llm_service | |
class BibleVerse(LanceModel):
    """LanceDB row schema for one verse: its reference, its text, and an embedding."""

    # Human-readable verse reference.
    reference: str
    # The verse text.
    text: str
    # Embedding vector; 128 is the default mock dimension.
    # NOTE(review): ingest() reports the real embedding size at runtime —
    # confirm it matches this declared width before relying on this schema.
    vector: Vector(128)
# Filesystem location handed to lancedb.connect().
DB_PATH = "data/lancedb_storage"
# Name of the table that ingest() (re)creates inside that database.
TABLE_NAME = "bible_verses"
async def ingest(limit: int = 500) -> None:
    """Embed the first ``limit`` dataset items and store them in LanceDB.

    Streams 'odunola/bible-reference-sentence-pair', takes at most ``limit``
    items, requests an embedding for every item that has text, and writes the
    resulting rows to ``TABLE_NAME`` with ``mode="overwrite"``.

    Args:
        limit: Maximum number of dataset items to fetch. Defaults to 500,
            the MVP sample size.

    Returns:
        None. If the dataset yields no items, or no item has any text, the
        function returns early and the existing table is left untouched.
    """
    # Local import so the block does not depend on a file-level import change.
    from itertools import islice

    print("--- Starting Bible Ingestion ---")

    # 1. Connect to DB
    # Create the parent directory of the storage path. exist_ok avoids the
    # check-then-create race of a separate os.path.exists() test.
    storage_parent = os.path.dirname(DB_PATH)
    if storage_parent:
        os.makedirs(storage_parent, exist_ok=True)
    db = lancedb.connect(DB_PATH)

    # 2. Load Dataset
    print("Loading 'odunola/bible-reference-sentence-pair' (streaming)...")
    ds = load_dataset("odunola/bible-reference-sentence-pair", split="train", streaming=True)

    # Only a prefix of the stream is consumed.
    print(f"Fetching first {limit} items...")
    verses = list(islice(ds, limit))
    if not verses:
        # Nothing to embed; bail out instead of indexing into an empty list.
        print("Dataset yielded no items; nothing to ingest.")
        return

    print(f"Processing {len(verses)} verses...")

    # Correct keys found via debug:
    ref_key = 'references'
    text_key = 'headers'
    print(f"Using Ref Key: {ref_key}, Text Key: {text_key}")

    data_to_insert = []
    dim = None  # embedding length, learned from the first vector produced
    for i, item in enumerate(verses):
        ref = item.get(ref_key, "Unknown")
        text = item.get(text_key, "")
        if not text:
            continue

        vec = await llm_service.get_embedding(text)
        if dim is None:
            # Report the dimension from the first real embedding rather than
            # spending a separate embedding call just to measure it.
            dim = len(vec)
            print(f"Detected embedding dimension: {dim}")

        data_to_insert.append({
            "reference": ref,
            "text": text,
            "vector": vec,
        })
        if (i + 1) % 50 == 0:
            print(f"Processed {i + 1}...")

    if not data_to_insert:
        # Do not replace an existing table with an empty payload.
        print("No verses with text were found; leaving the existing table untouched.")
        return

    print("Writing to LanceDB...")
    db.create_table(TABLE_NAME, data=data_to_insert, mode="overwrite")
    print("Ingestion Complete!")
# Script entry point: run the async ingestion to completion when executed directly.
if __name__ == "__main__":
    asyncio.run(ingest())