# Page header captured together with this listing (not part of the script):
#   Spaces status: Sleeping | File size: 2,144 Bytes | 5e0532d (presumably the revision id)
import asyncio
import lancedb
from lancedb.pydantic import LanceModel, Vector
from datasets import load_dataset
import sys
import os
# Add project root to path BEFORE importing app modules
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from app.services.llm import llm_service
class BibleVerse(LanceModel):
    """LanceDB row model with a reference, its text, and a 128-wide vector.

    NOTE(review): nothing visible in this file passes this model to LanceDB;
    confirm whether it is still needed or whether its fixed vector width
    should follow the real embedding size.
    """

    # Label of the passage.
    reference: str
    # Passage text.
    text: str
    # Default mock dimension
    vector: Vector(128)
# Location handed to lancedb.connect(); a relative path, so presumably
# resolved against the current working directory -- run from the project root.
DB_PATH = "data/lancedb_storage"
# Name of the table that ingest() creates with mode="overwrite".
TABLE_NAME = "bible_verses"
async def ingest(limit: int = 500) -> None:
    """Embed the first ``limit`` dataset rows and write them to LanceDB.

    Streams ``odunola/bible-reference-sentence-pair`` (``train`` split), takes
    at most ``limit`` rows, embeds each non-empty text with
    ``llm_service.get_embedding`` and replaces the ``TABLE_NAME`` table in the
    database at ``DB_PATH`` with the resulting rows.

    Args:
        limit: Maximum number of rows to pull from the stream. Defaults to
            500, the size used for the MVP. Values below zero are treated as 0.

    Returns:
        None. Progress is reported on stdout. When the stream yields no rows,
        or no row has any text, the function returns without touching the
        existing table.
    """
    # Function-scope import so the file's order-sensitive import block
    # (sys.path is patched before the app import) stays untouched.
    from itertools import islice

    print("--- Starting Bible Ingestion ---")

    # 1. Connect to DB
    # exist_ok avoids the check-then-create race of a separate exists() test.
    storage_parent = os.path.dirname(DB_PATH)
    if storage_parent:
        os.makedirs(storage_parent, exist_ok=True)
    db = lancedb.connect(DB_PATH)

    # 2. Load Dataset
    print("Loading 'odunola/bible-reference-sentence-pair' (streaming)...")
    ds = load_dataset("odunola/bible-reference-sentence-pair", split="train", streaming=True)

    # For MVP, only the first `limit` rows are taken from the stream.
    print(f"Fetching first {limit} items...")
    verses = list(islice(ds, max(limit, 0)))

    print(f"Processing {len(verses)} verses...")
    if not verses:
        print("Dataset stream returned no items; nothing to ingest.")
        return

    # Correct keys found via debug:
    # NOTE(review): the verse text is read from 'headers' and the reference
    # from 'references', as in the original script -- confirm against the
    # dataset's columns.
    ref_key = 'references'
    text_key = 'headers'
    print(f"Using Ref Key: {ref_key}, Text Key: {text_key}")

    data_to_insert = []
    dim = None
    for count, item in enumerate(verses, start=1):
        text = item.get(text_key, "")
        if not text:
            continue

        vec = await llm_service.get_embedding(text)

        # Report the dimension from the first real embedding instead of
        # spending an extra embedding call on verses[0] up front.
        if dim is None:
            dim = len(vec)
            print(f"Detected embedding dimension: {dim}")

        data_to_insert.append({
            "reference": item.get(ref_key, "Unknown"),
            "text": text,
            "vector": vec,
        })

        if count % 50 == 0:
            print(f"Processed {count}...")

    if not data_to_insert:
        # Don't overwrite an existing table with an empty payload.
        print("No rows with text were found; nothing written.")
        return

    print("Writing to LanceDB...")
    db.create_table(TABLE_NAME, data=data_to_insert, mode="overwrite")
    print("Ingestion Complete!")
# Script entry point: importing this module does not start an ingestion.
if __name__ == "__main__":
    # asyncio.run drives the ingest() coroutine to completion.
    asyncio.run(ingest())
|