# File size: 2,144 Bytes
# 5e0532d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
import asyncio
import lancedb
from lancedb.pydantic import LanceModel, Vector
from datasets import load_dataset
import sys
import os

# Add project root to path BEFORE importing app modules
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from app.services.llm import llm_service

class BibleVerse(LanceModel):
    """LanceDB row model holding a verse reference, its text, and an embedding.

    NOTE(review): ingest() in this file builds the table from plain dicts and
    does not pass this model to create_table — confirm whether it is used
    elsewhere before relying on it as the table schema.
    """
    reference: str
    text: str
    # NOTE(review): ingest() detects the embedding length at runtime; it may
    # not be 128 — confirm against the configured embedding backend.
    vector: Vector(128) # Default mock dimension

# Path handed to lancedb.connect() by ingest(). It is relative, so it resolves
# against the working directory of the process that runs the script.
DB_PATH = "data/lancedb_storage"
# Name of the table ingest() creates with mode="overwrite".
TABLE_NAME = "bible_verses"

async def ingest(limit: int = 500) -> None:
    """Embed the first ``limit`` dataset rows and write them to LanceDB.

    Streams the train split of 'odunola/bible-reference-sentence-pair', takes
    at most ``limit`` rows, embeds each row's text with
    ``llm_service.get_embedding`` and creates ``TABLE_NAME`` in the database
    at ``DB_PATH`` using ``mode="overwrite"``. Rows without text are skipped.

    Args:
        limit: Maximum number of dataset rows to fetch (MVP default: 500).

    Raises:
        ValueError: If ``limit`` is less than 1, or if an embedding's length
            differs from the length of the first embedding.
        RuntimeError: If the dataset yields no rows, or none of the fetched
            rows has text to embed (so there is nothing to write).
    """
    # Local import: itertools is not among this file's top-level imports.
    from itertools import islice

    if limit < 1:
        raise ValueError(f"limit must be >= 1, got {limit}")

    print("--- Starting Bible Ingestion ---")

    # 1. Connect to DB
    # Create the parent directory of DB_PATH; exist_ok avoids the
    # check-then-create race of a separate os.path.exists() test.
    db_parent = os.path.dirname(DB_PATH)
    if db_parent:
        os.makedirs(db_parent, exist_ok=True)
    db = lancedb.connect(DB_PATH)

    # 2. Load Dataset
    print("Loading 'odunola/bible-reference-sentence-pair' (streaming)...")
    ds = load_dataset("odunola/bible-reference-sentence-pair", split="train", streaming=True)

    # islice stops after `limit` rows without pulling more from the stream.
    print(f"Fetching first {limit} items...")
    verses = list(islice(ds, limit))
    if not verses:
        raise RuntimeError("Dataset yielded no rows; nothing to ingest.")

    print(f"Processing {len(verses)} verses...")

    # Correct keys found via debug:
    ref_key = 'references'
    text_key = 'headers'

    print(f"Using Ref Key: {ref_key}, Text Key: {text_key}")

    data_to_insert = []
    dim = None  # embedding length, taken from the first embedded row

    for i, item in enumerate(verses):
        ref = item.get(ref_key, "Unknown")
        text = item.get(text_key, "")

        if not text:
            continue

        vec = await llm_service.get_embedding(text)

        if dim is None:
            # Detect the dimension from the first real embedding instead of
            # issuing a separate, duplicate embedding call up front.
            dim = len(vec)
            print(f"Detected embedding dimension: {dim}")
        elif len(vec) != dim:
            # Fail early with context rather than at table-creation time.
            raise ValueError(
                f"Embedding for {ref!r} has length {len(vec)}, expected {dim}"
            )

        data_to_insert.append({
            "reference": ref,
            "text": text,
            "vector": vec
        })

        if (i+1) % 50 == 0:
            print(f"Processed {i+1}...")

    if not data_to_insert:
        # Do not overwrite an existing table with nothing.
        raise RuntimeError(
            f"None of the {len(verses)} fetched rows had text under key "
            f"{text_key!r}; nothing to write."
        )

    print("Writing to LanceDB...")
    db.create_table(TABLE_NAME, data=data_to_insert, mode="overwrite")
    print("Ingestion Complete!")

# Script entry point: run the ingest() coroutine to completion on a new event
# loop when this file is executed directly (not when imported).
if __name__ == "__main__":
    asyncio.run(ingest())