File size: 8,982 Bytes
22fd41f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7ae27cd
22fd41f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7ae27cd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22fd41f
 
7ae27cd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22fd41f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7ae27cd
22fd41f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7ae27cd
 
 
 
 
 
 
 
 
 
22fd41f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
"""
ingest.py — Step 1: Build the vector knowledge base from religious PDFs.

Run this ONCE before starting the app:
    python ingest.py

It will:
1. Load all PDFs from the ./books/ directory
2. Split them into overlapping semantic chunks
3. Embed each chunk using NVIDIA's llama-nemotron embedding model
4. Persist everything into a local ChromaDB vector store
"""

import os
import sys
from pathlib import Path
from dotenv import load_dotenv

from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_nvidia_ai_endpoints import NVIDIAEmbeddings
from langchain_chroma import Chroma
import re

# Read a local .env file (if any) before the os.getenv() lookups below.
load_dotenv()

# ─── Configuration ────────────────────────────────────────────────────────────

# Directory scanned for source PDFs (relative to the working directory).
BOOKS_DIR = Path("./books")
# Where the ChromaDB store is persisted; overridable via the environment.
CHROMA_DB_PATH = os.getenv("CHROMA_DB_PATH", "./chroma_db")
# Name of the Chroma collection the chunks are written to.
COLLECTION_NAME = os.getenv("COLLECTION_NAME", "sacred_texts")
# API key for the NVIDIA embedding endpoint; None when unset (ingest() exits).
NVIDIA_API_KEY = os.getenv("NVIDIA_API_KEY")

# Mapping of filename keywords → friendly book name stored in metadata.
# Keys are matched as lowercase substrings of the filename, in insertion
# order; the first keyword found wins.
BOOK_NAME_MAP = {
    "gita": "Bhagavad Gita",
    "bhagavad": "Bhagavad Gita",
    "quran": "Quran",
    "koran": "Quran",
    "bible": "Bible",
    "testament": "Bible",
    "granth": "Guru Granth Sahib",
    "guru": "Guru Granth Sahib",
}
# Chunk settings — tuned for religious texts (verses are short)
CHUNK_SIZE = 800       # characters per chunk
CHUNK_OVERLAP = 150    # overlap to preserve verse context across boundaries


# Regex patterns used to pull a human-readable verse reference out of a text
# span, keyed by book display name. Only the first match in the text is used.
# NOTE(review): these are deliberately loose. The Bhagavad Gita pattern matches
# any "digits.digits" (e.g. a decimal number), and the Guru Granth Sahib
# pattern matches any run of digits because the "Ang" prefix is optional.
VERSE_PATTERNS = {
    "Bhagavad Gita": r"(?:Verse\s+)?(\d+\.\d+)",          # Matches 2.47 or Verse 2.47
    "Quran": r"(\d+:\d+)",                                # Matches 2:286
    "Bible": r"(\d+\s+)?[A-Z][a-z]+\s+\d+:\d+",           # Matches John 3:16 or 1 Cor 13:4
    "Guru Granth Sahib": r"(?:Ang\s+)?(\d+)"              # Matches Ang 1 or 1
}

# Patterns to identify structure in the text, keyed by book display name.
# Group 1 / group 2 are parsed as integers: (chapter, verse) for most books,
# and a single "ang" number for the Guru Granth Sahib.
STRUCTURE_PATTERNS = {
    "Bhagavad Gita": r"(\d+)\.(\d+)",       # Matches 2.47 (Chapter.Verse)
    "Quran": r"(\d+):(\d+)",               # Matches 2:186 (Surah:Verse)
    "Bible": r"(\d+):(\d+)",               # Matches 3:16 (Chapter:Verse)
    "Guru Granth Sahib": r"Ang\s+(\d+)"    # Matches Ang 1
}

# ─── Helpers ──────────────────────────────────────────────────────────────────

def parse_structure(text, book_name):
    """
    Parse the first structural reference found in ``text``.

    The regex for ``book_name`` is looked up in STRUCTURE_PATTERNS and applied
    with ``re.search``, so only the first occurrence in the text counts.

    Returns:
    - {"ang": int} for "Guru Granth Sahib"
    - {"chapter": int, "verse": int} for any other book with a pattern
    - {} when the book has no pattern or the text contains no match
    """
    regex = STRUCTURE_PATTERNS.get(book_name)
    found = re.search(regex, text) if regex else None
    if found is None:
        return {}

    # The Guru Granth Sahib is addressed by a single "ang" number.
    if book_name == "Guru Granth Sahib":
        return {"ang": int(found.group(1))}

    chapter, verse = found.group(1), found.group(2)
    return {"chapter": int(chapter), "verse": int(verse)}

def extract_verse(text: str, book_name: str) -> str:
    """
    Return the first verse reference found in ``text`` for the given book.

    The regex is taken from VERSE_PATTERNS. The result is the full matched
    text (``group(0)``), "General Context" when the book has a pattern but the
    text contains no reference, or "Unknown" when the book has no pattern.
    """
    regex = VERSE_PATTERNS.get(book_name)
    if regex:
        hit = re.search(regex, text)
        if hit is None:
            return "General Context"
        return hit.group(0)
    return "Unknown"

def detect_book_name(filename: str) -> str:
    """
    Derive a display name for a book from its filename.

    The lowercased filename is checked against each keyword in BOOK_NAME_MAP
    (in the map's order); the first keyword contained in it decides the name.
    When nothing matches, the filename stem is used with underscores turned
    into spaces and title-casing applied.
    """
    lowered = filename.lower()
    matched = next(
        (title for keyword, title in BOOK_NAME_MAP.items() if keyword in lowered),
        None,
    )
    if matched is not None:
        return matched

    # No known keyword: fall back to a prettified filename stem.
    stem = Path(filename).stem
    return stem.replace("_", " ").title()


def load_pdf(pdf_path: Path) -> list:
    """
    Load a PDF using PyMuPDF (preferred) or PyPDF as fallback.

    The fallback is taken when PyMuPDF cannot be set up *or* when it fails
    while reading the file, so one problematic PDF/backend combination does
    not abort the whole ingestion run.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A list of LangChain Document objects as produced by the loader.

    Raises:
        Whatever PyPDFLoader raises if the fallback fails as well.
    """
    try:
        loader = PyMuPDFLoader(str(pdf_path))
        print(f"  📖 Loading with PyMuPDF: {pdf_path.name}")
        docs = loader.load()
    except Exception as exc:
        # Broad on purpose: the backend may be missing (import error) or may
        # choke on this particular file; either way PyPDF gets a chance.
        print(f"     PyMuPDF unavailable or failed ({type(exc).__name__}: {exc})")
        loader = PyPDFLoader(str(pdf_path))
        print(f"  📖 Loading with PyPDF: {pdf_path.name}")
        docs = loader.load()

    print(f"     → {len(docs)} pages loaded")
    return docs


def tag_documents(docs: list, book_name: str, source_file: str) -> list:
    """
    Annotate every document's metadata in place and return the same list.

    Keys written on each document:
    - book: display name (e.g. "Bhagavad Gita")
    - verse_citation: result of extract_verse() on the document's text
    - source_file: original filename
    - page: left untouched when the loader supplied it, otherwise set to 0
    """
    for document in docs:
        meta = document.metadata
        meta["book"] = book_name
        meta["verse_citation"] = extract_verse(document.page_content, book_name)
        meta["source_file"] = source_file
        # Only fills the gap; a loader-provided page number is preserved.
        meta.setdefault("page", 0)
    return docs


# ─── Main Ingestion ───────────────────────────────────────────────────────────

def ingest():
    """
    Run the end-to-end ingestion pipeline.

    Steps:
    1. Validate configuration (API key, books directory, at least one PDF).
    2. Load every ``*.pdf`` in BOOKS_DIR and tag its pages with book metadata.
    3. Split the pages into overlapping chunks and attach per-chunk metadata
       (chapter/verse or ang, plus a chunk-level verse citation when found).
    4. Embed the chunks in batches and persist them into ChromaDB.

    Exits the process with status 1 (via ``sys.exit``) when the API key is
    missing, the books directory does not exist, no PDFs are found, or the
    PDFs yield no text chunks at all.

    NOTE(review): presumably re-running against an existing CHROMA_DB_PATH
    adds the same chunks a second time — confirm, and clear the store before
    re-ingesting if so.
    """
    if not NVIDIA_API_KEY:
        print("❌  NVIDIA_API_KEY not set. Add it to your .env file.")
        sys.exit(1)

    if not BOOKS_DIR.exists():
        print(f"❌  Books directory not found: {BOOKS_DIR.resolve()}")
        print("    Create a ./books/ folder and add your PDFs there.")
        sys.exit(1)

    # Sorted so the ingestion order (and the log) is the same on every run;
    # glob() yields entries in arbitrary filesystem order.
    pdf_files = sorted(BOOKS_DIR.glob("*.pdf"))
    if not pdf_files:
        print(f"❌  No PDF files found in {BOOKS_DIR.resolve()}")
        sys.exit(1)

    rule = "─" * 50
    print("\n🕊️  Sacred Texts RAG — Ingestion Pipeline")
    print(rule)
    print(f"📂  Books directory : {BOOKS_DIR.resolve()}")
    print(f"💾  ChromaDB path   : {Path(CHROMA_DB_PATH).resolve()}")
    print(f"📚  PDFs found      : {len(pdf_files)}")
    print(f"{rule}\n")

    # ── Step 1: Load all PDFs ────────────────────────────────────────────────
    all_docs = []
    for pdf_path in pdf_files:
        book_name = detect_book_name(pdf_path.name)
        print(f"📕  {book_name}")
        raw_docs = load_pdf(pdf_path)
        tagged_docs = tag_documents(raw_docs, book_name, pdf_path.name)
        all_docs.extend(tagged_docs)
        print(f"     ✅  Tagged as '{book_name}'\n")

    print(f"📄  Total pages loaded: {len(all_docs)}")

    # ── Step 2: Split into chunks ────────────────────────────────────────────
    print(f"\n✂️   Splitting into chunks (size={CHUNK_SIZE}, overlap={CHUNK_OVERLAP})...")
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP,
        separators=["\n\n", "\n", ". ", " ", ""],  # Respect paragraph/verse boundaries
    )
    chunks = splitter.split_documents(all_docs)
    print(f"     → {len(chunks)} chunks created")

    # Without this guard the batch loop below never runs and the script would
    # still report success with an empty store (e.g. image-only PDFs).
    if not chunks:
        print("❌  No text could be extracted from the PDFs; nothing to embed.")
        sys.exit(1)

    # Attach chunk-level structure and citation metadata so it is saved in
    # ChromaDB alongside each chunk.
    print(f"🏷️   Parsing structure (chapters/verses) for {len(chunks)} chunks...")
    for chunk in chunks:
        book = chunk.metadata["book"]
        chunk.metadata.update(parse_structure(chunk.page_content, book))
        # Chunks inherit the page-level citation from tag_documents(), which is
        # the first reference on the whole page. Prefer the chunk's own first
        # reference when it has one; otherwise keep the inherited value.
        citation = extract_verse(chunk.page_content, book)
        if citation not in ("Unknown", "General Context"):
            chunk.metadata["verse_citation"] = citation

    print(f"     → {len(chunks)} chunks created and tagged")

    # ── Step 3: Embed & store ────────────────────────────────────────────────
    print("\n🔒  Initialising NVIDIA embedding model (llama-nemotron-embed-vl-1b-v2)...")
    embeddings = NVIDIAEmbeddings(
        model="nvidia/llama-nemotron-embed-vl-1b-v2",
        api_key=NVIDIA_API_KEY,
        truncate="NONE",
    )

    print("💾  Building ChromaDB vector store — this may take a few minutes...")
    print(f"     (Embedding {len(chunks)} chunks...)\n")

    # Process in batches to avoid rate limits
    batch_size = 100
    total_batches = (len(chunks) + batch_size - 1) // batch_size  # ceil division
    vector_store = None

    for batch_num, start in enumerate(range(0, len(chunks), batch_size), start=1):
        batch = chunks[start : start + batch_size]
        print(f"  Batch {batch_num}/{total_batches}: embedding {len(batch)} chunks...")

        if vector_store is None:
            # The first batch creates the persistent collection...
            vector_store = Chroma.from_documents(
                documents=batch,
                embedding=embeddings,
                persist_directory=CHROMA_DB_PATH,
                collection_name=COLLECTION_NAME,
            )
        else:
            # ...and later batches are appended to it.
            vector_store.add_documents(batch)

    print(f"\n{rule}")
    print("✅  Ingestion complete!")
    print(f"    📦  {len(chunks)} chunks stored in ChromaDB")
    print(f"    📂  Location: {Path(CHROMA_DB_PATH).resolve()}")
    print("\n👉  Now run: python app.py")
    print(f"{rule}\n")


# Script entry point: run the pipeline only when this file is executed
# directly (python ingest.py), not when it is imported.
if __name__ == "__main__":
    ingest()