""" ingest.py — Step 1: Build the vector knowledge base from religious PDFs. Run this ONCE before starting the app: python ingest.py It will: 1. Load all PDFs from the ./books/ directory 2. Split them into overlapping semantic chunks 3. Embed each chunk using NVIDIA's llama-nemotron embedding model 4. Persist everything into a local ChromaDB vector store """ import os import sys from pathlib import Path from dotenv import load_dotenv from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader from langchain_text_splitters import RecursiveCharacterTextSplitter from langchain_nvidia_ai_endpoints import NVIDIAEmbeddings from langchain_chroma import Chroma import re load_dotenv() # ─── Configuration ──────────────────────────────────────────────────────────── BOOKS_DIR = Path("./books") CHROMA_DB_PATH = os.getenv("CHROMA_DB_PATH", "./chroma_db") COLLECTION_NAME = os.getenv("COLLECTION_NAME", "sacred_texts") NVIDIA_API_KEY = os.getenv("NVIDIA_API_KEY") # Mapping of filename keywords → friendly book name stored in metadata BOOK_NAME_MAP = { "gita": "Bhagavad Gita", "bhagavad": "Bhagavad Gita", "quran": "Quran", "koran": "Quran", "bible": "Bible", "testament": "Bible", "granth": "Guru Granth Sahib", # ← ADD "guru": "Guru Granth Sahib", # ← ADD } # Chunk settings — tuned for religious texts (verses are short) CHUNK_SIZE = 800 # characters per chunk CHUNK_OVERLAP = 150 # overlap to preserve verse context across boundaries # Regex patterns for different scriptures VERSE_PATTERNS = { "Bhagavad Gita": r"(?:Verse\s+)?(\d+\.\d+)", # Matches 2.47 or Verse 2.47 "Quran": r"(\d+:\d+)", # Matches 2:286 "Bible": r"(\d+\s+)?[A-Z][a-z]+\s+\d+:\d+", # Matches John 3:16 or 1 Cor 13:4 "Guru Granth Sahib": r"(?:Ang\s+)?(\d+)" # Matches Ang 1 or 1 } # Patterns to identify structure in the text STRUCTURE_PATTERNS = { "Bhagavad Gita": r"(\d+)\.(\d+)", # Matches 2.47 (Chapter.Verse) "Quran": r"(\d+):(\d+)", # Matches 2:186 (Surah:Verse) "Bible": r"(\d+):(\d+)", # Matches 3:16 (Chapter:Verse) "Guru Granth Sahib": r"Ang\s+(\d+)" # Matches Ang 1 } # ─── Helpers ────────────────────────────────────────────────────────────────── def parse_structure(text, book_name): pattern = STRUCTURE_PATTERNS.get(book_name) if not pattern: return {} match = re.search(pattern, text) if match: if book_name == "Guru Granth Sahib": return {"ang": int(match.group(1))} return {"chapter": int(match.group(1)), "verse": int(match.group(2))} return {} def extract_verse(text: str, book_name: str) -> str: """Extracts a verse reference from a text chunk based on the book.""" pattern = VERSE_PATTERNS.get(book_name) if not pattern: return "Unknown" match = re.search(pattern, text) return match.group(0) if match else "General Context" def detect_book_name(filename: str) -> str: """Infer the book's display name from its filename.""" name_lower = filename.lower() for keyword, book_name in BOOK_NAME_MAP.items(): if keyword in name_lower: return book_name # Fallback: use the filename stem, title-cased return Path(filename).stem.replace("_", " ").title() def load_pdf(pdf_path: Path) -> list: """ Load a PDF using PyMuPDF (preferred) or PyPDF as fallback. Returns a list of LangChain Document objects. """ try: loader = PyMuPDFLoader(str(pdf_path)) print(f" 📖 Loading with PyMuPDF: {pdf_path.name}") except Exception: loader = PyPDFLoader(str(pdf_path)) print(f" 📖 Loading with PyPDF: {pdf_path.name}") docs = loader.load() print(f" → {len(docs)} pages loaded") return docs def tag_documents(docs: list, book_name: str, source_file: str) -> list: """ Enrich each document's metadata with: - book: display name (e.g. "Bhagavad Gita") - source_file: original filename """ for doc in docs: doc.metadata["book"] = book_name doc.metadata["verse_citation"] = extract_verse(doc.page_content, book_name) doc.metadata["source_file"] = source_file # Keep the page number if already present from the loader if "page" not in doc.metadata: doc.metadata["page"] = 0 return docs # ─── Main Ingestion ─────────────────────────────────────────────────────────── def ingest(): if not NVIDIA_API_KEY: print("❌ NVIDIA_API_KEY not set. Add it to your .env file.") sys.exit(1) if not BOOKS_DIR.exists(): print(f"❌ Books directory not found: {BOOKS_DIR.resolve()}") print(" Create a ./books/ folder and add your PDFs there.") sys.exit(1) pdf_files = list(BOOKS_DIR.glob("*.pdf")) if not pdf_files: print(f"❌ No PDF files found in {BOOKS_DIR.resolve()}") sys.exit(1) print(f"\n🕊️ Sacred Texts RAG — Ingestion Pipeline") print(f"{'─' * 50}") print(f"📂 Books directory : {BOOKS_DIR.resolve()}") print(f"💾 ChromaDB path : {Path(CHROMA_DB_PATH).resolve()}") print(f"📚 PDFs found : {len(pdf_files)}") print(f"{'─' * 50}\n") # ── Step 1: Load all PDFs ──────────────────────────────────────────────── all_docs = [] for pdf_path in pdf_files: book_name = detect_book_name(pdf_path.name) print(f"📕 {book_name}") raw_docs = load_pdf(pdf_path) tagged_docs = tag_documents(raw_docs, book_name, pdf_path.name) all_docs.extend(tagged_docs) print(f" ✅ Tagged as '{book_name}'\n") print(f"📄 Total pages loaded: {len(all_docs)}") # ── Step 2: Split into chunks ──────────────────────────────────────────── print(f"\n✂️ Splitting into chunks (size={CHUNK_SIZE}, overlap={CHUNK_OVERLAP})...") splitter = RecursiveCharacterTextSplitter( chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP, separators=["\n\n", "\n", ". ", " ", ""], # Respect paragraph/verse boundaries ) chunks = splitter.split_documents(all_docs) print(f" → {len(chunks)} chunks created") # Add verse citations to chunk metadata for better source attribution print(f"🏷️ Parsing structure (chapters/verses) for {len(chunks)} chunks...") for chunk in chunks: # Use the parse_structure function you defined structure = parse_structure(chunk.page_content, chunk.metadata["book"]) # Update the chunk metadata so it is saved in ChromaDB chunk.metadata.update(structure) print(f" → {len(chunks)} chunks created and tagged") # ── Step 3: Embed & store ──────────────────────────────────────────────── print(f"\n🔢 Initialising NVIDIA embedding model (llama-nemotron-embed-vl-1b-v2)...") embeddings = NVIDIAEmbeddings( model="nvidia/llama-nemotron-embed-vl-1b-v2", api_key=NVIDIA_API_KEY, truncate="NONE", ) print(f"💾 Building ChromaDB vector store — this may take a few minutes...") print(f" (Embedding {len(chunks)} chunks...)\n") # Process in batches to avoid rate limits BATCH_SIZE = 100 vector_store = None for i in range(0, len(chunks), BATCH_SIZE): batch = chunks[i : i + BATCH_SIZE] batch_num = i // BATCH_SIZE + 1 total_batches = (len(chunks) + BATCH_SIZE - 1) // BATCH_SIZE print(f" Batch {batch_num}/{total_batches}: embedding {len(batch)} chunks...") if vector_store is None: vector_store = Chroma.from_documents( documents=batch, embedding=embeddings, persist_directory=CHROMA_DB_PATH, collection_name=COLLECTION_NAME, ) else: vector_store.add_documents(batch) print(f"\n{'─' * 50}") print(f"✅ Ingestion complete!") print(f" 📦 {len(chunks)} chunks stored in ChromaDB") print(f" 📂 Location: {Path(CHROMA_DB_PATH).resolve()}") print(f"\n👉 Now run: python app.py") print(f"{'─' * 50}\n") if __name__ == "__main__": ingest()