"""
ingest.py — Step 1: Build the vector knowledge base from religious PDFs.
Run this ONCE before starting the app:
python ingest.py
It will:
1. Load all PDFs from the ./books/ directory
2. Split them into overlapping semantic chunks
3. Embed each chunk using NVIDIA's llama-nemotron embedding model
4. Persist everything into a local ChromaDB vector store
"""
import os
import sys
from pathlib import Path
from dotenv import load_dotenv
from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_nvidia_ai_endpoints import NVIDIAEmbeddings
from langchain_chroma import Chroma
import re
# Read settings from a local .env file (if one exists) into the process
# environment so the configuration lookups below can see them.
load_dotenv()
# ─── Configuration ───────────────────────────────────────────────────────────
# Input folder for the PDFs, plus the vector-store location, collection name
# and API key, each overridable through the environment.
BOOKS_DIR = Path("./books")
CHROMA_DB_PATH = os.environ.get("CHROMA_DB_PATH", "./chroma_db")
COLLECTION_NAME = os.environ.get("COLLECTION_NAME", "sacred_texts")
NVIDIA_API_KEY = os.environ.get("NVIDIA_API_KEY")

# Mapping of filename keywords → friendly book name stored in metadata.
# Written as (book, aliases) groups and flattened into keyword → book; the
# resulting insertion order is gita, bhagavad, quran, koran, bible, testament,
# granth, guru.
BOOK_NAME_MAP = {
    keyword: title
    for title, keywords in (
        ("Bhagavad Gita", ("gita", "bhagavad")),
        ("Quran", ("quran", "koran")),
        ("Bible", ("bible", "testament")),
        ("Guru Granth Sahib", ("granth", "guru")),
    )
    for keyword in keywords
}

# Chunk settings — tuned for religious texts (verses are short)
CHUNK_SIZE = 800  # characters per chunk
CHUNK_OVERLAP = 150  # overlap to preserve verse context across boundaries
# Per-book regexes used to pull a human-readable verse reference out of text.
VERSE_PATTERNS = {
    # 2.47 or Verse 2.47
    "Bhagavad Gita": r"(?:Verse\s+)?(\d+\.\d+)",
    # 2:286
    "Quran": r"(\d+:\d+)",
    # John 3:16 or 1 Cor 13:4
    "Bible": r"(\d+\s+)?[A-Z][a-z]+\s+\d+:\d+",
    # Ang 1 or 1
    "Guru Granth Sahib": r"(?:Ang\s+)?(\d+)",
}

# Per-book regexes whose capture groups expose the numeric structure of a
# reference (chapter and verse, or the Ang number).
STRUCTURE_PATTERNS = {
    # 2.47 (Chapter.Verse)
    "Bhagavad Gita": r"(\d+)\.(\d+)",
    # 2:186 (Surah:Verse)
    "Quran": r"(\d+):(\d+)",
    # 3:16 (Chapter:Verse)
    "Bible": r"(\d+):(\d+)",
    # Ang 1
    "Guru Granth Sahib": r"Ang\s+(\d+)",
}
# ─── Helpers ─────────────────────────────────────────────────────────────────
def parse_structure(text, book_name):
    """
    Pull the first structural reference for *book_name* out of *text*.

    The regex is looked up in STRUCTURE_PATTERNS by book name and applied with
    ``re.search``, so only the first occurrence in the text is used.

    Returns:
        ``{"ang": int}`` for "Guru Granth Sahib", ``{"chapter": int,
        "verse": int}`` for any other book that has a pattern, and an empty
        dict when the book has no pattern or nothing in the text matches.
    """
    regex = STRUCTURE_PATTERNS.get(book_name)
    found = re.search(regex, text) if regex else None
    if found is None:
        return {}

    # The Guru Granth Sahib pattern has a single capture group (the Ang).
    if book_name == "Guru Granth Sahib":
        return {"ang": int(found.group(1))}

    chapter, verse = found.group(1), found.group(2)
    return {"chapter": int(chapter), "verse": int(verse)}
def extract_verse(text: str, book_name: str) -> str:
    """
    Return the first verse reference found in *text* for the given book.

    Returns:
        The full matched text of the book's VERSE_PATTERNS regex,
        "General Context" when the regex finds nothing, or "Unknown" when
        the book has no regex registered.
    """
    regex = VERSE_PATTERNS.get(book_name)
    if regex:
        hit = re.search(regex, text)
        return "General Context" if hit is None else hit.group(0)
    return "Unknown"
def detect_book_name(filename: str) -> str:
    """
    Work out a display name for a book from the name of its file.

    The lower-cased filename is checked against the BOOK_NAME_MAP keywords
    (substring match); the first keyword in map order that occurs decides the
    name. With no keyword hit, the filename stem is used with underscores
    turned into spaces and title-casing applied.
    """
    lowered = filename.lower()
    candidates = [
        title for keyword, title in BOOK_NAME_MAP.items() if keyword in lowered
    ]
    if candidates:
        return candidates[0]

    # No known keyword: derive a readable name from the file stem instead.
    stem = Path(filename).stem
    return stem.replace("_", " ").title()
def load_pdf(pdf_path: Path) -> list:
    """
    Load a PDF using PyMuPDF (preferred) or PyPDF as fallback.

    Both creating the PyMuPDF loader and running its ``load()`` happen inside
    the guarded section, so a failure at either point (loader unavailable, or
    the file cannot be parsed) switches to PyPDF instead of aborting.

    Args:
        pdf_path: Location of the PDF file to read.

    Returns:
        A list of LangChain Document objects (reported below as pages).

    Raises:
        Exception: Whatever PyPDFLoader raises if the fallback fails as well.
    """
    try:
        loader = PyMuPDFLoader(str(pdf_path))
        print(f" π Loading with PyMuPDF: {pdf_path.name}")
        docs = loader.load()
    except Exception as exc:
        # Deliberately broad: any PyMuPDF problem should trigger the fallback.
        # The reason is printed so the switch is never silent.
        print(f" PyMuPDF failed ({type(exc).__name__}: {exc}); falling back to PyPDF")
        loader = PyPDFLoader(str(pdf_path))
        print(f" π Loading with PyPDF: {pdf_path.name}")
        docs = loader.load()

    print(f" β {len(docs)} pages loaded")
    return docs
def tag_documents(docs: list, book_name: str, source_file: str) -> list:
    """
    Stamp book-level metadata onto every document, in place.

    Keys written to each document's ``metadata``:
    - book: display name (e.g. "Bhagavad Gita")
    - verse_citation: result of extract_verse() on the document's text
    - source_file: original filename
    - page: set to 0 only when the loader did not already provide one

    Returns the same list that was passed in.
    """
    for document in docs:
        meta = document.metadata
        meta["book"] = book_name
        meta["verse_citation"] = extract_verse(document.page_content, book_name)
        meta["source_file"] = source_file
        # Preserve a loader-supplied page number; default only if absent.
        meta.setdefault("page", 0)
    return docs
# ─── Main Ingestion ──────────────────────────────────────────────────────────
def ingest():
    """
    Build the ChromaDB knowledge base from every PDF in BOOKS_DIR.

    Steps:
    1. Check prerequisites (API key, books directory, at least one PDF).
    2. Load each PDF and tag its documents with book-level metadata.
    3. Split the documents into overlapping chunks and add the structure
       fields from parse_structure() to each chunk's metadata.
    4. Embed the chunks in batches of 100 and store them in the Chroma
       collection COLLECTION_NAME under CHROMA_DB_PATH.

    Progress is printed to stdout throughout.

    Raises:
        SystemExit: With status 1 when the API key is missing, the books
            directory does not exist, it holds no ``*.pdf`` files, or the
            PDFs produce no text chunks at all.
    """
    # Fail fast on missing prerequisites before any expensive work starts.
    if not NVIDIA_API_KEY:
        print("β NVIDIA_API_KEY not set. Add it to your .env file.")
        sys.exit(1)

    if not BOOKS_DIR.exists():
        print(f"β Books directory not found: {BOOKS_DIR.resolve()}")
        print(" Create a ./books/ folder and add your PDFs there.")
        sys.exit(1)

    # Sorted so the books are processed in the same order on every run.
    pdf_files = sorted(BOOKS_DIR.glob("*.pdf"))
    if not pdf_files:
        print(f"β No PDF files found in {BOOKS_DIR.resolve()}")
        sys.exit(1)

    rule = "β" * 50
    print("\nποΈ Sacred Texts RAG β Ingestion Pipeline")
    print(rule)
    print(f"π Books directory : {BOOKS_DIR.resolve()}")
    print(f"πΎ ChromaDB path : {Path(CHROMA_DB_PATH).resolve()}")
    print(f"π PDFs found : {len(pdf_files)}")
    print(f"{rule}\n")

    # ── Step 1: Load all PDFs ────────────────────────────────────────────────
    all_docs = []
    for pdf_path in pdf_files:
        book_name = detect_book_name(pdf_path.name)
        print(f"π {book_name}")
        raw_docs = load_pdf(pdf_path)
        all_docs.extend(tag_documents(raw_docs, book_name, pdf_path.name))
        print(f" ✅ Tagged as '{book_name}'\n")
    print(f"π Total pages loaded: {len(all_docs)}")

    # ── Step 2: Split into chunks ────────────────────────────────────────────
    print(f"\nβοΈ Splitting into chunks (size={CHUNK_SIZE}, overlap={CHUNK_OVERLAP})...")
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP,
        separators=["\n\n", "\n", ". ", " ", ""],  # Respect paragraph/verse boundaries
    )
    chunks = splitter.split_documents(all_docs)
    print(f" β {len(chunks)} chunks created")

    # Without this guard the batch loop below would never run and the script
    # would report success with nothing stored.
    if not chunks:
        print("❌ No text chunks were produced — do the PDFs contain extractable text?")
        sys.exit(1)

    # Add chapter/verse (or Ang) fields so they are saved alongside each chunk.
    # NOTE(review): "verse_citation" is only computed per page in
    # tag_documents(); it is not recomputed per chunk here — confirm that is
    # the intended attribution.
    print(f"π·οΈ Parsing structure (chapters/verses) for {len(chunks)} chunks...")
    for chunk in chunks:
        structure = parse_structure(chunk.page_content, chunk.metadata["book"])
        chunk.metadata.update(structure)
    print(f" β {len(chunks)} chunks created and tagged")

    # ── Step 3: Embed & store ────────────────────────────────────────────────
    print("\nπ’ Initialising NVIDIA embedding model (llama-nemotron-embed-vl-1b-v2)...")
    embeddings = NVIDIAEmbeddings(
        model="nvidia/llama-nemotron-embed-vl-1b-v2",
        api_key=NVIDIA_API_KEY,
        truncate="NONE",
    )

    print("πΎ Building ChromaDB vector store β this may take a few minutes...")
    print(f" (Embedding {len(chunks)} chunks...)\n")

    # Process in batches to avoid rate limits
    batch_size = 100
    total_batches = (len(chunks) + batch_size - 1) // batch_size  # ceil division
    vector_store = None
    for batch_num, start in enumerate(range(0, len(chunks), batch_size), start=1):
        batch = chunks[start : start + batch_size]
        print(f" Batch {batch_num}/{total_batches}: embedding {len(batch)} chunks...")
        if vector_store is None:
            # The first batch creates the persisted collection.
            vector_store = Chroma.from_documents(
                documents=batch,
                embedding=embeddings,
                persist_directory=CHROMA_DB_PATH,
                collection_name=COLLECTION_NAME,
            )
        else:
            vector_store.add_documents(batch)

    print(f"\n{rule}")
    print("✅ Ingestion complete!")
    print(f" π¦ {len(chunks)} chunks stored in ChromaDB")
    print(f" π Location: {Path(CHROMA_DB_PATH).resolve()}")
    print("\nπ Now run: python app.py")
    print(f"{rule}\n")
# Run the pipeline only when this file is executed as a script
# (``python ingest.py``), not when it is imported as a module.
if __name__ == "__main__":
    ingest()