import os
import pickle
from typing import Dict, List, Set

from dotenv import load_dotenv
from langchain.chains import RetrievalQA
from langchain_community.retrievers import BM25Retriever
from langchain_core.callbacks import CallbackManagerForRetrieverRun
from langchain_core.documents import Document
from langchain_core.prompts import PromptTemplate
from langchain_core.retrievers import BaseRetriever
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from langchain_pinecone import PineconeVectorStore

load_dotenv()

# ===============================
# CONFIG
# ===============================
INDEX_NAME = "branham-index"
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
CHUNKS_FILE = os.path.join(BASE_DIR, "sermon_chunks.pkl")

# ===============================
# CANONICAL SERIES
# ===============================
SEVEN_SEALS_CANON = [
    "63-0317E The Breach Between The Church Ages And The Seven Seals.pdf",
    "63-0317M God Hiding Himself In Simplicity, Then Revealing Himself In The Same.pdf",
    "63-0318 The First Seal.pdf",
    "63-0319 The Second Seal.pdf",
    "63-0320 The Third Seal.pdf",
    "63-0321 The Fourth Seal.pdf",
    "63-0322 The Fifth Seal.pdf",
    "63-0323 The Sixth Seal.pdf",
    "63-0324E The Seventh Seal.pdf",
    "63-0324M Questions And Answers On The Seals.pdf",
]

SERIES_GROUPS = {
    "seven seals": SEVEN_SEALS_CANON,
}

# ===============================
# HELPERS
# ===============================


def normalize(text: str) -> str:
    """Lowercase *text* and replace underscores/dashes with spaces for loose matching."""
    return text.lower().replace("_", " ").replace("-", " ").strip()


def load_chunks() -> List[Document]:
    """Load pre-chunked sermon Documents from the local pickle cache.

    Returns an empty list when the cache file does not exist.

    NOTE(security): pickle.load executes arbitrary code from the file; only
    load a cache this application wrote itself, never untrusted input.
    """
    if not os.path.exists(CHUNKS_FILE):
        return []
    with open(CHUNKS_FILE, "rb") as f:
        return pickle.load(f)


def extract_date_code(filename: str) -> str:
    """Return the leading date code of a sermon filename.

    '62-0909E In His Presence.pdf' -> '62-0909E'

    BUG FIX: the original indexed ``filename.split()[0]`` unconditionally,
    which raises IndexError on an empty or whitespace-only filename, and it
    only stripped a lowercase '.pdf' suffix.
    """
    parts = filename.split()
    if not parts:
        return ""
    code = parts[0]
    # Strip the extension case-insensitively (covers '62-0909E.pdf' with no title).
    if code.lower().endswith(".pdf"):
        code = code[:-4]
    return code


def messagehub_link(filename: str) -> str:
    """Build a messagehub.info read link from a sermon filename's date code."""
    code = extract_date_code(filename)
    return f"https://www.messagehub.info/en/read.do?ref_num={code}"
import re

# Words too common to carry meaning when matching sermon titles.
STOPWORDS = {
    "the", "a", "an", "of", "in", "on", "at", "and", "to", "for", "with", "by"
}


def normalize_text(text: str) -> str:
    """Lowercase *text*, strip non-alphanumerics, and collapse whitespace."""
    text = text.lower()
    text = re.sub(r"[^a-z0-9\s]", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text


def extract_sermon_title(filename: str) -> str:
    """
    '62-0909E In His Presence.pdf' → 'in his presence'
    """
    name = filename.replace(".pdf", "").replace(".PDF", "")
    parts = name.split(" ", 1)
    # Drop the leading date code (e.g. '62-0909E') when present.
    if len(parts) == 2 and "-" in parts[0]:
        name = parts[1]
    return normalize_text(name)


def tokenize_meaningful(text: str) -> set:
    """Return normalized tokens longer than 2 chars, minus stopwords."""
    return {
        w
        for w in normalize_text(text).split()
        if w not in STOPWORDS and len(w) > 2
    }


def sermon_title_matches(user_query: str, filename: str) -> bool:
    """
    Match only if ALL meaningful title words exist in user query.
    Prevents partial matches like 'presence'.
    """
    title_tokens = tokenize_meaningful(extract_sermon_title(filename))
    query_tokens = tokenize_meaningful(user_query)
    if not title_tokens:
        return False
    return title_tokens.issubset(query_tokens)


# ===============================
# RETRIEVER
# ===============================
class BranhamRetriever(BaseRetriever):
    """
    NotebookLM-style hybrid retriever:
    - local priority
    - semantic fallback
    - series-aware
    - safe + deduplicated
    """

    @staticmethod
    def _add_unique(docs, results: List[Document], seen: set) -> None:
        """Append docs whose first 120 chars of content were not already added."""
        for d in docs:
            key = d.page_content[:120]
            if key not in seen:
                results.append(d)
                seen.add(key)

    def _get_relevant_documents(
        self,
        query: str,
        *,
        run_manager: CallbackManagerForRetrieverRun = None,
    ) -> List[Document]:
        query_clean = normalize(query)
        chunks = load_chunks()
        results: List[Document] = []
        seen: set = set()

        # -------------------------------------------------
        # Detect sermon reference (date code, e.g. '63-0317E')
        # -------------------------------------------------
        explicit_sermon = None
        for token in query.split():
            if "-" in token and len(token) >= 7:
                explicit_sermon = token.upper()
                break

        # -------------------------------------------------
        # Detect a known series mentioned in the query
        # -------------------------------------------------
        target_titles: List[str] = []
        for key, titles in SERIES_GROUPS.items():
            if key in query_clean:
                target_titles = titles
                break

        # -------------------------------------------------
        # SERMON-TARGETED SEARCH
        # -------------------------------------------------
        if explicit_sermon:
            # BUG FIX: the original passed the date code to
            # sermon_title_matches(), which tests title-word subsets and can
            # essentially never match a code like '63-0317E'. Match the
            # normalized code against the normalized source filename instead.
            code = normalize(explicit_sermon)
            self._add_unique(
                (
                    d
                    for d in chunks
                    if code and code in normalize(d.metadata.get("source", ""))
                ),
                results,
                seen,
            )

        # -------------------------------------------------
        # SERIES SEARCH
        # -------------------------------------------------
        elif target_titles:
            # BUG FIX: the original computed target_titles but then ignored it,
            # re-matching the free-form query. Restrict to the canonical
            # series filenames instead.
            wanted = [normalize(t) for t in target_titles]
            self._add_unique(
                (
                    d
                    for d in chunks
                    if any(t in normalize(d.metadata.get("source", "")) for t in wanted)
                ),
                results,
                seen,
            )

        # -------------------------------------------------
        # KEYWORD SEARCH (LOCAL BM25)
        # -------------------------------------------------
        # Guard on `chunks`: BM25Retriever.from_documents crashes on an empty corpus.
        if len(results) < 25 and chunks:
            bm25 = BM25Retriever.from_documents(chunks)
            bm25.k = 60
            self._add_unique(bm25.invoke(query), results, seen)

        # -------------------------------------------------
        # VECTOR SEARCH (PINECONE)
        # -------------------------------------------------
        try:
            embeddings = GoogleGenerativeAIEmbeddings(
                model="models/text-embedding-004"
            )
            store = PineconeVectorStore(
                index_name=INDEX_NAME,
                embedding=embeddings,
            )
            vec_docs = store.as_retriever(search_kwargs={"k": 30}).invoke(query)
            self._add_unique(vec_docs, results, seen)
        except Exception:
            # Deliberate best-effort: a missing API key or unreachable index
            # must not break local retrieval.
            pass

        return results


# ===============================
# PROMPT
# ===============================
PROMPT_TEMPLATE = """
You are William Marrion Branham, speaking carefully as a teacher and evangelist.

RULES:
- You are speaking to only one person
- Be faithful to the sermons provided.
- Do NOT invent doctrine.
- If something is not clearly stated in the text, say so.
- Use calm 1950s preaching tone.
- Be structured and clear.
- Use headings and bullet points.
- Explain symbols plainly.
- Prefer paraphrase, but preserve meaning.
- Avoid citations like (54) or paragraph numbers.
- Ignore tape noise or filler language.
- If a question asks for a sermon summary, summarize only that sermon.
- If the question references the Seven Seals, prioritize the 1963 series.

CONTEXT:
{context_str}

QUESTION:
{question}

ANSWER:
"""

PROMPT = PromptTemplate(
    template=PROMPT_TEMPLATE,
    input_variables=["context_str", "question"],
)


# ===============================
# PUBLIC API
# ===============================
def get_rag_chain():
    """Build the RetrievalQA chain: Gemini LLM + hybrid BranhamRetriever + prompt."""
    llm = ChatGoogleGenerativeAI(
        model="gemini-2.5-flash",
        temperature=0.25,
        convert_system_message_to_human=True,
    )
    retriever = BranhamRetriever()
    chain = RetrievalQA.from_chain_type(
        llm=llm,
        retriever=retriever,
        chain_type="stuff",
        return_source_documents=True,
        chain_type_kwargs={
            "prompt": PROMPT,
            "document_variable_name": "context_str",
        },
        input_key="question",
    )
    return chain


def search_archives(query: str):
    """
    Used by Search mode only.
    Returns (documents, debug_log)
    """
    debug = []
    docs: List[Document] = []
    seen: set = set()
    chunks = load_chunks()
    query_clean = normalize(query)

    # Exact-substring keyword pass over the local chunks.
    for d in chunks:
        if query_clean in d.page_content.lower():
            key = d.page_content[:120]
            if key not in seen:
                docs.append(d)
                seen.add(key)

    debug.append(f"Keyword hits: {len(docs)}")

    # Fallback BM25 when the keyword pass is thin.
    # BUG FIX: guard on `chunks` — BM25 cannot be built from an empty corpus.
    if len(docs) < 20 and chunks:
        bm25 = BM25Retriever.from_documents(chunks)
        bm25.k = 50
        for d in bm25.invoke(query):
            key = d.page_content[:120]
            if key not in seen:
                docs.append(d)
                seen.add(key)

    debug.append(f"Total results: {len(docs)}")
    return docs, debug