Spaces:
Sleeping
Sleeping
| import os | |
| import pickle | |
| from typing import List, Dict, Set | |
| from dotenv import load_dotenv | |
| from langchain_core.documents import Document | |
| from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings | |
| from langchain_pinecone import PineconeVectorStore | |
| from langchain_community.retrievers import BM25Retriever | |
| from langchain.chains import RetrievalQA | |
| from langchain_core.prompts import PromptTemplate | |
| from langchain_core.retrievers import BaseRetriever | |
| from langchain_core.callbacks import CallbackManagerForRetrieverRun | |
| load_dotenv() | |
| # =============================== | |
| # CONFIG | |
| # =============================== | |
| INDEX_NAME = "branham-index" | |
| BASE_DIR = os.path.dirname(os.path.abspath(__file__)) | |
| CHUNKS_FILE = os.path.join(BASE_DIR, "sermon_chunks.pkl") | |
| # =============================== | |
| # CANONICAL SERIES | |
| # =============================== | |
| SEVEN_SEALS_CANON = [ | |
| "63-0317E The Breach Between The Church Ages And The Seven Seals.pdf", | |
| "63-0317M God Hiding Himself In Simplicity, Then Revealing Himself In The Same.pdf", | |
| "63-0318 The First Seal.pdf", | |
| "63-0319 The Second Seal.pdf", | |
| "63-0320 The Third Seal.pdf", | |
| "63-0321 The Fourth Seal.pdf", | |
| "63-0322 The Fifth Seal.pdf", | |
| "63-0323 The Sixth Seal.pdf", | |
| "63-0324E The Seventh Seal.pdf", | |
| "63-0324M Questions And Answers On The Seals.pdf", | |
| ] | |
| SERIES_GROUPS = { | |
| "seven seals": SEVEN_SEALS_CANON, | |
| } | |
| # =============================== | |
| # HELPERS | |
| # =============================== | |
| def normalize(text: str) -> str: | |
| return text.lower().replace("_", " ").replace("-", " ").strip() | |
| def load_chunks() -> List[Document]: | |
| if not os.path.exists(CHUNKS_FILE): | |
| return [] | |
| with open(CHUNKS_FILE, "rb") as f: | |
| return pickle.load(f) | |
| def extract_date_code(filename: str) -> str: | |
| """ | |
| Assumes filenames start with NN-NNNNE | |
| Example: 62-0909E In His Presence.pdf | |
| """ | |
| return filename.split()[0].replace(".pdf", "") | |
| def messagehub_link(filename: str) -> str: | |
| code = extract_date_code(filename) | |
| return f"https://www.messagehub.info/en/read.do?ref_num={code}" | |
| import re | |
| STOPWORDS = { | |
| "the", "a", "an", "of", "in", "on", "at", "and", "to", "for", "with", "by" | |
| } | |
| def normalize_text(text: str) -> str: | |
| text = text.lower() | |
| text = re.sub(r"[^a-z0-9\s]", " ", text) | |
| text = re.sub(r"\s+", " ", text).strip() | |
| return text | |
| def extract_sermon_title(filename: str) -> str: | |
| """ | |
| '62-0909E In His Presence.pdf' → 'in his presence' | |
| """ | |
| name = filename.replace(".pdf", "").replace(".PDF", "") | |
| parts = name.split(" ", 1) | |
| if len(parts) == 2 and "-" in parts[0]: | |
| name = parts[1] | |
| return normalize_text(name) | |
| def tokenize_meaningful(text: str) -> set: | |
| return { | |
| w for w in normalize_text(text).split() | |
| if w not in STOPWORDS and len(w) > 2 | |
| } | |
| def sermon_title_matches(user_query: str, filename: str) -> bool: | |
| """ | |
| Match only if ALL meaningful title words exist in user query. | |
| Prevents partial matches like 'presence'. | |
| """ | |
| title_tokens = tokenize_meaningful(extract_sermon_title(filename)) | |
| query_tokens = tokenize_meaningful(user_query) | |
| if not title_tokens: | |
| return False | |
| return title_tokens.issubset(query_tokens) | |
| # =============================== | |
| # RETRIEVER | |
| # =============================== | |
| class BranhamRetriever(BaseRetriever): | |
| """ | |
| NotebookLM-style hybrid retriever: | |
| - local priority | |
| - semantic fallback | |
| - series-aware | |
| - safe + deduplicated | |
| """ | |
| def _get_relevant_documents( | |
| self, | |
| query: str, | |
| *, | |
| run_manager: CallbackManagerForRetrieverRun = None | |
| ) -> List[Document]: | |
| query_clean = normalize(query) | |
| chunks = load_chunks() | |
| results: List[Document] = [] | |
| seen = set() | |
| # ------------------------------------------------- | |
| # Detect sermon reference (date code) | |
| # ------------------------------------------------- | |
| explicit_sermon = None | |
| for token in query.split(): | |
| if "-" in token and len(token) >= 7: | |
| explicit_sermon = token.upper() | |
| break | |
| # ------------------------------------------------- | |
| # Detect series | |
| # ------------------------------------------------- | |
| target_titles = [] | |
| is_series = False | |
| for key, titles in SERIES_GROUPS.items(): | |
| if key in query_clean: | |
| target_titles = titles | |
| is_series = True | |
| break | |
| # ------------------------------------------------- | |
| # SERMON-TARGETED SEARCH | |
| # ------------------------------------------------- | |
| if explicit_sermon: | |
| for d in chunks: | |
| src = normalize(d.metadata.get("source", "")) | |
| if sermon_title_matches(explicit_sermon, src): | |
| key = d.page_content[:120] | |
| if key not in seen: | |
| results.append(d) | |
| seen.add(key) | |
| # ------------------------------------------------- | |
| # SERIES SEARCH | |
| # ------------------------------------------------- | |
| elif target_titles: | |
| for d in chunks: | |
| src = normalize(d.metadata.get("source", "")) | |
| if sermon_title_matches(query, src): | |
| key = d.page_content[:120] | |
| if key not in seen: | |
| results.append(d) | |
| seen.add(key) | |
| # ------------------------------------------------- | |
| # KEYWORD SEARCH (LOCAL) | |
| # ------------------------------------------------- | |
| if len(results) < 25: | |
| bm25 = BM25Retriever.from_documents(chunks) | |
| bm25.k = 60 | |
| for d in bm25.invoke(query): | |
| key = d.page_content[:120] | |
| if key not in seen: | |
| results.append(d) | |
| seen.add(key) | |
| # ------------------------------------------------- | |
| # VECTOR SEARCH (PINECONE) | |
| # ------------------------------------------------- | |
| try: | |
| embeddings = GoogleGenerativeAIEmbeddings( | |
| model="models/text-embedding-004" | |
| ) | |
| store = PineconeVectorStore( | |
| index_name=INDEX_NAME, | |
| embedding=embeddings | |
| ) | |
| vec_docs = store.as_retriever(search_kwargs={"k": 30}).invoke(query) | |
| for d in vec_docs: | |
| key = d.page_content[:120] | |
| if key not in seen: | |
| results.append(d) | |
| seen.add(key) | |
| except Exception: | |
| pass | |
| return results | |
| # =============================== | |
| # PROMPT | |
| # =============================== | |
| PROMPT_TEMPLATE = """ | |
| You are William Marrion Branham, speaking carefully as a teacher and evangelist. | |
| RULES: | |
| - You are speaking to only one person | |
| - Be faithful to the sermons provided. | |
| - Do NOT invent doctrine. | |
| - If something is not clearly stated in the text, say so. | |
| - Use calm 1950s preaching tone. | |
| - Be structured and clear. | |
| - Use headings and bullet points. | |
| - Explain symbols plainly. | |
| - Prefer paraphrase, but preserve meaning. | |
| - Avoid citations like (54) or paragraph numbers. | |
| - Ignore tape noise or filler language. | |
| - If a question asks for a sermon summary, summarize only that sermon. | |
| - If the question references the Seven Seals, prioritize the 1963 series. | |
| CONTEXT: | |
| {context_str} | |
| QUESTION: | |
| {question} | |
| ANSWER: | |
| """ | |
| PROMPT = PromptTemplate( | |
| template=PROMPT_TEMPLATE, | |
| input_variables=["context_str", "question"], | |
| ) | |
| # =============================== | |
| # PUBLIC API | |
| # =============================== | |
| def get_rag_chain(): | |
| llm = ChatGoogleGenerativeAI( | |
| model="gemini-2.5-flash", | |
| temperature=0.25, | |
| convert_system_message_to_human=True, | |
| ) | |
| retriever = BranhamRetriever() | |
| chain = RetrievalQA.from_chain_type( | |
| llm=llm, | |
| retriever=retriever, | |
| chain_type="stuff", | |
| return_source_documents=True, | |
| chain_type_kwargs={ | |
| "prompt": PROMPT, | |
| "document_variable_name": "context_str", | |
| }, | |
| input_key="question", | |
| ) | |
| return chain | |
| def search_archives(query: str): | |
| """ | |
| Used by Search mode only. | |
| Returns (documents, debug_log) | |
| """ | |
| debug = [] | |
| docs = [] | |
| seen = set() | |
| chunks = load_chunks() | |
| query_clean = normalize(query) | |
| # Keyword search | |
| for d in chunks: | |
| if query_clean in d.page_content.lower(): | |
| key = d.page_content[:120] | |
| if key not in seen: | |
| docs.append(d) | |
| seen.add(key) | |
| debug.append(f"Keyword hits: {len(docs)}") | |
| # Fallback BM25 | |
| if len(docs) < 20: | |
| bm25 = BM25Retriever.from_documents(chunks) | |
| bm25.k = 50 | |
| for d in bm25.invoke(query): | |
| key = d.page_content[:120] | |
| if key not in seen: | |
| docs.append(d) | |
| seen.add(key) | |
| debug.append(f"Total results: {len(docs)}") | |
| return docs, debug | |