# 7th_handle / src /app.py
# Adoption's picture
# Update src/app.py
# df6acdc verified
import os
import pickle
from typing import List, Dict, Set
from dotenv import load_dotenv
from langchain_core.documents import Document
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from langchain_pinecone import PineconeVectorStore
from langchain_community.retrievers import BM25Retriever
from langchain.chains import RetrievalQA
from langchain_core.prompts import PromptTemplate
from langchain_core.retrievers import BaseRetriever
from langchain_core.callbacks import CallbackManagerForRetrieverRun
# Read a local .env file (if any) into the process environment at import time.
# NOTE(review): presumably this is where the Google and Pinecone credentials
# come from — confirm against the deployment configuration.
load_dotenv()
# ===============================
# CONFIG
# ===============================
# Name of the Pinecone index used by the vector-search step.
INDEX_NAME = "branham-index"

# Absolute directory that contains this module.
BASE_DIR = os.path.split(os.path.abspath(__file__))[0]

# Pickle file with the pre-chunked sermon documents, stored next to this module.
CHUNKS_FILE = os.path.join(BASE_DIR, "sermon_chunks.pkl")
# ===============================
# CANONICAL SERIES
# ===============================
# PDF filenames that make up the "seven seals" series group.
SEVEN_SEALS_CANON: List[str] = [
    "63-0317E The Breach Between The Church Ages And The Seven Seals.pdf",
    "63-0317M God Hiding Himself In Simplicity, Then Revealing Himself In The Same.pdf",
    "63-0318 The First Seal.pdf",
    "63-0319 The Second Seal.pdf",
    "63-0320 The Third Seal.pdf",
    "63-0321 The Fourth Seal.pdf",
    "63-0322 The Fifth Seal.pdf",
    "63-0323 The Sixth Seal.pdf",
    "63-0324E The Seventh Seal.pdf",
    "63-0324M Questions And Answers On The Seals.pdf",
]

# Series lookup table: lowercase phrase searched for in the normalized query
# -> list of filenames belonging to that series.
SERIES_GROUPS: Dict[str, List[str]] = {
    "seven seals": SEVEN_SEALS_CANON,
}
# ===============================
# HELPERS
# ===============================
def normalize(text: str) -> str:
    """Lowercase *text*, turn underscores and hyphens into spaces, trim both ends."""
    separators_to_space = str.maketrans("_-", "  ")
    lowered = text.lower()
    return lowered.translate(separators_to_space).strip()
def load_chunks() -> List[Document]:
    """
    Load the pre-chunked sermon documents from ``CHUNKS_FILE``.

    Returns:
        The object unpickled from the file (a list of ``Document`` according
        to the annotation), or an empty list when the file does not exist.
    """
    # SECURITY: pickle.load can execute arbitrary code embedded in the file.
    # CHUNKS_FILE must only ever be a trusted, locally generated artifact.
    try:
        # Open directly instead of checking os.path.exists() first, so a file
        # removed between the check and the open cannot raise.
        handle = open(CHUNKS_FILE, "rb")
    except FileNotFoundError:
        return []
    with handle:
        return pickle.load(handle)
def extract_date_code(filename: str) -> str:
    """
    Return the leading date code of a sermon filename.

    Assumes filenames start with a code shaped like NN-NNNNE followed by
    whitespace.
    Example: '62-0909E In His Presence.pdf' -> '62-0909E'

    When the whole filename is just the code plus extension ('63-0318.pdf' or
    '63-0318.PDF'), the extension is removed. Empty or whitespace-only input
    returns '' instead of raising IndexError.
    """
    tokens = filename.split()
    if not tokens:
        return ""
    # Strip both spellings of the extension, as extract_sermon_title does.
    return tokens[0].replace(".pdf", "").replace(".PDF", "")
def messagehub_link(filename: str) -> str:
    """Build the messagehub.info reader URL for the sermon named by *filename*.

    The sermon's date code (see ``extract_date_code``) is used as the
    ``ref_num`` query parameter.
    """
    ref_num = extract_date_code(filename)
    return f"https://www.messagehub.info/en/read.do?ref_num={ref_num}"
import re
# Function words that tokenize_meaningful() discards before comparing a
# sermon title with a user query.
STOPWORDS = set((
    "the", "a", "an",
    "of", "in", "on", "at",
    "and", "to", "for", "with", "by",
))
def normalize_text(text: str) -> str:
    """Reduce *text* to lowercase a-z / 0-9 words separated by single spaces.

    Every character other than a-z, 0-9 or whitespace (after lowercasing) is
    replaced by a space; whitespace runs are then collapsed and the ends
    trimmed.
    """
    words_only = re.sub(r"[^a-z0-9\s]", " ", text.lower())
    collapsed = re.sub(r"\s+", " ", words_only)
    return collapsed.strip()
def extract_sermon_title(filename: str) -> str:
    """
    Return the normalized sermon title contained in *filename*.

    '62-0909E In His Presence.pdf' -> 'in his presence'

    The '.pdf' / '.PDF' extension is removed. If the text before the first
    space contains a hyphen it is treated as the date code and dropped.
    """
    stem = filename.replace(".pdf", "").replace(".PDF", "")
    first_word, space, remainder = stem.partition(" ")
    # Only drop the first word when a space was found and it looks like a
    # date code (contains a hyphen).
    if space and "-" in first_word:
        stem = remainder
    return normalize_text(stem)
def tokenize_meaningful(text: str) -> set:
    """Return the set of normalized words in *text* that carry meaning.

    Words of one or two characters and words listed in STOPWORDS are dropped.
    """
    kept = set()
    for word in normalize_text(text).split():
        if len(word) <= 2 or word in STOPWORDS:
            continue
        kept.add(word)
    return kept
def sermon_title_matches(user_query: str, filename: str) -> bool:
    """
    Tell whether *user_query* names the sermon stored in *filename*.

    True only when the title has at least one meaningful word and every
    meaningful title word also occurs in the query, so a query sharing just
    one word with a title (e.g. 'presence') does not match.
    """
    wanted = tokenize_meaningful(extract_sermon_title(filename))
    offered = tokenize_meaningful(user_query)
    # A title with no meaningful words never matches.
    return bool(wanted) and wanted <= offered
# ===============================
# RETRIEVER
# ===============================
class BranhamRetriever(BaseRetriever):
    """
    Hybrid retriever that merges three sources, in priority order:

    1. Targeted local chunks: chunks of the sermon whose date code
       (e.g. ``63-0318``) appears in the query or, when no date code is
       present, chunks of a series from ``SERIES_GROUPS`` whose key appears
       in the query.
    2. Local BM25 keyword search over all chunks, run only when step 1
       produced fewer than 25 chunks.
    3. Pinecone vector search (best effort: failures are logged, not raised).

    Results keep that order and are de-duplicated on the first 120 characters
    of ``page_content``.
    """

    @staticmethod
    def _add_unique(docs, results: List[Document], seen: set) -> None:
        """Append to *results* every doc whose 120-char prefix is not in *seen*."""
        for d in docs:
            key = d.page_content[:120]
            if key not in seen:
                results.append(d)
                seen.add(key)

    @staticmethod
    def _source_filename(doc: Document) -> str:
        """Return the last path component of the chunk's ``source`` metadata."""
        # NOTE(review): assumes metadata["source"] is the PDF filename or a
        # path ending in it — confirm against the ingestion script.
        source = str(doc.metadata.get("source", "") or "")
        return os.path.basename(source.replace("\\", "/"))

    def _get_relevant_documents(
        self,
        query: str,
        *,
        run_manager: CallbackManagerForRetrieverRun = None
    ) -> List[Document]:
        """
        Collect documents relevant to *query* from the local chunks and Pinecone.

        Args:
            query: The user's question.
            run_manager: Callback manager supplied by LangChain (unused here).

        Returns:
            De-duplicated documents: targeted sermon/series chunks first, then
            BM25 hits, then vector-search hits.
        """
        import logging  # local import: only the vector-search failure path uses it

        query_clean = normalize(query)
        chunks = load_chunks()
        results: List[Document] = []
        seen: set = set()

        # -------------------------------------------------
        # Detect sermon reference (date code)
        # -------------------------------------------------
        # Require the NN-NNNN[letter] shape; a bare "has a hyphen and is at
        # least 7 characters" test also fires on words such as "well-known".
        code_match = re.search(r"\b\d{2}-\d{4}[A-Za-z]?\b", query)
        explicit_sermon = code_match.group(0).upper() if code_match else None

        # -------------------------------------------------
        # Detect series
        # -------------------------------------------------
        target_titles: List[str] = []
        for key, titles in SERIES_GROUPS.items():
            if key in query_clean:
                target_titles = titles
                break

        # -------------------------------------------------
        # SERMON-TARGETED SEARCH
        # -------------------------------------------------
        if explicit_sermon:
            # Compare date codes directly. A code without the trailing letter
            # (e.g. "63-0317") matches every sermon sharing that prefix.
            targeted = [
                d for d in chunks
                if self._source_filename(d).upper().startswith(explicit_sermon)
            ]
            self._add_unique(targeted, results, seen)

        # -------------------------------------------------
        # SERIES SEARCH
        # -------------------------------------------------
        elif target_titles:
            # Keep the chunks whose source file is one of the series' files;
            # normalize() makes the comparison case- and separator-insensitive.
            # NOTE(review): this can return every chunk of the series, all of
            # which the "stuff" chain puts into one prompt — confirm the model
            # context is large enough.
            wanted = {normalize(title) for title in target_titles}
            targeted = [
                d for d in chunks
                if normalize(self._source_filename(d)) in wanted
            ]
            self._add_unique(targeted, results, seen)

        # -------------------------------------------------
        # KEYWORD SEARCH (LOCAL)
        # -------------------------------------------------
        # Skip BM25 when there are no chunks (missing chunk file): the index
        # cannot be built from an empty corpus.
        if chunks and len(results) < 25:
            bm25 = BM25Retriever.from_documents(chunks)
            bm25.k = 60
            self._add_unique(bm25.invoke(query), results, seen)

        # -------------------------------------------------
        # VECTOR SEARCH (PINECONE)
        # -------------------------------------------------
        try:
            embeddings = GoogleGenerativeAIEmbeddings(
                model="models/text-embedding-004"
            )
            store = PineconeVectorStore(
                index_name=INDEX_NAME,
                embedding=embeddings
            )
            vec_docs = store.as_retriever(search_kwargs={"k": 30}).invoke(query)
        except Exception:
            # Best effort: keep the local results, but leave a trace instead
            # of silently hiding configuration or network failures.
            logging.getLogger(__name__).warning(
                "Vector search failed; returning local results only",
                exc_info=True,
            )
        else:
            self._add_unique(vec_docs, results, seen)

        return results
# ===============================
# PROMPT
# ===============================
# Prompt text for the question-answering chain. It has two placeholders:
#   {context_str} - the retrieved documents (get_rag_chain sets
#                   document_variable_name="context_str")
#   {question}    - the user's question (get_rag_chain sets input_key="question")
# The wording is runtime text sent to the model; edit it deliberately.
PROMPT_TEMPLATE = """
You are William Marrion Branham, speaking carefully as a teacher and evangelist.
RULES:
- You are speaking to only one person
- Be faithful to the sermons provided.
- Do NOT invent doctrine.
- If something is not clearly stated in the text, say so.
- Use calm 1950s preaching tone.
- Be structured and clear.
- Use headings and bullet points.
- Explain symbols plainly.
- Prefer paraphrase, but preserve meaning.
- Avoid citations like (54) or paragraph numbers.
- Ignore tape noise or filler language.
- If a question asks for a sermon summary, summarize only that sermon.
- If the question references the Seven Seals, prioritize the 1963 series.
CONTEXT:
{context_str}
QUESTION:
{question}
ANSWER:
"""
# PromptTemplate wrapper declaring the two variables used in the text above.
PROMPT = PromptTemplate(
template=PROMPT_TEMPLATE,
input_variables=["context_str", "question"],
)
# ===============================
# PUBLIC API
# ===============================
def get_rag_chain():
    """Assemble and return the RetrievalQA chain used to answer questions.

    The chain uses the Gemini chat model, a BranhamRetriever, the "stuff"
    chain type with PROMPT (documents bound to ``context_str``), takes its
    input under the ``question`` key and also returns the source documents.
    """
    prompt_options = {
        "prompt": PROMPT,
        "document_variable_name": "context_str",
    }
    model = ChatGoogleGenerativeAI(
        model="gemini-2.5-flash",
        temperature=0.25,
        convert_system_message_to_human=True,
    )
    return RetrievalQA.from_chain_type(
        llm=model,
        retriever=BranhamRetriever(),
        chain_type="stuff",
        return_source_documents=True,
        chain_type_kwargs=prompt_options,
        input_key="question",
    )
def search_archives(query: str):
    """
    Search the local sermon chunks for *query*. Used by Search mode only.

    Chunks containing the normalized query as a substring are collected
    first. When fewer than 20 are found, BM25 results (k=50) are appended.
    Results are de-duplicated on the first 120 characters of their content.

    An empty (or separator-only) query returns no documents, and a missing
    chunk file yields an empty result instead of an error.

    Returns:
        (documents, debug_log) where debug_log is a list with the
        "Keyword hits" and "Total results" counts.
    """
    debug = []
    docs = []
    seen = set()

    def add_unique(candidates):
        # Append candidates whose 120-char content prefix has not been seen.
        for d in candidates:
            key = d.page_content[:120]
            if key not in seen:
                docs.append(d)
                seen.add(key)

    chunks = load_chunks()
    query_clean = normalize(query)

    # Keyword search.
    # The query had hyphens/underscores turned into spaces, so the chunk text
    # must be normalized the same way or hyphenated phrases never match.
    # An empty query is skipped: "" is a substring of every chunk and would
    # return the whole archive.
    if query_clean:
        add_unique(
            d for d in chunks
            if query_clean in normalize(d.page_content)
        )
    debug.append(f"Keyword hits: {len(docs)}")

    # Fallback BM25 (cannot be built from an empty corpus, so require chunks).
    if query_clean and chunks and len(docs) < 20:
        bm25 = BM25Retriever.from_documents(chunks)
        bm25.k = 50
        add_unique(bm25.invoke(query))
    debug.append(f"Total results: {len(docs)}")
    return docs, debug