Spaces:

Adoption
/

7th_handle

Sleeping

App Files Files Community

7th_handle / src /app.py

Adoption

Update src/app.py

df6acdc verified about 1 month ago

raw

history blame contribute delete

9.15 kB

	import os
	import pickle
	from typing import List, Dict, Set
	from dotenv import load_dotenv

	from langchain_core.documents import Document
	from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
	from langchain_pinecone import PineconeVectorStore
	from langchain_community.retrievers import BM25Retriever
	from langchain.chains import RetrievalQA
	from langchain_core.prompts import PromptTemplate
	from langchain_core.retrievers import BaseRetriever
	from langchain_core.callbacks import CallbackManagerForRetrieverRun

	load_dotenv()

	# ===============================
	# CONFIG
	# ===============================
	INDEX_NAME = "branham-index"
	BASE_DIR = os.path.dirname(os.path.abspath(__file__))
	CHUNKS_FILE = os.path.join(BASE_DIR, "sermon_chunks.pkl")


	# ===============================
	# CANONICAL SERIES
	# ===============================
	SEVEN_SEALS_CANON = [
	"63-0317E The Breach Between The Church Ages And The Seven Seals.pdf",
	"63-0317M God Hiding Himself In Simplicity, Then Revealing Himself In The Same.pdf",
	"63-0318 The First Seal.pdf",
	"63-0319 The Second Seal.pdf",
	"63-0320 The Third Seal.pdf",
	"63-0321 The Fourth Seal.pdf",
	"63-0322 The Fifth Seal.pdf",
	"63-0323 The Sixth Seal.pdf",
	"63-0324E The Seventh Seal.pdf",
	"63-0324M Questions And Answers On The Seals.pdf",
	]

	SERIES_GROUPS = {
	"seven seals": SEVEN_SEALS_CANON,
	}

	# ===============================
	# HELPERS
	# ===============================
	def normalize(text: str) -> str:
	return text.lower().replace("_", " ").replace("-", " ").strip()


	def load_chunks() -> List[Document]:
	if not os.path.exists(CHUNKS_FILE):
	return []
	with open(CHUNKS_FILE, "rb") as f:
	return pickle.load(f)


	def extract_date_code(filename: str) -> str:
	"""
	Assumes filenames start with NN-NNNNE
	Example: 62-0909E In His Presence.pdf
	"""
	return filename.split()[0].replace(".pdf", "")


	def messagehub_link(filename: str) -> str:
	code = extract_date_code(filename)
	return f"https://www.messagehub.info/en/read.do?ref_num={code}"



	import re

	STOPWORDS = {
	"the", "a", "an", "of", "in", "on", "at", "and", "to", "for", "with", "by"
	}


	def normalize_text(text: str) -> str:
	text = text.lower()
	text = re.sub(r"[^a-z0-9\s]", " ", text)
	text = re.sub(r"\s+", " ", text).strip()
	return text


	def extract_sermon_title(filename: str) -> str:
	"""
	'62-0909E In His Presence.pdf' → 'in his presence'
	"""
	name = filename.replace(".pdf", "").replace(".PDF", "")

	parts = name.split(" ", 1)
	if len(parts) == 2 and "-" in parts[0]:
	name = parts[1]

	return normalize_text(name)


	def tokenize_meaningful(text: str) -> set:
	return {
	w for w in normalize_text(text).split()
	if w not in STOPWORDS and len(w) > 2
	}


	def sermon_title_matches(user_query: str, filename: str) -> bool:
	"""
	Match only if ALL meaningful title words exist in user query.
	Prevents partial matches like 'presence'.
	"""
	title_tokens = tokenize_meaningful(extract_sermon_title(filename))
	query_tokens = tokenize_meaningful(user_query)

	if not title_tokens:
	return False

	return title_tokens.issubset(query_tokens)

	# ===============================
	# RETRIEVER
	# ===============================
	class BranhamRetriever(BaseRetriever):
	"""
	NotebookLM-style hybrid retriever:
	- local priority
	- semantic fallback
	- series-aware
	- safe + deduplicated
	"""

	def _get_relevant_documents(
	self,
	query: str,
	*,
	run_manager: CallbackManagerForRetrieverRun = None
	) -> List[Document]:

	query_clean = normalize(query)
	chunks = load_chunks()
	results: List[Document] = []
	seen = set()

	# -------------------------------------------------
	# Detect sermon reference (date code)
	# -------------------------------------------------
	explicit_sermon = None
	for token in query.split():
	if "-" in token and len(token) >= 7:
	explicit_sermon = token.upper()
	break

	# -------------------------------------------------
	# Detect series
	# -------------------------------------------------
	target_titles = []
	is_series = False

	for key, titles in SERIES_GROUPS.items():
	if key in query_clean:
	target_titles = titles
	is_series = True
	break

	# -------------------------------------------------
	# SERMON-TARGETED SEARCH
	# -------------------------------------------------
	if explicit_sermon:
	for d in chunks:
	src = normalize(d.metadata.get("source", ""))
	if sermon_title_matches(explicit_sermon, src):
	key = d.page_content[:120]
	if key not in seen:
	results.append(d)
	seen.add(key)

	# -------------------------------------------------
	# SERIES SEARCH
	# -------------------------------------------------
	elif target_titles:
	for d in chunks:
	src = normalize(d.metadata.get("source", ""))
	if sermon_title_matches(query, src):
	key = d.page_content[:120]
	if key not in seen:
	results.append(d)
	seen.add(key)

	# -------------------------------------------------
	# KEYWORD SEARCH (LOCAL)
	# -------------------------------------------------
	if len(results) < 25:
	bm25 = BM25Retriever.from_documents(chunks)
	bm25.k = 60
	for d in bm25.invoke(query):
	key = d.page_content[:120]
	if key not in seen:
	results.append(d)
	seen.add(key)

	# -------------------------------------------------
	# VECTOR SEARCH (PINECONE)
	# -------------------------------------------------
	try:
	embeddings = GoogleGenerativeAIEmbeddings(
	model="models/text-embedding-004"
	)
	store = PineconeVectorStore(
	index_name=INDEX_NAME,
	embedding=embeddings
	)

	vec_docs = store.as_retriever(search_kwargs={"k": 30}).invoke(query)
	for d in vec_docs:
	key = d.page_content[:120]
	if key not in seen:
	results.append(d)
	seen.add(key)

	except Exception:
	pass

	return results


	# ===============================
	# PROMPT
	# ===============================
	PROMPT_TEMPLATE = """
	You are William Marrion Branham, speaking carefully as a teacher and evangelist.

	RULES:
	- You are speaking to only one person
	- Be faithful to the sermons provided.
	- Do NOT invent doctrine.
	- If something is not clearly stated in the text, say so.
	- Use calm 1950s preaching tone.
	- Be structured and clear.
	- Use headings and bullet points.
	- Explain symbols plainly.
	- Prefer paraphrase, but preserve meaning.
	- Avoid citations like (54) or paragraph numbers.
	- Ignore tape noise or filler language.
	- If a question asks for a sermon summary, summarize only that sermon.
	- If the question references the Seven Seals, prioritize the 1963 series.

	CONTEXT:
	{context_str}

	QUESTION:
	{question}

	ANSWER:
	"""

	PROMPT = PromptTemplate(
	template=PROMPT_TEMPLATE,
	input_variables=["context_str", "question"],
	)

	# ===============================
	# PUBLIC API
	# ===============================
	def get_rag_chain():
	llm = ChatGoogleGenerativeAI(
	model="gemini-2.5-flash",
	temperature=0.25,
	convert_system_message_to_human=True,
	)

	retriever = BranhamRetriever()

	chain = RetrievalQA.from_chain_type(
	llm=llm,
	retriever=retriever,
	chain_type="stuff",
	return_source_documents=True,
	chain_type_kwargs={
	"prompt": PROMPT,
	"document_variable_name": "context_str",
	},
	input_key="question",
	)

	return chain


	def search_archives(query: str):
	"""
	Used by Search mode only.
	Returns (documents, debug_log)
	"""
	debug = []
	docs = []
	seen = set()

	chunks = load_chunks()
	query_clean = normalize(query)

	# Keyword search
	for d in chunks:
	if query_clean in d.page_content.lower():
	key = d.page_content[:120]
	if key not in seen:
	docs.append(d)
	seen.add(key)

	debug.append(f"Keyword hits: {len(docs)}")

	# Fallback BM25
	if len(docs) < 20:
	bm25 = BM25Retriever.from_documents(chunks)
	bm25.k = 50
	for d in bm25.invoke(query):
	key = d.page_content[:120]
	if key not in seen:
	docs.append(d)
	seen.add(key)

	debug.append(f"Total results: {len(docs)}")

	return docs, debug