Spaces:

Adoption
/

7th_handle

Sleeping

File size: 9,150 Bytes

e660b8f
f7aed17
1c2af68
2911f74
376c4d1
1c2af68
 
e660b8f
1c2af68
2c386d0
1c2af68
f7aed17
 
e660b8f
2911f74
 
1c2af68
 
 
ef9e2a4
71deccb
 
 
f2818c1
1c2af68
 
 
 
ef8ae6c
 
 
 
 
 
 
 
 
 
1c2af68
9fbdd1c
1c2af68
 
 
6469141
1c2af68
 
 
 
 
9fbdd1c
f7aed17
1c2af68
 
 
 
 
f7aed17
1c2af68
 
 
 
 
f7aed17
1c2af68
 
 
 
 
 
 
 
f1a64a1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1c2af68
 
 
 
f7aed17
1c2af68
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f1a64a1
1c2af68
 
 
 
 
 
 
 
 
 
 
44653c3
1c2af68
 
 
 
f1a64a1
1c2af68
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f7aed17
1c2af68
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25c058b
1c2af68
0764c2f
1c2af68
 
 
 
 
 
 
 
df6acdc
1c2af68
 
 
 
 
 
 
 
 
 
 
 
73a13e0
9fbdd1c
f7aed17
4fafa21
1c2af68
 
f555256
f7aed17
 
1c2af68
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2c386d0
f555256
1c2af68
4fafa21
27aff1d
1c2af68
 
 
 
 
27aff1d
1c2af68
f7aed17
1c2af68

import os
import pickle
from typing import List, Dict, Set
from dotenv import load_dotenv

from langchain_core.documents import Document
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from langchain_pinecone import PineconeVectorStore
from langchain_community.retrievers import BM25Retriever
from langchain.chains import RetrievalQA
from langchain_core.prompts import PromptTemplate
from langchain_core.retrievers import BaseRetriever
from langchain_core.callbacks import CallbackManagerForRetrieverRun

# Load variables from a local .env file into the process environment before
# any client is constructed.
# NOTE(review): presumably this supplies the Google / Pinecone API keys —
# confirm against the deployment configuration.
load_dotenv()

# ===============================
# CONFIG
# ===============================
# Name of the Pinecone index passed to PineconeVectorStore by the retriever.
INDEX_NAME = "branham-index"
# Directory containing this module; the chunk pickle is resolved relative to it.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
# Pickle file read by load_chunks().
CHUNKS_FILE = os.path.join(BASE_DIR, "sermon_chunks.pkl")


# ===============================
# CANONICAL SERIES
# ===============================
# File names of the sermons that make up the "seven seals" series group below.
SEVEN_SEALS_CANON = [
    "63-0317E The Breach Between The Church Ages And The Seven Seals.pdf",
    "63-0317M God Hiding Himself In Simplicity, Then Revealing Himself In The Same.pdf",
    "63-0318 The First Seal.pdf",
    "63-0319 The Second Seal.pdf",
    "63-0320 The Third Seal.pdf",
    "63-0321 The Fourth Seal.pdf",
    "63-0322 The Fifth Seal.pdf",
    "63-0323 The Sixth Seal.pdf",
    "63-0324E The Seventh Seal.pdf",
    "63-0324M Questions And Answers On The Seals.pdf",
]

# Maps a lower-case series phrase to its list of file names. The retriever
# looks for each key as a substring of the normalized query.
SERIES_GROUPS = {
    "seven seals": SEVEN_SEALS_CANON,
}

# ===============================
# HELPERS
# ===============================
def normalize(text: str) -> str:
    """Lower-case *text*, turn ``_`` and ``-`` into spaces, and trim both ends."""
    separators_to_spaces = str.maketrans("_-", "  ")
    lowered = text.lower()
    return lowered.translate(separators_to_spaces).strip()


def load_chunks() -> List[Document]:
    """
    Return the object unpickled from CHUNKS_FILE, or an empty list when the
    file does not exist.

    NOTE: unpickling can execute arbitrary code, so CHUNKS_FILE must only ever
    be a file produced by this project.
    """
    if os.path.exists(CHUNKS_FILE):
        with open(CHUNKS_FILE, "rb") as handle:
            documents = pickle.load(handle)
        return documents
    return []


def extract_date_code(filename: str) -> str:
    """
    Return the leading date code of a sermon file name.

    Assumes filenames start with NN-NNNNE
    Example: 62-0909E In His Presence.pdf -> 62-0909E

    The code is the first whitespace-separated word with any ".pdf" / ".PDF"
    extension removed (covers names that consist of the code alone, such as
    "63-0318.pdf"). An empty or whitespace-only name yields "" instead of
    raising IndexError.
    """
    words = filename.split()
    if not words:
        return ""
    # Strip both spellings of the extension, as extract_sermon_title does.
    return words[0].replace(".pdf", "").replace(".PDF", "")


def messagehub_link(filename: str) -> str:
    """Build the messagehub.info reader URL for the sermon named by *filename*."""
    return f"https://www.messagehub.info/en/read.do?ref_num={extract_date_code(filename)}"



import re

# Words that tokenize_meaningful() drops before title/query comparison.
STOPWORDS = {
    "the", "a", "an", "of", "in", "on", "at", "and", "to", "for", "with", "by"
}


def normalize_text(text: str) -> str:
    """
    Lower-case *text*, replace every character other than a-z, 0-9 and
    whitespace with a space, collapse whitespace runs to one space, and trim.
    """
    cleaned = text.lower()
    # First pass blanks out punctuation, second pass squeezes whitespace runs.
    for pattern in (r"[^a-z0-9\s]", r"\s+"):
        cleaned = re.sub(pattern, " ", cleaned)
    return cleaned.strip()


def extract_sermon_title(filename: str) -> str:
    """
    Strip the PDF extension and the leading date code, then normalize.

    '62-0909E In His Presence.pdf' → 'in his presence'
    """
    stem = filename
    for extension in (".pdf", ".PDF"):
        stem = stem.replace(extension, "")

    # A first word that contains a hyphen is taken to be the date code.
    head, space, tail = stem.partition(" ")
    title = tail if space and "-" in head else stem

    return normalize_text(title)


def tokenize_meaningful(text: str) -> set:
    """
    Return the distinct normalized words of *text* that are longer than two
    characters and are not in STOPWORDS.
    """
    tokens = set()
    for word in normalize_text(text).split():
        if len(word) <= 2 or word in STOPWORDS:
            continue
        tokens.add(word)
    return tokens


def sermon_title_matches(user_query: str, filename: str) -> bool:
    """
    Tell whether the query names the sermon in *filename*.

    True only when the title has at least one meaningful word and every
    meaningful title word also occurs in the query, so a single shared word
    such as 'presence' is not enough.
    """
    required = tokenize_meaningful(extract_sermon_title(filename))
    available = tokenize_meaningful(user_query)
    return bool(required) and required <= available

# ===============================
# RETRIEVER
# ===============================
class BranhamRetriever(BaseRetriever):
    """
    NotebookLM-style hybrid retriever:
    - local priority
    - semantic fallback
    - series-aware
    - safe + deduplicated

    Results are collected in this order and de-duplicated on the first 120
    characters of each chunk's text:

    1. Targeted local search: chunks of the sermon whose date code (for
       example ``63-0318``) appears in the query; otherwise, chunks of a
       series named in the query (see ``SERIES_GROUPS``).
    2. Local BM25 keyword search, only when fewer than 25 chunks were found.
    3. Pinecone vector search (best effort: failures are logged, not raised).
    """

    @staticmethod
    def _source_name(doc: Document) -> str:
        """Return the file-name part of the chunk's ``source`` metadata ('' if absent)."""
        source = doc.metadata.get("source") or ""
        # Accept both slash styles so a stored path still yields the file name.
        return re.split(r"[\\/]", str(source))[-1]

    @staticmethod
    def _date_code(name: str) -> str:
        """Return the upper-cased leading NN-NNNN[letter] code of *name*, or ''."""
        found = re.match(r"\s*(\d{2}-\d{4}[A-Za-z]?)(?![0-9A-Za-z])", name)
        return found.group(1).upper() if found else ""

    @staticmethod
    def _series_key(name: str) -> str:
        """Normalize a file name for series membership tests (extension dropped)."""
        return re.sub(r"\.pdf$", "", normalize(name))

    @staticmethod
    def _extend_unique(docs, results: List[Document], seen: set) -> None:
        """Append each document of *docs* whose 120-character prefix is not in *seen*."""
        for d in docs:
            key = d.page_content[:120]
            if key not in seen:
                results.append(d)
                seen.add(key)

    def _get_relevant_documents(
        self,
        query: str,
        *,
        run_manager: CallbackManagerForRetrieverRun = None
    ) -> List[Document]:
        """
        Collect the documents relevant to *query*.

        Args:
            query: The user's question.
            run_manager: Callback manager supplied by the caller; not used here.

        Returns:
            De-duplicated documents: targeted local hits first, then BM25
            hits, then vector-store hits.
        """
        query_clean = normalize(query)
        chunks = load_chunks()
        results: List[Document] = []
        seen = set()

        # -------------------------------------------------
        # Detect sermon reference (date code)
        # -------------------------------------------------
        # Only a real NN-NNNN[letter] code counts; an ordinary hyphenated word
        # such as "well-known" must not be mistaken for a sermon reference.
        code_match = re.search(
            r"(?<![0-9A-Za-z])\d{2}-\d{4}[A-Za-z]?(?![0-9A-Za-z])", query
        )
        explicit_sermon = code_match.group(0).upper() if code_match else None

        # -------------------------------------------------
        # Detect series
        # -------------------------------------------------
        target_titles = []
        for key, titles in SERIES_GROUPS.items():
            if key in query_clean:
                target_titles = titles
                break

        # -------------------------------------------------
        # SERMON-TARGETED SEARCH
        # -------------------------------------------------
        if explicit_sermon:
            # Compare date code with date code; comparing the code against
            # title words can never succeed.
            self._extend_unique(
                (
                    d for d in chunks
                    if self._date_code(self._source_name(d)) == explicit_sermon
                ),
                results,
                seen,
            )

        # -------------------------------------------------
        # SERIES SEARCH
        # -------------------------------------------------
        elif target_titles:
            wanted = {self._series_key(t) for t in target_titles}
            series_docs = [
                d for d in chunks
                if self._series_key(self._source_name(d)) in wanted
            ]

            # When the query also spells out one sermon title of the series,
            # keep only that sermon. The un-normalized file name is passed so
            # the hyphenated date prefix can still be recognised and removed.
            title_hit = {}
            named = []
            for d in series_docs:
                name = self._source_name(d)
                if name not in title_hit:
                    title_hit[name] = sermon_title_matches(
                        query, name.replace("_", " ")
                    )
                if title_hit[name]:
                    named.append(d)

            self._extend_unique(named or series_docs, results, seen)

        # -------------------------------------------------
        # KEYWORD SEARCH (LOCAL)
        # -------------------------------------------------
        # Skipped when there are no chunks: a BM25 index cannot be built from
        # an empty corpus.
        if chunks and len(results) < 25:
            bm25 = BM25Retriever.from_documents(chunks)
            bm25.k = 60
            self._extend_unique(bm25.invoke(query), results, seen)

        # -------------------------------------------------
        # VECTOR SEARCH (PINECONE)
        # -------------------------------------------------
        try:
            embeddings = GoogleGenerativeAIEmbeddings(
                model="models/text-embedding-004"
            )
            store = PineconeVectorStore(
                index_name=INDEX_NAME,
                embedding=embeddings
            )

            vec_docs = store.as_retriever(search_kwargs={"k": 30}).invoke(query)
            self._extend_unique(vec_docs, results, seen)

        except Exception:
            # Vector search stays best-effort, but the failure is recorded so
            # a missing key or unreachable index does not go unnoticed.
            import logging

            logging.getLogger(__name__).warning(
                "Vector search failed; returning local results only",
                exc_info=True,
            )

        return results


# ===============================
# PROMPT
# ===============================
# Prompt text sent to the LLM. It has two placeholders: {context_str}, which
# get_rag_chain() names as the document variable of the "stuff" chain, and
# {question}, which is the chain's input key.
PROMPT_TEMPLATE = """
You are William Marrion Branham, speaking carefully as a teacher and evangelist.

RULES:
- You are speaking to only one person
- Be faithful to the sermons provided.
- Do NOT invent doctrine.
- If something is not clearly stated in the text, say so.
- Use calm 1950s preaching tone.
- Be structured and clear.
- Use headings and bullet points.
- Explain symbols plainly.
- Prefer paraphrase, but preserve meaning.
- Avoid citations like (54) or paragraph numbers.
- Ignore tape noise or filler language.
- If a question asks for a sermon summary, summarize only that sermon.
- If the question references the Seven Seals, prioritize the 1963 series.

CONTEXT:
{context_str}

QUESTION:
{question}

ANSWER:
"""

# PromptTemplate wrapper around PROMPT_TEMPLATE; the declared input variables
# must match the placeholders used in the template text.
PROMPT = PromptTemplate(
    template=PROMPT_TEMPLATE,
    input_variables=["context_str", "question"],
)

# ===============================
# PUBLIC API
# ===============================
def get_rag_chain():
    """
    Build and return the RetrievalQA chain used to answer questions.

    The chain pairs a Gemini chat model with BranhamRetriever, fills PROMPT
    with the retrieved documents ("stuff" chain type), takes its input under
    the "question" key, and returns the source documents with the answer.
    """
    return RetrievalQA.from_chain_type(
        llm=ChatGoogleGenerativeAI(
            model="gemini-2.5-flash",
            temperature=0.25,
            convert_system_message_to_human=True,
        ),
        retriever=BranhamRetriever(),
        chain_type="stuff",
        return_source_documents=True,
        chain_type_kwargs={
            "prompt": PROMPT,
            "document_variable_name": "context_str",
        },
        input_key="question",
    )


def search_archives(query: str):
    """
    Used by Search mode only.

    Looks for the normalized query as a case-insensitive substring of each
    local chunk, then tops the list up with BM25 results when fewer than 20
    chunks matched. Results are de-duplicated on the first 120 characters of
    their text. A blank query matches nothing, and a missing or empty chunk
    file yields no results instead of an error.

    Returns (documents, debug_log), where debug_log is a list of progress
    strings.
    """
    debug = []
    docs = []
    seen = set()

    def add_unique(candidates):
        # Keep the first chunk seen for each 120-character prefix.
        for d in candidates:
            key = d.page_content[:120]
            if key not in seen:
                docs.append(d)
                seen.add(key)

    chunks = load_chunks()
    query_clean = normalize(query)

    # Keyword search
    # An empty string is a substring of every chunk, so a blank query would
    # otherwise return the entire archive.
    if query_clean:
        add_unique(d for d in chunks if query_clean in d.page_content.lower())

    debug.append(f"Keyword hits: {len(docs)}")

    # Fallback BM25
    # Needs a non-empty corpus to build the index and a non-blank query to rank.
    if chunks and query_clean and len(docs) < 20:
        bm25 = BM25Retriever.from_documents(chunks)
        bm25.k = 50
        add_unique(bm25.invoke(query))

    debug.append(f"Total results: {len(docs)}")

    return docs, debug