File size: 9,150 Bytes
e660b8f
f7aed17
1c2af68
2911f74
376c4d1
1c2af68
 
e660b8f
1c2af68
2c386d0
1c2af68
f7aed17
 
e660b8f
2911f74
 
1c2af68
 
 
ef9e2a4
71deccb
 
 
f2818c1
1c2af68
 
 
 
ef8ae6c
 
 
 
 
 
 
 
 
 
1c2af68
9fbdd1c
1c2af68
 
 
6469141
1c2af68
 
 
 
 
9fbdd1c
f7aed17
1c2af68
 
 
 
 
f7aed17
1c2af68
 
 
 
 
f7aed17
1c2af68
 
 
 
 
 
 
 
f1a64a1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1c2af68
 
 
 
f7aed17
1c2af68
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f1a64a1
1c2af68
 
 
 
 
 
 
 
 
 
 
44653c3
1c2af68
 
 
 
f1a64a1
1c2af68
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f7aed17
1c2af68
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25c058b
1c2af68
0764c2f
1c2af68
 
 
 
 
 
 
 
df6acdc
1c2af68
 
 
 
 
 
 
 
 
 
 
 
73a13e0
9fbdd1c
f7aed17
4fafa21
1c2af68
 
f555256
f7aed17
 
1c2af68
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2c386d0
f555256
1c2af68
4fafa21
27aff1d
1c2af68
 
 
 
 
27aff1d
1c2af68
f7aed17
1c2af68
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
import os
import pickle
from typing import List, Dict, Set
from dotenv import load_dotenv

from langchain_core.documents import Document
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from langchain_pinecone import PineconeVectorStore
from langchain_community.retrievers import BM25Retriever
from langchain.chains import RetrievalQA
from langchain_core.prompts import PromptTemplate
from langchain_core.retrievers import BaseRetriever
from langchain_core.callbacks import CallbackManagerForRetrieverRun

# Load API keys (GOOGLE_API_KEY, PINECONE_API_KEY, ...) from a local .env file.
load_dotenv()

# ===============================
# CONFIG
# ===============================
# Name of the Pinecone index holding the sermon embeddings.
INDEX_NAME = "branham-index"
# Resolve paths relative to this file so the module works from any CWD.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
# Pickled cache of pre-chunked sermon Documents (read by load_chunks()).
CHUNKS_FILE = os.path.join(BASE_DIR, "sermon_chunks.pkl")


# ===============================
# CANONICAL SERIES
# ===============================
# The ten sermons of the 1963 Seven Seals series, in preaching order.
# These strings are expected to correspond to the `source` metadata stored
# on the chunks (presumably the original PDF filenames — verify against
# the ingestion script).
SEVEN_SEALS_CANON = [
    "63-0317E The Breach Between The Church Ages And The Seven Seals.pdf",
    "63-0317M God Hiding Himself In Simplicity, Then Revealing Himself In The Same.pdf",
    "63-0318 The First Seal.pdf",
    "63-0319 The Second Seal.pdf",
    "63-0320 The Third Seal.pdf",
    "63-0321 The Fourth Seal.pdf",
    "63-0322 The Fifth Seal.pdf",
    "63-0323 The Sixth Seal.pdf",
    "63-0324E The Seventh Seal.pdf",
    "63-0324M Questions And Answers On The Seals.pdf",
]

# Maps a lowercase series phrase (searched for in the normalized user query)
# to the canonical list of sermon filenames in that series.
SERIES_GROUPS = {
    "seven seals": SEVEN_SEALS_CANON,
}

# ===============================
# HELPERS
# ===============================
def normalize(text: str) -> str:
    """Lowercase *text*, treat underscores and hyphens as spaces, and trim."""
    lowered = text.lower()
    for separator in ("_", "-"):
        lowered = lowered.replace(separator, " ")
    return lowered.strip()


def load_chunks() -> List[Document]:
    """Load the pickled sermon chunks; return [] when the cache is absent."""
    try:
        # NOTE: pickle is only safe because this cache is produced locally.
        with open(CHUNKS_FILE, "rb") as fh:
            return pickle.load(fh)
    except FileNotFoundError:
        return []


def extract_date_code(filename: str) -> str:
    """
    Extract the leading sermon date code from a PDF filename.

    Assumes filenames start with a code like ``NN-NNNN`` or ``NN-NNNNE``,
    e.g. ``62-0909E In His Presence.pdf`` -> ``62-0909E``.

    Returns "" for empty or whitespace-only input instead of raising
    IndexError (the previous version crashed on such input).
    """
    tokens = filename.split()
    if not tokens:
        return ""
    return tokens[0].replace(".pdf", "")


def messagehub_link(filename: str) -> str:
    """Return the messagehub.info reader URL for a sermon PDF filename."""
    ref_num = extract_date_code(filename)
    return "https://www.messagehub.info/en/read.do?ref_num=" + ref_num



import re

# Common function words ignored when matching sermon titles against queries.
STOPWORDS = {
    "the", "a", "an", "of", "in", "on", "at", "and", "to", "for", "with", "by"
}


def normalize_text(text: str) -> str:
    """Lowercase, replace non-alphanumerics with spaces, collapse whitespace."""
    cleaned = re.sub(r"[^a-z0-9\s]", " ", text.lower())
    # split()/join collapses any whitespace runs and trims both ends.
    return " ".join(cleaned.split())


def extract_sermon_title(filename: str) -> str:
    """
    Strip the leading date code and .pdf extension from a sermon filename.

    '62-0909E In His Presence.pdf' -> 'in his presence'
    """
    name = filename.replace(".pdf", "").replace(".PDF", "")

    # Drop the first token only when it looks like a date code (contains '-')
    # and the filename actually has a space to split on.
    code, sep, rest = name.partition(" ")
    if sep and "-" in code:
        name = rest

    return normalize_text(name)


def tokenize_meaningful(text: str) -> set:
    """Split *text* into significant words: length > 2 and not a stopword."""
    words = normalize_text(text).split()
    return {word for word in words if len(word) > 2 and word not in STOPWORDS}


def sermon_title_matches(user_query: str, filename: str) -> bool:
    """
    Return True only when EVERY meaningful word of the sermon title appears
    in the user query — blocks loose partial matches like a bare 'presence'.
    """
    query_tokens = tokenize_meaningful(user_query)
    title_tokens = tokenize_meaningful(extract_sermon_title(filename))

    # A title with no meaningful words can never be a deliberate match.
    if not title_tokens:
        return False

    return title_tokens <= query_tokens

# ===============================
# RETRIEVER
# ===============================
class BranhamRetriever(BaseRetriever):
    """
    NotebookLM-style hybrid retriever.

    Retrieval strategy, in priority order:
    1. Explicit sermon reference (a date code such as "63-0318" in the
       query) -> pull every chunk of that sermon.
    2. Known series phrase (e.g. "seven seals") -> pull chunks whose source
       matches the canonical series filenames.
    3. BM25 keyword search over the local chunk cache (when local targeting
       yielded fewer than 25 results).
    4. Pinecone vector search as a best-effort semantic fallback.

    Results are deduplicated on a content-prefix fingerprint.
    """

    def _get_relevant_documents(
        self,
        query: str,
        *,
        run_manager: CallbackManagerForRetrieverRun = None
    ) -> List[Document]:
        """Return relevant sermon chunks for *query* (see class docstring)."""
        query_clean = normalize(query)
        chunks = load_chunks()
        results: List[Document] = []
        seen: Set[str] = set()

        def add_unique(doc: Document) -> None:
            # Deduplicate on the first 120 chars of content (cheap fingerprint).
            fingerprint = doc.page_content[:120]
            if fingerprint not in seen:
                results.append(doc)
                seen.add(fingerprint)

        # -------------------------------------------------
        # Detect sermon reference (date code, e.g. "63-0318")
        # -------------------------------------------------
        explicit_sermon = None
        for token in query.split():
            if "-" in token and len(token) >= 7:
                explicit_sermon = token.upper()
                break

        # -------------------------------------------------
        # Detect series
        # -------------------------------------------------
        target_titles: List[str] = []
        for series_key, titles in SERIES_GROUPS.items():
            if series_key in query_clean:
                target_titles = titles
                break

        # -------------------------------------------------
        # SERMON-TARGETED SEARCH
        # BUG FIX: the date code used to be passed through
        # sermon_title_matches(), whose "all title words in query" rule can
        # never hold for a bare code against a full filename, so explicit
        # references silently matched nothing. Substring-match the
        # normalized code against the normalized source name instead.
        # -------------------------------------------------
        if explicit_sermon:
            code = normalize(explicit_sermon)
            for d in chunks:
                src = normalize(d.metadata.get("source", ""))
                if code and code in src:
                    add_unique(d)

        # -------------------------------------------------
        # SERIES SEARCH
        # BUG FIX: this used to test sermon_title_matches(query, src), which
        # requires every word of the source's title to appear in the query —
        # excluding exactly the canon sermons it was meant to include.
        # Match sources against the canonical filenames directly.
        # -------------------------------------------------
        elif target_titles:
            canon = [normalize(t) for t in target_titles]
            for d in chunks:
                src = normalize(d.metadata.get("source", ""))
                if src and any(c in src or src in c for c in canon):
                    add_unique(d)

        # -------------------------------------------------
        # KEYWORD SEARCH (LOCAL BM25)
        # Guarded: BM25Retriever.from_documents raises on an empty corpus.
        # -------------------------------------------------
        if len(results) < 25 and chunks:
            bm25 = BM25Retriever.from_documents(chunks)
            bm25.k = 60
            for d in bm25.invoke(query):
                add_unique(d)

        # -------------------------------------------------
        # VECTOR SEARCH (PINECONE)
        # Best-effort semantic fallback: network/credential failures must
        # not break retrieval, since local results are still useful.
        # -------------------------------------------------
        try:
            embeddings = GoogleGenerativeAIEmbeddings(
                model="models/text-embedding-004"
            )
            store = PineconeVectorStore(
                index_name=INDEX_NAME,
                embedding=embeddings
            )
            for d in store.as_retriever(search_kwargs={"k": 30}).invoke(query):
                add_unique(d)
        except Exception:
            # Deliberate silent fallback to whatever was found locally.
            pass

        return results


# ===============================
# PROMPT
# ===============================
# Persona prompt: answer in Branham's voice, constrained to the retrieved
# sermon context. {context_str} and {question} are filled in by the chain.
PROMPT_TEMPLATE = """
You are William Marrion Branham, speaking carefully as a teacher and evangelist.

RULES:
- You are speaking to only one person
- Be faithful to the sermons provided.
- Do NOT invent doctrine.
- If something is not clearly stated in the text, say so.
- Use calm 1950s preaching tone.
- Be structured and clear.
- Use headings and bullet points.
- Explain symbols plainly.
- Prefer paraphrase, but preserve meaning.
- Avoid citations like (54) or paragraph numbers.
- Ignore tape noise or filler language.
- If a question asks for a sermon summary, summarize only that sermon.
- If the question references the Seven Seals, prioritize the 1963 series.

CONTEXT:
{context_str}

QUESTION:
{question}

ANSWER:
"""

# Template wired for a RetrievalQA "stuff" chain; the chain must rename its
# document variable to "context_str" (done via chain_type_kwargs in the
# chain builder below).
PROMPT = PromptTemplate(
    template=PROMPT_TEMPLATE,
    input_variables=["context_str", "question"],
)

# ===============================
# PUBLIC API
# ===============================
def get_rag_chain():
    """
    Build the question-answering chain: Gemini LLM + BranhamRetriever,
    stuffing retrieved chunks into the persona prompt.

    Returns a RetrievalQA chain keyed on "question" that also returns its
    source documents.
    """
    model = ChatGoogleGenerativeAI(
        model="gemini-2.5-flash",
        temperature=0.25,
        convert_system_message_to_human=True,
    )

    return RetrievalQA.from_chain_type(
        llm=model,
        retriever=BranhamRetriever(),
        chain_type="stuff",
        return_source_documents=True,
        chain_type_kwargs={
            "prompt": PROMPT,
            # Rename the stuffed-documents variable to match PROMPT.
            "document_variable_name": "context_str",
        },
        input_key="question",
    )


def search_archives(query: str):
    """
    Used by Search mode only.

    Runs an exact-substring scan over the local chunk cache, then falls
    back to BM25 when fewer than 20 hits are found.

    Returns (documents, debug_log), where debug_log is a list of strings.
    """
    debug: List[str] = []
    docs: List[Document] = []
    seen: Set[str] = set()

    chunks = load_chunks()
    query_clean = normalize(query)

    # Exact keyword (substring) scan, deduplicated on a content prefix.
    for d in chunks:
        if query_clean in d.page_content.lower():
            key = d.page_content[:120]
            if key not in seen:
                docs.append(d)
                seen.add(key)

    debug.append(f"Keyword hits: {len(docs)}")

    # Fallback BM25 when the keyword scan is thin.
    # BUG FIX: guard against an empty corpus — BM25Retriever.from_documents
    # raises on an empty list, which crashed Search mode before the chunk
    # cache was built.
    if len(docs) < 20 and chunks:
        bm25 = BM25Retriever.from_documents(chunks)
        bm25.k = 50
        for d in bm25.invoke(query):
            key = d.page_content[:120]
            if key not in seen:
                docs.append(d)
                seen.add(key)

    debug.append(f"Total results: {len(docs)}")

    return docs, debug