Adoption committed on
Commit
fa4b39b
·
1 Parent(s): d5006d1

feat: implement hybrid BranhamRetriever and downgrade Python base image to 3.11

Browse files
Files changed (3) hide show
  1. .dockerignore +12 -0
  2. Dockerfile +2 -2
  3. src/app.py +187 -62
.dockerignore ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ .git
2
+ venv
3
+ ENV
4
+ env
5
+ __pycache__
6
+ *.py[cod]
7
+ .env
8
+ src/.env
9
+ .streamlit
10
+ .vscode
11
+ .idea
12
+ src/sermon_chunks.zip
Dockerfile CHANGED
@@ -1,4 +1,4 @@
1
- FROM python:3.13.5-slim
2
 
3
  WORKDIR /app
4
 
@@ -17,4 +17,4 @@ EXPOSE 8501
17
 
18
  HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health
19
 
20
- ENTRYPOINT ["streamlit", "run", "src/streamlit_app.py", "--server.port=8501", "--server.address=0.0.0.0"]
 
1
+ FROM python:3.11-slim
2
 
3
  WORKDIR /app
4
 
 
17
 
18
  HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health
19
 
20
+ ENTRYPOINT ["streamlit", "run", "src/streamlit_app.py", "--server.port=8501", "--server.address=0.0.0.0"]
src/app.py CHANGED
@@ -1,10 +1,17 @@
1
  import os
2
  import pickle
3
- from typing import List, Dict, Set
 
 
 
 
 
 
4
  from dotenv import load_dotenv
5
 
6
  from langchain_core.documents import Document
7
- from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
 
8
  from langchain_pinecone import PineconeVectorStore
9
  from langchain_community.retrievers import BM25Retriever
10
  from langchain.chains import RetrievalQA
@@ -20,6 +27,15 @@ load_dotenv()
20
  INDEX_NAME = "branham-index"
21
  BASE_DIR = os.path.dirname(os.path.abspath(__file__))
22
  CHUNKS_FILE = os.path.join(BASE_DIR, "sermon_chunks.pkl")
 
 
 
 
 
 
 
 
 
23
 
24
 
25
  # ===============================
@@ -42,13 +58,64 @@ SERIES_GROUPS = {
42
  "seven seals": SEVEN_SEALS_CANON,
43
  }
44
 
 
 
 
 
 
 
 
 
 
 
45
  # ===============================
46
  # HELPERS
47
  # ===============================
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
  def normalize(text: str) -> str:
49
  return text.lower().replace("_", " ").replace("-", " ").strip()
50
 
51
 
 
52
  def load_chunks() -> List[Document]:
53
  if not os.path.exists(CHUNKS_FILE):
54
  return []
@@ -56,6 +123,28 @@ def load_chunks() -> List[Document]:
56
  return pickle.load(f)
57
 
58
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59
  def extract_date_code(filename: str) -> str:
60
  """
61
  Assumes filenames start with NN-NNNNE
@@ -64,14 +153,32 @@ def extract_date_code(filename: str) -> str:
64
  return filename.split()[0].replace(".pdf", "")
65
 
66
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
  def messagehub_link(filename: str) -> str:
68
  code = extract_date_code(filename)
69
  return f"https://www.messagehub.info/en/read.do?ref_num={code}"
70
 
71
 
72
-
73
- import re
74
-
75
  STOPWORDS = {
76
  "the", "a", "an", "of", "in", "on", "at", "and", "to", "for", "with", "by"
77
  }
@@ -117,6 +224,24 @@ def sermon_title_matches(user_query: str, filename: str) -> bool:
117
 
118
  return title_tokens.issubset(query_tokens)
119
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
120
  # ===============================
121
  # RETRIEVER
122
  # ===============================
@@ -141,100 +266,99 @@ class BranhamRetriever(BaseRetriever):
141
  results: List[Document] = []
142
  seen = set()
143
 
144
- # -------------------------------------------------
145
- # Detect sermon reference (date code)
146
- # -------------------------------------------------
147
- explicit_sermon = None
148
- for token in query.split():
149
- if "-" in token and len(token) >= 7:
150
- explicit_sermon = token.upper()
151
- break
152
 
153
- # -------------------------------------------------
154
- # Detect series
155
- # -------------------------------------------------
156
  target_titles = []
157
- is_series = False
158
 
159
  for key, titles in SERIES_GROUPS.items():
160
  if key in query_clean:
161
  target_titles = titles
162
- is_series = True
163
  break
164
 
165
  # -------------------------------------------------
166
  # SERMON-TARGETED SEARCH
167
  # -------------------------------------------------
168
  if explicit_sermon:
169
- for d in chunks:
170
- src = normalize(d.metadata.get("source", ""))
171
- if sermon_title_matches(explicit_sermon, src):
172
- key = d.page_content[:120]
173
- if key not in seen:
174
- results.append(d)
175
- seen.add(key)
 
 
176
 
177
  # -------------------------------------------------
178
  # SERIES SEARCH
179
  # -------------------------------------------------
180
  elif target_titles:
181
- for d in chunks:
182
- src = normalize(d.metadata.get("source", ""))
183
- if sermon_title_matches(query, src):
184
- key = d.page_content[:120]
185
- if key not in seen:
186
- results.append(d)
187
- seen.add(key)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
188
 
189
  # -------------------------------------------------
190
  # KEYWORD SEARCH (LOCAL)
191
  # -------------------------------------------------
192
- if len(results) < 25:
193
- bm25 = BM25Retriever.from_documents(chunks)
194
- bm25.k = 60
195
- for d in bm25.invoke(query):
196
- key = d.page_content[:120]
197
- if key not in seen:
198
- results.append(d)
199
- seen.add(key)
200
 
201
  # -------------------------------------------------
202
  # VECTOR SEARCH (PINECONE)
203
  # -------------------------------------------------
204
  try:
205
- embeddings = GoogleGenerativeAIEmbeddings(
206
- model="models/text-embedding-004"
207
- )
208
- store = PineconeVectorStore(
209
- index_name=INDEX_NAME,
210
- embedding=embeddings
211
- )
212
-
213
- vec_docs = store.as_retriever(search_kwargs={"k": 30}).invoke(query)
214
  for d in vec_docs:
215
- key = d.page_content[:120]
216
- if key not in seen:
217
- results.append(d)
218
- seen.add(key)
219
 
220
- except Exception:
221
- pass
222
 
223
- return results
224
 
225
 
226
  # ===============================
227
  # PROMPT
228
  # ===============================
229
  PROMPT_TEMPLATE = """
230
- You are William Marrion Branham, speaking carefully as a teacher and evangelist.
 
 
231
 
232
  RULES:
233
  - You are speaking to only one person
234
  - Be faithful to the sermons provided.
235
  - Do NOT invent doctrine.
236
  - If something is not clearly stated in the text, say so.
237
- - Use calm 1950s preaching tone.
238
  - Be structured and clear.
239
  - Use headings and bullet points.
240
  - Explain symbols plainly.
@@ -243,6 +367,9 @@ RULES:
243
  - Ignore tape noise or filler language.
244
  - If a question asks for a sermon summary, summarize only that sermon.
245
  - If the question references the Seven Seals, prioritize the 1963 series.
 
 
 
246
 
247
  CONTEXT:
248
  {context_str}
@@ -309,10 +436,8 @@ def search_archives(query: str):
309
 
310
  # Fallback BM25
311
  if len(docs) < 20:
312
- bm25 = BM25Retriever.from_documents(chunks)
313
- bm25.k = 50
314
- for d in bm25.invoke(query):
315
- key = d.page_content[:120]
316
  if key not in seen:
317
  docs.append(d)
318
  seen.add(key)
 
1
  import os
2
  import pickle
3
+ import json
4
+ import logging
5
+ import re
6
+ import urllib.error
7
+ import urllib.request
8
+ from functools import lru_cache
9
+ from typing import List
10
  from dotenv import load_dotenv
11
 
12
  from langchain_core.documents import Document
13
+ from langchain_core.embeddings import Embeddings
14
+ from langchain_google_genai import ChatGoogleGenerativeAI
15
  from langchain_pinecone import PineconeVectorStore
16
  from langchain_community.retrievers import BM25Retriever
17
  from langchain.chains import RetrievalQA
 
27
  INDEX_NAME = "branham-index"
28
  BASE_DIR = os.path.dirname(os.path.abspath(__file__))
29
  CHUNKS_FILE = os.path.join(BASE_DIR, "sermon_chunks.pkl")
30
+ GEMINI_EMBEDDING_MODEL = "models/gemini-embedding-001"
31
+ PINECONE_DIMENSION = 768
32
+ PRIORITY_K = 8
33
+ SERIES_DOCS_PER_SERMON = 2
34
+ BM25_K = 8
35
+ VECTOR_K = 8
36
+ MAX_CONTEXT_DOCS = 20
37
+ MIN_ENTITY_LENGTH = 4
38
+ logger = logging.getLogger(__name__)
39
 
40
 
41
  # ===============================
 
58
  "seven seals": SEVEN_SEALS_CANON,
59
  }
60
 
61
+ SEVEN_SEALS_QUERY_HINTS = {
62
+ "63-0318": "first seal white horse bow crown conquer",
63
+ "63-0319": "second seal red horse sword take peace kill",
64
+ "63-0320": "third seal black horse balances wheat barley oil wine",
65
+ "63-0321": "fourth seal pale horse death hell eagle",
66
+ "63-0322": "fifth seal souls under the altar white robes Jews",
67
+ "63-0323": "sixth seal earthquake sun black moon blood stars fall",
68
+ "63-0324E": "seventh seal silence seven thunders coming of Christ end time",
69
+ }
70
+
71
  # ===============================
72
  # HELPERS
73
  # ===============================
74
+ class GeminiEmbedding768(Embeddings):
75
+ """Google Gemini embeddings constrained to the Pinecone index dimension."""
76
+
77
+ def __init__(self) -> None:
78
+ self.api_key = os.getenv("GOOGLE_API_KEY")
79
+ if not self.api_key:
80
+ raise ValueError("GOOGLE_API_KEY is not set")
81
+
82
+ def _embed(self, text: str) -> List[float]:
83
+ url = (
84
+ "https://generativelanguage.googleapis.com/v1beta/"
85
+ f"{GEMINI_EMBEDDING_MODEL}:embedContent?key={self.api_key}"
86
+ )
87
+ payload = {
88
+ "content": {"parts": [{"text": text}]},
89
+ "outputDimensionality": PINECONE_DIMENSION,
90
+ }
91
+ request = urllib.request.Request(
92
+ url,
93
+ data=json.dumps(payload).encode("utf-8"),
94
+ headers={"Content-Type": "application/json"},
95
+ method="POST",
96
+ )
97
+
98
+ try:
99
+ with urllib.request.urlopen(request, timeout=30) as response:
100
+ data = json.loads(response.read().decode("utf-8"))
101
+ except urllib.error.HTTPError as exc:
102
+ detail = exc.read().decode("utf-8", errors="replace")
103
+ raise RuntimeError(f"Gemini embedding request failed: {detail}") from exc
104
+
105
+ return data["embedding"]["values"]
106
+
107
+ def embed_documents(self, texts: List[str]) -> List[List[float]]:
108
+ return [self._embed(text) for text in texts]
109
+
110
+ def embed_query(self, text: str) -> List[float]:
111
+ return self._embed(text)
112
+
113
+
114
  def normalize(text: str) -> str:
115
  return text.lower().replace("_", " ").replace("-", " ").strip()
116
 
117
 
118
+ @lru_cache(maxsize=1)
119
  def load_chunks() -> List[Document]:
120
  if not os.path.exists(CHUNKS_FILE):
121
  return []
 
123
  return pickle.load(f)
124
 
125
 
126
+ @lru_cache(maxsize=1)
127
+ def get_bm25_retriever():
128
+ bm25 = BM25Retriever.from_documents(load_chunks())
129
+ bm25.k = BM25_K
130
+ return bm25
131
+
132
+
133
+ @lru_cache(maxsize=1)
134
+ def get_search_bm25_retriever():
135
+ bm25 = BM25Retriever.from_documents(load_chunks())
136
+ bm25.k = 50
137
+ return bm25
138
+
139
+
140
+ @lru_cache(maxsize=1)
141
+ def get_vector_store():
142
+ return PineconeVectorStore(
143
+ index_name=INDEX_NAME,
144
+ embedding=GeminiEmbedding768(),
145
+ )
146
+
147
+
148
  def extract_date_code(filename: str) -> str:
149
  """
150
  Assumes filenames start with NN-NNNNE
 
153
  return filename.split()[0].replace(".pdf", "")
154
 
155
 
156
+ def extract_query_sermon_code(query: str) -> str | None:
157
+ match = re.search(r"\b\d{2}-\d{4}[A-Z]?\b", query.upper())
158
+ return match.group(0) if match else None
159
+
160
+
161
+ def source_matches_code(source: str, code: str) -> bool:
162
+ return extract_date_code(source).upper() == code.upper()
163
+
164
+
165
+ def source_matches_any_code(source: str, codes: set[str]) -> bool:
166
+ return extract_date_code(source).upper() in codes
167
+
168
+
169
+ def document_key(doc: Document) -> tuple[str, str, str]:
170
+ return (
171
+ doc.metadata.get("source", ""),
172
+ str(doc.metadata.get("paragraph", "")),
173
+ doc.page_content[:120],
174
+ )
175
+
176
+
177
  def messagehub_link(filename: str) -> str:
178
  code = extract_date_code(filename)
179
  return f"https://www.messagehub.info/en/read.do?ref_num={code}"
180
 
181
 
 
 
 
182
  STOPWORDS = {
183
  "the", "a", "an", "of", "in", "on", "at", "and", "to", "for", "with", "by"
184
  }
 
224
 
225
  return title_tokens.issubset(query_tokens)
226
 
227
+
228
+ def query_entity_tokens(query: str) -> set[str]:
229
+ return {
230
+ token for token in tokenize_meaningful(query)
231
+ if len(token) >= MIN_ENTITY_LENGTH
232
+ }
233
+
234
+
235
+ def rank_by_query_terms(docs: List[Document], query: str) -> List[Document]:
236
+ terms = query_entity_tokens(query)
237
+
238
+ def score(doc: Document) -> tuple[int, int]:
239
+ text = normalize_text(doc.page_content)
240
+ hits = sum(1 for term in terms if term in text)
241
+ return hits, -len(text)
242
+
243
+ return sorted(docs, key=score, reverse=True)
244
+
245
  # ===============================
246
  # RETRIEVER
247
  # ===============================
 
266
  results: List[Document] = []
267
  seen = set()
268
 
269
+ def add_doc(doc: Document) -> bool:
270
+ key = document_key(doc)
271
+ if key in seen:
272
+ return False
273
+ results.append(doc)
274
+ seen.add(key)
275
+ return True
 
276
 
277
+ explicit_sermon = extract_query_sermon_code(query)
 
 
278
  target_titles = []
 
279
 
280
  for key, titles in SERIES_GROUPS.items():
281
  if key in query_clean:
282
  target_titles = titles
 
283
  break
284
 
285
  # -------------------------------------------------
286
  # SERMON-TARGETED SEARCH
287
  # -------------------------------------------------
288
  if explicit_sermon:
289
+ sermon_chunks = [
290
+ d for d in chunks
291
+ if source_matches_code(d.metadata.get("source", ""), explicit_sermon)
292
+ ]
293
+ if sermon_chunks:
294
+ sermon_bm25 = BM25Retriever.from_documents(sermon_chunks)
295
+ sermon_bm25.k = PRIORITY_K
296
+ for d in sermon_bm25.invoke(query):
297
+ add_doc(d)
298
 
299
  # -------------------------------------------------
300
  # SERIES SEARCH
301
  # -------------------------------------------------
302
  elif target_titles:
303
+ for title in target_titles:
304
+ target_code = extract_date_code(title).upper()
305
+ sermon_chunks = [
306
+ d for d in chunks
307
+ if source_matches_code(d.metadata.get("source", ""), target_code)
308
+ ]
309
+ if sermon_chunks:
310
+ series_query = SEVEN_SEALS_QUERY_HINTS.get(target_code, query)
311
+ for d in rank_by_query_terms(sermon_chunks, series_query)[:SERIES_DOCS_PER_SERMON]:
312
+ add_doc(d)
313
+
314
+ # -------------------------------------------------
315
+ # SERMON-TITLE SEARCH
316
+ # -------------------------------------------------
317
+ else:
318
+ title_chunks = [
319
+ d for d in chunks
320
+ if sermon_title_matches(query, d.metadata.get("source", ""))
321
+ ]
322
+ if title_chunks:
323
+ for d in rank_by_query_terms(title_chunks, query)[:PRIORITY_K]:
324
+ add_doc(d)
325
 
326
  # -------------------------------------------------
327
  # KEYWORD SEARCH (LOCAL)
328
  # -------------------------------------------------
329
+ for d in get_bm25_retriever().invoke(query):
330
+ add_doc(d)
 
 
 
 
 
 
331
 
332
  # -------------------------------------------------
333
  # VECTOR SEARCH (PINECONE)
334
  # -------------------------------------------------
335
  try:
336
+ vec_docs = get_vector_store().as_retriever(
337
+ search_kwargs={"k": VECTOR_K}
338
+ ).invoke(query)
 
 
 
 
 
 
339
  for d in vec_docs:
340
+ add_doc(d)
 
 
 
341
 
342
+ except Exception as exc:
343
+ logger.warning("Pinecone vector search failed: %s", exc)
344
 
345
+ return results[:MAX_CONTEXT_DOCS]
346
 
347
 
348
  # ===============================
349
  # PROMPT
350
  # ===============================
351
  PROMPT_TEMPLATE = """
352
+ You are a careful research assistant helping one person understand William Marrion Branham's sermons.
353
+ Do not roleplay, impersonate, or speak as William Marrion Branham.
354
+ Always refer to him in the third person as "Brother Branham" or "William Branham."
355
 
356
  RULES:
357
  - You are speaking to only one person
358
  - Be faithful to the sermons provided.
359
  - Do NOT invent doctrine.
360
  - If something is not clearly stated in the text, say so.
361
+ - Use a respectful explanatory tone, not a preaching or prophetic persona.
362
  - Be structured and clear.
363
  - Use headings and bullet points.
364
  - Explain symbols plainly.
 
367
  - Ignore tape noise or filler language.
368
  - If a question asks for a sermon summary, summarize only that sermon.
369
  - If the question references the Seven Seals, prioritize the 1963 series.
370
+ - Phrase claims as "Brother Branham said/taught/explained..." when the context supports them.
371
+ - Do not say "I testified," "my ministry," "my dear friend," or anything that makes the assistant sound like Brother Branham.
372
+ - If context uses first-person sermon language, convert it to third-person attribution.
373
 
374
  CONTEXT:
375
  {context_str}
 
436
 
437
  # Fallback BM25
438
  if len(docs) < 20:
439
+ for d in get_search_bm25_retriever().invoke(query):
440
+ key = document_key(d)
 
 
441
  if key not in seen:
442
  docs.append(d)
443
  seen.add(key)