Spaces:

Adoption
/

7th_handle

Sleeping

App Files Files Community

Adoption committed 27 days ago

Commit

fa4b39b

1 Parent(s): d5006d1

feat: implement hybrid BranhamRetriever and downgrade Python base image to 3.11

Browse files

Files changed (3) hide show

.dockerignore +12 -0
Dockerfile +2 -2
src/app.py +187 -62

.dockerignore ADDED Viewed

	@@ -0,0 +1,12 @@

+.git
+venv
+ENV
+env
+__pycache__
+*.py[cod]
+.env
+src/.env
+.streamlit
+.vscode
+.idea
+src/sermon_chunks.zip

Dockerfile CHANGED Viewed

@@ -1,4 +1,4 @@
-FROM python:3.13.5-slim
 WORKDIR /app
@@ -17,4 +17,4 @@ EXPOSE 8501
 HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health
-ENTRYPOINT ["streamlit", "run", "src/streamlit_app.py", "--server.port=8501", "--server.address=0.0.0.0"]

+FROM python:3.11-slim
 WORKDIR /app
 HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health
+ENTRYPOINT ["streamlit", "run", "src/streamlit_app.py", "--server.port=8501", "--server.address=0.0.0.0"]

src/app.py CHANGED Viewed

@@ -1,10 +1,17 @@
 import os
 import pickle
-from typing import List, Dict, Set
 from dotenv import load_dotenv
 from langchain_core.documents import Document
-from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
 from langchain_pinecone import PineconeVectorStore
 from langchain_community.retrievers import BM25Retriever
 from langchain.chains import RetrievalQA
@@ -20,6 +27,15 @@ load_dotenv()
 INDEX_NAME = "branham-index"
 BASE_DIR = os.path.dirname(os.path.abspath(__file__))
 CHUNKS_FILE = os.path.join(BASE_DIR, "sermon_chunks.pkl")
 # ===============================
@@ -42,13 +58,64 @@ SERIES_GROUPS = {
     "seven seals": SEVEN_SEALS_CANON,
 }
 # ===============================
 # HELPERS
 # ===============================
 def normalize(text: str) -> str:
     """Lower-case *text*, turn underscores and hyphens into spaces, and trim the ends."""
     lowered = text.lower()
     for separator in ("_", "-"):
         lowered = lowered.replace(separator, " ")
     return lowered.strip()
 def load_chunks() -> List[Document]:
     if not os.path.exists(CHUNKS_FILE):
         return []
@@ -56,6 +123,28 @@ def load_chunks() -> List[Document]:
         return pickle.load(f)
 def extract_date_code(filename: str) -> str:
     """
     Assumes filenames start with NN-NNNNE
@@ -64,14 +153,32 @@ def extract_date_code(filename: str) -> str:
     return filename.split()[0].replace(".pdf", "")
 def messagehub_link(filename: str) -> str:
     """Return the messagehub.info reader URL for the sermon whose date code prefixes *filename*."""
     return f"https://www.messagehub.info/en/read.do?ref_num={extract_date_code(filename)}"
-import re
 STOPWORDS = {
     "the", "a", "an", "of", "in", "on", "at", "and", "to", "for", "with", "by"
 }
@@ -117,6 +224,24 @@ def sermon_title_matches(user_query: str, filename: str) -> bool:
     return title_tokens.issubset(query_tokens)
 # ===============================
 # RETRIEVER
 # ===============================
@@ -141,100 +266,99 @@ class BranhamRetriever(BaseRetriever):
         results: List[Document] = []
         seen = set()
-        # -------------------------------------------------
-        # Detect sermon reference (date code)
-        # -------------------------------------------------
-        explicit_sermon = None
-        for token in query.split():
-            if "-" in token and len(token) >= 7:
-                explicit_sermon = token.upper()
-                break
-        # -------------------------------------------------
-        # Detect series
-        # -------------------------------------------------
         target_titles = []
-        is_series = False
         for key, titles in SERIES_GROUPS.items():
             if key in query_clean:
                 target_titles = titles
-                is_series = True
                 break
         # -------------------------------------------------
         # SERMON-TARGETED SEARCH
         # -------------------------------------------------
         if explicit_sermon:
-            for d in chunks:
-                src = normalize(d.metadata.get("source", ""))
-                if sermon_title_matches(explicit_sermon, src):
-                    key = d.page_content[:120]
-                    if key not in seen:
-                        results.append(d)
-                        seen.add(key)
         # -------------------------------------------------
         # SERIES SEARCH
         # -------------------------------------------------
         elif target_titles:
-            for d in chunks:
-                src = normalize(d.metadata.get("source", ""))
-                if sermon_title_matches(query, src):
-                    key = d.page_content[:120]
-                    if key not in seen:
-                        results.append(d)
-                        seen.add(key)
         # -------------------------------------------------
         # KEYWORD SEARCH (LOCAL)
         # -------------------------------------------------
-        if len(results) < 25:
-            bm25 = BM25Retriever.from_documents(chunks)
-            bm25.k = 60
-            for d in bm25.invoke(query):
-                key = d.page_content[:120]
-                if key not in seen:
-                    results.append(d)
-                    seen.add(key)
         # -------------------------------------------------
         # VECTOR SEARCH (PINECONE)
         # -------------------------------------------------
         try:
-            embeddings = GoogleGenerativeAIEmbeddings(
-                model="models/text-embedding-004"
-            )
-            store = PineconeVectorStore(
-                index_name=INDEX_NAME,
-                embedding=embeddings
-            )
-            vec_docs = store.as_retriever(search_kwargs={"k": 30}).invoke(query)
             for d in vec_docs:
-                key = d.page_content[:120]
-                if key not in seen:
-                    results.append(d)
-                    seen.add(key)
-        except Exception:
-            pass
-        return results
 # ===============================
 # PROMPT
 # ===============================
 PROMPT_TEMPLATE = """
-You are William Marrion Branham, speaking carefully as a teacher and evangelist.
 RULES:
 - You are speaking to only one person
 - Be faithful to the sermons provided.
 - Do NOT invent doctrine.
 - If something is not clearly stated in the text, say so.
-- Use calm 1950s preaching tone.
 - Be structured and clear.
 - Use headings and bullet points.
 - Explain symbols plainly.
@@ -243,6 +367,9 @@ RULES:
 - Ignore tape noise or filler language.
 - If a question asks for a sermon summary, summarize only that sermon.
 - If the question references the Seven Seals, prioritize the 1963 series.
 CONTEXT:
 {context_str}
@@ -309,10 +436,8 @@ def search_archives(query: str):
     # Fallback BM25
     if len(docs) < 20:
-        bm25 = BM25Retriever.from_documents(chunks)
-        bm25.k = 50
-        for d in bm25.invoke(query):
-            key = d.page_content[:120]
             if key not in seen:
                 docs.append(d)
                 seen.add(key)

 import os
 import pickle
+import json
+import logging
+import re
+import urllib.error
+import urllib.request
+from functools import lru_cache
+from typing import List
 from dotenv import load_dotenv
 from langchain_core.documents import Document
+from langchain_core.embeddings import Embeddings
+from langchain_google_genai import ChatGoogleGenerativeAI
 from langchain_pinecone import PineconeVectorStore
 from langchain_community.retrievers import BM25Retriever
 from langchain.chains import RetrievalQA
 INDEX_NAME = "branham-index"
 BASE_DIR = os.path.dirname(os.path.abspath(__file__))
 CHUNKS_FILE = os.path.join(BASE_DIR, "sermon_chunks.pkl")
+# Model path segment interpolated into the Gemini REST embedding URL.
+GEMINI_EMBEDDING_MODEL = "models/gemini-embedding-001"
+# Sent as outputDimensionality on every embedding request; presumably equal to
+# the dimension the Pinecone index was created with - verify against the index.
+PINECONE_DIMENSION = 768
+# k for the per-sermon BM25 pass, and the cap on title-matched chunks.
+PRIORITY_K = 8
+# Number of top-ranked chunks kept per sermon when a series is requested.
+SERIES_DOCS_PER_SERMON = 2
+# k for the cached corpus-wide BM25 retriever.
+BM25_K = 8
+# k passed to the Pinecone retriever through search_kwargs.
+VECTOR_K = 8
+# Upper bound on the number of documents the retriever returns.
+MAX_CONTEXT_DOCS = 20
+# Minimum token length kept by query_entity_tokens.
+MIN_ENTITY_LENGTH = 4
+logger = logging.getLogger(__name__)
 # ===============================
     "seven seals": SEVEN_SEALS_CANON,
 }
+# Keyword hints keyed by upper-case sermon date code. The series search ranks
+# each sermon's chunks against its hint instead of the raw user query, and
+# falls back to the user query for codes that have no entry here.
+SEVEN_SEALS_QUERY_HINTS = {
+    "63-0318": "first seal white horse bow crown conquer",
+    "63-0319": "second seal red horse sword take peace kill",
+    "63-0320": "third seal black horse balances wheat barley oil wine",
+    "63-0321": "fourth seal pale horse death hell eagle",
+    "63-0322": "fifth seal souls under the altar white robes Jews",
+    "63-0323": "sixth seal earthquake sun black moon blood stars fall",
+    "63-0324E": "seventh seal silence seven thunders coming of Christ end time",
+}
 # ===============================
 # HELPERS
 # ===============================
+class GeminiEmbedding768(Embeddings):
+    """Google Gemini embeddings constrained to the Pinecone index dimension.
+
+    Calls the Gemini REST ``embedContent`` endpoint directly with urllib and
+    asks for ``PINECONE_DIMENSION`` values per vector.
+
+    Raises:
+        ValueError: on construction, if GOOGLE_API_KEY is not set.
+    """
+    def __init__(self) -> None:
+        self.api_key = os.getenv("GOOGLE_API_KEY")
+        if not self.api_key:
+            raise ValueError("GOOGLE_API_KEY is not set")
+    def _embed(self, text: str) -> List[float]:
+        """Embed a single text and return its vector.
+
+        Raises:
+            RuntimeError: if the API answers with an HTTP error status; the
+                response body is included in the message.
+        """
+        # The key is sent in a header rather than the query string so it
+        # cannot leak through URLs captured by logs, proxies, or error reports.
+        url = (
+            "https://generativelanguage.googleapis.com/v1beta/"
+            f"{GEMINI_EMBEDDING_MODEL}:embedContent"
+        )
+        payload = {
+            "content": {"parts": [{"text": text}]},
+            "outputDimensionality": PINECONE_DIMENSION,
+        }
+        request = urllib.request.Request(
+            url,
+            data=json.dumps(payload).encode("utf-8"),
+            headers={
+                "Content-Type": "application/json",
+                "x-goog-api-key": self.api_key,
+            },
+            method="POST",
+        )
+        try:
+            with urllib.request.urlopen(request, timeout=30) as response:
+                data = json.loads(response.read().decode("utf-8"))
+        except urllib.error.HTTPError as exc:
+            # Surface the API's own error body; the bare status line is rarely enough.
+            detail = exc.read().decode("utf-8", errors="replace")
+            raise RuntimeError(f"Gemini embedding request failed: {detail}") from exc
+        # NOTE(review): Google's embedding docs indicate reduced-dimension output
+        # is not pre-normalized; if the index metric is not cosine, confirm
+        # whether vectors should be L2-normalized before use.
+        return data["embedding"]["values"]
+    def embed_documents(self, texts: List[str]) -> List[List[float]]:
+        """Embed each text with its own request, preserving input order."""
+        return [self._embed(text) for text in texts]
+    def embed_query(self, text: str) -> List[float]:
+        """Embed a search query the same way documents are embedded."""
+        return self._embed(text)
 def normalize(text: str) -> str:
     """Lower-case *text*, turn underscores and hyphens into spaces, and trim the ends."""
     lowered = text.lower()
     for separator in ("_", "-"):
         lowered = lowered.replace(separator, " ")
     return lowered.strip()
+@lru_cache(maxsize=1)
 def load_chunks() -> List[Document]:
     if not os.path.exists(CHUNKS_FILE):
         return []
         return pickle.load(f)
+@lru_cache(maxsize=1)
+def get_bm25_retriever():
+    """Build the corpus-wide BM25 retriever (k = BM25_K) on first use and reuse it afterwards."""
+    corpus = load_chunks()
+    retriever = BM25Retriever.from_documents(corpus)
+    retriever.k = BM25_K
+    return retriever
+@lru_cache(maxsize=1)
+def get_search_bm25_retriever():
+    """Build the k=50 BM25 retriever on first use and reuse it afterwards."""
+    # NOTE(review): this indexes the same chunks as get_bm25_retriever a second
+    # time; only k differs between the two cached retrievers.
+    corpus = load_chunks()
+    retriever = BM25Retriever.from_documents(corpus)
+    retriever.k = 50
+    return retriever
+@lru_cache(maxsize=1)
+def get_vector_store():
+    """Create the Pinecone vector store for INDEX_NAME on first use and reuse it afterwards."""
+    embedder = GeminiEmbedding768()
+    store = PineconeVectorStore(index_name=INDEX_NAME, embedding=embedder)
+    return store
 def extract_date_code(filename: str) -> str:
     """
     Assumes filenames start with NN-NNNNE
     return filename.split()[0].replace(".pdf", "")
+def extract_query_sermon_code(query: str) -> str | None:
+    """Return the first NN-NNNN[letter] date code found in *query*, upper-cased, or None."""
+    found = re.search(r"\b\d{2}-\d{4}[A-Z]?\b", query.upper())
+    if found is None:
+        return None
+    return found.group(0)
+def source_matches_code(source: str, code: str) -> bool:
+    """Return True when the date code at the start of *source* equals *code*, ignoring case.
+
+    A missing or blank *source* never matches. Callers pass
+    ``metadata.get("source", "")``, and extract_date_code indexes the first
+    whitespace-separated token, which does not exist for a blank string.
+    """
+    if not source or not source.strip():
+        return False
+    return extract_date_code(source).upper() == code.upper()
+def source_matches_any_code(source: str, codes: set[str]) -> bool:
+    """Return True when the upper-cased date code at the start of *source* is in *codes*.
+
+    *codes* is compared as given, so its entries must already be upper-case.
+    A missing or blank *source* never matches; extract_date_code indexes the
+    first whitespace-separated token, which does not exist for a blank string.
+    """
+    if not source or not source.strip():
+        return False
+    return extract_date_code(source).upper() in codes
+def document_key(doc: Document) -> tuple[str, str, str]:
+    """Build the de-duplication key for *doc*: source, paragraph as text, first 120 characters."""
+    meta = doc.metadata
+    source = meta.get("source", "")
+    paragraph = str(meta.get("paragraph", ""))
+    preview = doc.page_content[:120]
+    return source, paragraph, preview
 def messagehub_link(filename: str) -> str:
     """Return the messagehub.info reader URL for the sermon whose date code prefixes *filename*."""
     return f"https://www.messagehub.info/en/read.do?ref_num={extract_date_code(filename)}"
 STOPWORDS = {
     "the", "a", "an", "of", "in", "on", "at", "and", "to", "for", "with", "by"
 }
     return title_tokens.issubset(query_tokens)
+def query_entity_tokens(query: str) -> set[str]:
+    """Return the meaningful tokens of *query* that are at least MIN_ENTITY_LENGTH characters long."""
+    long_enough = set()
+    for word in tokenize_meaningful(query):
+        if len(word) >= MIN_ENTITY_LENGTH:
+            long_enough.add(word)
+    return long_enough
+def rank_by_query_terms(docs: List[Document], query: str) -> List[Document]:
+    """Return *docs* as a new list ordered by how many query terms each chunk contains.
+
+    Terms come from query_entity_tokens and are tested with ``in`` against the
+    normalize_text form of the chunk. Chunks with equal hit counts are ordered
+    shortest normalized text first.
+    """
+    wanted = query_entity_tokens(query)
+    def sort_key(candidate: Document) -> tuple[int, int]:
+        body = normalize_text(candidate.page_content)
+        matched = len([term for term in wanted if term in body])
+        # Negated length: under reverse=True the shorter text wins a tie.
+        return matched, -len(body)
+    ranked = list(docs)
+    ranked.sort(key=sort_key, reverse=True)
+    return ranked
 # ===============================
 # RETRIEVER
 # ===============================
         results: List[Document] = []
         seen = set()
+        def add_doc(doc: Document) -> bool:
+            key = document_key(doc)
+            if key in seen:
+                return False
+            results.append(doc)
+            seen.add(key)
+            return True
+        explicit_sermon = extract_query_sermon_code(query)
         target_titles = []
         for key, titles in SERIES_GROUPS.items():
             if key in query_clean:
                 target_titles = titles
                 break
         # -------------------------------------------------
         # SERMON-TARGETED SEARCH
         # -------------------------------------------------
         if explicit_sermon:
+            sermon_chunks = [
+                d for d in chunks
+                if source_matches_code(d.metadata.get("source", ""), explicit_sermon)
+            ]
+            if sermon_chunks:
+                sermon_bm25 = BM25Retriever.from_documents(sermon_chunks)
+                sermon_bm25.k = PRIORITY_K
+                for d in sermon_bm25.invoke(query):
+                    add_doc(d)
         # -------------------------------------------------
         # SERIES SEARCH
         # -------------------------------------------------
         elif target_titles:
+            for title in target_titles:
+                target_code = extract_date_code(title).upper()
+                sermon_chunks = [
+                    d for d in chunks
+                    if source_matches_code(d.metadata.get("source", ""), target_code)
+                ]
+                if sermon_chunks:
+                    series_query = SEVEN_SEALS_QUERY_HINTS.get(target_code, query)
+                    for d in rank_by_query_terms(sermon_chunks, series_query)[:SERIES_DOCS_PER_SERMON]:
+                        add_doc(d)
+        # -------------------------------------------------
+        # SERMON-TITLE SEARCH
+        # -------------------------------------------------
+        else:
+            title_chunks = [
+                d for d in chunks
+                if sermon_title_matches(query, d.metadata.get("source", ""))
+            ]
+            if title_chunks:
+                for d in rank_by_query_terms(title_chunks, query)[:PRIORITY_K]:
+                    add_doc(d)
         # -------------------------------------------------
         # KEYWORD SEARCH (LOCAL)
         # -------------------------------------------------
+        for d in get_bm25_retriever().invoke(query):
+            add_doc(d)
         # -------------------------------------------------
         # VECTOR SEARCH (PINECONE)
         # -------------------------------------------------
         try:
+            vec_docs = get_vector_store().as_retriever(
+                search_kwargs={"k": VECTOR_K}
+            ).invoke(query)
             for d in vec_docs:
+                add_doc(d)
+        except Exception as exc:
+            logger.warning("Pinecone vector search failed: %s", exc)
+        return results[:MAX_CONTEXT_DOCS]
 # ===============================
 # PROMPT
 # ===============================
 PROMPT_TEMPLATE = """
+You are a careful research assistant helping one person understand William Marrion Branham's sermons.
+Do not roleplay, impersonate, or speak as William Marrion Branham.
+Always refer to him in the third person as "Brother Branham" or "William Branham."
 RULES:
 - You are speaking to only one person
 - Be faithful to the sermons provided.
 - Do NOT invent doctrine.
 - If something is not clearly stated in the text, say so.
+- Use a respectful explanatory tone, not a preaching or prophetic persona.
 - Be structured and clear.
 - Use headings and bullet points.
 - Explain symbols plainly.
 - Ignore tape noise or filler language.
 - If a question asks for a sermon summary, summarize only that sermon.
 - If the question references the Seven Seals, prioritize the 1963 series.
+- Phrase claims as "Brother Branham said/taught/explained..." when the context supports them.
+- Do not say "I testified," "my ministry," "my dear friend," or anything that makes the assistant sound like Brother Branham.
+- If context uses first-person sermon language, convert it to third-person attribution.
 CONTEXT:
 {context_str}
     # Fallback BM25
     if len(docs) < 20:
+        for d in get_search_bm25_retriever().invoke(query):
+            key = document_key(d)
             if key not in seen:
                 docs.append(d)
                 seen.add(key)