Spaces:
Sleeping
Sleeping
Update src/app.py
Browse files- src/app.py +31 -31
src/app.py
CHANGED
|
@@ -21,13 +21,12 @@ INDEX_NAME = "branham-index"
|
|
| 21 |
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
|
| 22 |
CHUNKS_FILE = os.path.join(BASE_DIR, "sermon_chunks.pkl")
|
| 23 |
|
| 24 |
-
# Verify Local File
|
| 25 |
if not os.path.exists(CHUNKS_FILE):
|
| 26 |
print(f"⚠️ WARNING: Pickle file not found at: {CHUNKS_FILE}")
|
| 27 |
else:
|
| 28 |
print(f"✅ SUCCESS: Pickle file found at: {CHUNKS_FILE}")
|
| 29 |
|
| 30 |
-
# --- SEARCH ENGINE (
|
| 31 |
def search_archives(query):
|
| 32 |
status_log = []
|
| 33 |
results = []
|
|
@@ -38,21 +37,25 @@ def search_archives(query):
|
|
| 38 |
chunks = pickle.load(f)
|
| 39 |
|
| 40 |
status_log.append(f"🔍 Scanning {len(chunks)} local paragraphs...")
|
| 41 |
-
query_lower = query.lower().strip()
|
| 42 |
|
| 43 |
-
#
|
| 44 |
-
|
| 45 |
-
filename_matches = [
|
| 46 |
-
doc for doc in chunks
|
| 47 |
-
if query_lower in doc.metadata.get('source', '').lower()
|
| 48 |
-
]
|
| 49 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 50 |
if filename_matches:
|
| 51 |
status_log.append(f"📼 Found {len(filename_matches)} chunks from specific Tape(s).")
|
| 52 |
results.extend(filename_matches)
|
| 53 |
|
| 54 |
-
# STRATEGY 2: CONTENT MATCH
|
| 55 |
-
# Also grab exact text matches
|
| 56 |
content_matches = [
|
| 57 |
doc for doc in chunks
|
| 58 |
if query_lower in doc.page_content.lower()
|
|
@@ -63,7 +66,6 @@ def search_archives(query):
|
|
| 63 |
unique_results = []
|
| 64 |
seen_ids = set()
|
| 65 |
for doc in results:
|
| 66 |
-
# Create a unique signature for the doc
|
| 67 |
sig = doc.page_content[:50]
|
| 68 |
if sig not in seen_ids:
|
| 69 |
unique_results.append(doc)
|
|
@@ -86,7 +88,7 @@ def search_archives(query):
|
|
| 86 |
status_log.append("❌ Pickle file missing. Cannot search.")
|
| 87 |
return [], status_log
|
| 88 |
|
| 89 |
-
# --- RAG CHAIN (
|
| 90 |
def get_rag_chain():
|
| 91 |
|
| 92 |
class SmartRetriever(BaseRetriever):
|
|
@@ -102,27 +104,26 @@ def get_rag_chain():
|
|
| 102 |
with open(CHUNKS_FILE, "rb") as f:
|
| 103 |
chunks = pickle.load(f)
|
| 104 |
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
# --- PRIORITY 1:
|
| 108 |
-
#
|
| 109 |
-
title_matches = [
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
|
|
|
| 113 |
|
| 114 |
if title_matches:
|
| 115 |
-
print(f"📼
|
| 116 |
-
#
|
| 117 |
-
# We
|
| 118 |
-
|
| 119 |
-
for doc in title_matches[:40]:
|
| 120 |
if doc.page_content not in seen_content:
|
| 121 |
final_docs.append(doc)
|
| 122 |
seen_content.add(doc.page_content)
|
| 123 |
|
| 124 |
-
# --- PRIORITY 2: BM25
|
| 125 |
-
# We still run this to find cross-references in other tapes
|
| 126 |
keyword_retriever = BM25Retriever.from_documents(chunks)
|
| 127 |
keyword_retriever.k = 40
|
| 128 |
local_matches = keyword_retriever.invoke(query)
|
|
@@ -135,8 +136,7 @@ def get_rag_chain():
|
|
| 135 |
except Exception as e:
|
| 136 |
print(f"⚠️ Local Search Warning: {e}")
|
| 137 |
|
| 138 |
-
# --- PRIORITY 3: CLOUD
|
| 139 |
-
print("☁️ Checking Cloud...")
|
| 140 |
try:
|
| 141 |
embeddings = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004")
|
| 142 |
vector_store = PineconeVectorStore(index_name=INDEX_NAME, embedding=embeddings)
|
|
@@ -167,7 +167,7 @@ You are a doctrinal study assistant for William Branham's Message teachings.
|
|
| 167 |
|
| 168 |
INSTRUCTIONS:
|
| 169 |
1. **Read the Context:** I have provided quotes from the sermons.
|
| 170 |
-
2. **Sermon Focus:** If the
|
| 171 |
3. **Synthesis:** Combine the information into a smooth, easy-to-read explanation.
|
| 172 |
4. **NO CITATIONS:** Do NOT use parenthetical citations like (54, 12).
|
| 173 |
5. **Accuracy:** Stick strictly to what the quotes say.
|
|
|
|
| 21 |
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
|
| 22 |
CHUNKS_FILE = os.path.join(BASE_DIR, "sermon_chunks.pkl")
|
| 23 |
|
|
|
|
| 24 |
if not os.path.exists(CHUNKS_FILE):
|
| 25 |
print(f"⚠️ WARNING: Pickle file not found at: {CHUNKS_FILE}")
|
| 26 |
else:
|
| 27 |
print(f"✅ SUCCESS: Pickle file found at: {CHUNKS_FILE}")
|
| 28 |
|
| 29 |
+
# --- SEARCH ENGINE (SMART MATCHING) ---
|
| 30 |
def search_archives(query):
|
| 31 |
status_log = []
|
| 32 |
results = []
|
|
|
|
| 37 |
chunks = pickle.load(f)
|
| 38 |
|
| 39 |
status_log.append(f"🔍 Scanning {len(chunks)} local paragraphs...")
|
|
|
|
| 40 |
|
| 41 |
+
# NORMALIZE QUERY: "The First Seal" -> "the first seal"
|
| 42 |
+
query_lower = query.lower().strip()
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
|
| 44 |
+
# STRATEGY 1: FILENAME MATCH (Ignore Underscores)
|
| 45 |
+
filename_matches = []
|
| 46 |
+
for doc in chunks:
|
| 47 |
+
# Get filename, lowercase it, replace underscores with spaces
|
| 48 |
+
# "63_0318_The_First_Seal.pdf" -> "63 0318 the first seal pdf"
|
| 49 |
+
fname_clean = doc.metadata.get('source', '').lower().replace('_', ' ').replace('-', ' ')
|
| 50 |
+
|
| 51 |
+
if query_lower in fname_clean:
|
| 52 |
+
filename_matches.append(doc)
|
| 53 |
+
|
| 54 |
if filename_matches:
|
| 55 |
status_log.append(f"📼 Found {len(filename_matches)} chunks from specific Tape(s).")
|
| 56 |
results.extend(filename_matches)
|
| 57 |
|
| 58 |
+
# STRATEGY 2: CONTENT MATCH (Standard)
|
|
|
|
| 59 |
content_matches = [
|
| 60 |
doc for doc in chunks
|
| 61 |
if query_lower in doc.page_content.lower()
|
|
|
|
| 66 |
unique_results = []
|
| 67 |
seen_ids = set()
|
| 68 |
for doc in results:
|
|
|
|
| 69 |
sig = doc.page_content[:50]
|
| 70 |
if sig not in seen_ids:
|
| 71 |
unique_results.append(doc)
|
|
|
|
| 88 |
status_log.append("❌ Pickle file missing. Cannot search.")
|
| 89 |
return [], status_log
|
| 90 |
|
| 91 |
+
# --- RAG CHAIN (SMART RETRIEVER) ---
|
| 92 |
def get_rag_chain():
|
| 93 |
|
| 94 |
class SmartRetriever(BaseRetriever):
|
|
|
|
| 104 |
with open(CHUNKS_FILE, "rb") as f:
|
| 105 |
chunks = pickle.load(f)
|
| 106 |
|
| 107 |
+
query_clean = query.lower().strip()
|
| 108 |
+
|
| 109 |
+
# --- PRIORITY 1: SMART FILENAME MATCH ---
|
| 110 |
+
# Replaces underscores so "First Seal" matches "First_Seal"
|
| 111 |
+
title_matches = []
|
| 112 |
+
for doc in chunks:
|
| 113 |
+
fname_clean = doc.metadata.get('source', '').lower().replace('_', ' ').replace('-', ' ')
|
| 114 |
+
if query_clean in fname_clean:
|
| 115 |
+
title_matches.append(doc)
|
| 116 |
|
| 117 |
if title_matches:
|
| 118 |
+
print(f"📼 Sermon Title Match! Added {len(title_matches)} chunks.")
|
| 119 |
+
# If the user asked for a specific tape, GIVE THEM THE TAPE.
|
| 120 |
+
# We add up to 60 chunks from that specific tape to ensure the AI reads the whole thing.
|
| 121 |
+
for doc in title_matches[:60]:
|
|
|
|
| 122 |
if doc.page_content not in seen_content:
|
| 123 |
final_docs.append(doc)
|
| 124 |
seen_content.add(doc.page_content)
|
| 125 |
|
| 126 |
+
# --- PRIORITY 2: BM25 SEARCH ---
|
|
|
|
| 127 |
keyword_retriever = BM25Retriever.from_documents(chunks)
|
| 128 |
keyword_retriever.k = 40
|
| 129 |
local_matches = keyword_retriever.invoke(query)
|
|
|
|
| 136 |
except Exception as e:
|
| 137 |
print(f"⚠️ Local Search Warning: {e}")
|
| 138 |
|
| 139 |
+
# --- PRIORITY 3: CLOUD ---
|
|
|
|
| 140 |
try:
|
| 141 |
embeddings = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004")
|
| 142 |
vector_store = PineconeVectorStore(index_name=INDEX_NAME, embedding=embeddings)
|
|
|
|
| 167 |
|
| 168 |
INSTRUCTIONS:
|
| 169 |
1. **Read the Context:** I have provided quotes from the sermons.
|
| 170 |
+
2. **Sermon Focus:** If the quotes come from a SPECIFIC sermon the user asked about (e.g., "The First Seal"), summarize the MAIN TEACHING of that sermon (the symbols, the revelation), not just the intro.
|
| 171 |
3. **Synthesis:** Combine the information into a smooth, easy-to-read explanation.
|
| 172 |
4. **NO CITATIONS:** Do NOT use parenthetical citations like (54, 12).
|
| 173 |
5. **Accuracy:** Stick strictly to what the quotes say.
|