Spaces:

Adoption
/

7th_handle

Sleeping

App Files Community

Adoption committed on Dec 31, 2025

Commit

4fafa21

verified ·

1 Parent(s): 25c058b

Update src/app.py

Browse files

Files changed (1) hide show

src/app.py +67 -35

src/app.py CHANGED Viewed

@@ -29,10 +29,6 @@ else:
 # --- SEARCH ENGINE (PURE LOCAL - NO VECTORS) ---
 def search_archives(query):
-    """
-    Search Mode: Scans local files strictly.
-    Returns ALL matches found (up to 1000).
-    """
     status_log = []
     results = []
@@ -44,18 +40,44 @@ def search_archives(query):
             status_log.append(f"📂 Scanning {len(chunks)} local paragraphs...")
             query_lower = query.lower().strip()
-            # Find ALL Matches (No Limit)
-            results = [doc for doc in chunks if query_lower in doc.page_content.lower()]
             # Safety Check
-            total_found = len(results)
             if total_found > 1000:
-                results = results[:1000]
-                status_log.append(f"⚠️ Found {total_found} matches! Showing first 1000 to prevent crash.")
             else:
-                status_log.append(f"✅ Found {total_found} exact matches.")
-            return results, status_log
         except Exception as e:
             status_log.append(f"❌ Local Load Error: {e}")
@@ -64,7 +86,7 @@ def search_archives(query):
         status_log.append("❌ Pickle file missing. Cannot search.")
         return [], status_log
-# --- RAG CHAIN (The Chat Tool - BIG CONTEXT MODE) ---
 def get_rag_chain():
     class SmartRetriever(BaseRetriever):
@@ -75,70 +97,80 @@ def get_rag_chain():
             final_docs = []
             seen_content = set()
-            # --- PHASE A: LOCAL LOOKUP (BM25 - TOP 60) ---
             if os.path.exists(CHUNKS_FILE):
                 try:
                     with open(CHUNKS_FILE, "rb") as f:
                         chunks = pickle.load(f)
-                    # BM25 is better than simple keywords. It finds "First Seal" even if you type "list seals".
                     keyword_retriever = BM25Retriever.from_documents(chunks)
-                    keyword_retriever.k = 60  # GRAB 60 CHUNKS!
                     local_matches = keyword_retriever.invoke(query)
                     for doc in local_matches:
                         if doc.page_content not in seen_content:
                             final_docs.append(doc)
                             seen_content.add(doc.page_content)
                 except Exception as e:
                     print(f"⚠️ Local Search Warning: {e}")
-            # --- PHASE B: CLOUD LOOKUP (TOP 40 - NO FILTERS) ---
             print("☁️ Checking Cloud...")
             try:
                 embeddings = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004")
                 vector_store = PineconeVectorStore(index_name=INDEX_NAME, embedding=embeddings)
-                # We removed the year filter so it finds everything relevant.
-                retriever = vector_store.as_retriever(search_kwargs={"k": 40})
                 cloud_docs = retriever.invoke(query)
                 for doc in cloud_docs:
                     if doc.page_content not in seen_content:
                         final_docs.append(doc)
                         seen_content.add(doc.page_content)
-                print(f"✅ Added {len(cloud_docs)} cloud matches.")
             except Exception as e:
                 print(f"❌ Cloud Error: {e}")
-            # NO RERANKING. Just send all 100 docs to the AI.
             return final_docs
-    # 2. SETUP LLM (Gemini 1.5 Pro)
     google_key = os.environ.get("GOOGLE_API_KEY") or st.secrets.get("GOOGLE_API_KEY")
     os.environ["GOOGLE_API_KEY"] = google_key
-    # "gemini-1.5-pro-latest" has a huge context window. It can handle this load easily.
     llm = ChatGoogleGenerativeAI(
         model="gemini-2.5-flash",
         temperature=0.3,
         convert_system_message_to_human=True
     )
-    # 3. PROMPT (NATURAL & ACCURATE)
     template = """
 You are a doctrinal study assistant for William Branham's Message teachings.
-Your goal is to answer the user's question by synthesizing the provided CONTEXT into a smooth, easy-to-read explanation.
 INSTRUCTIONS:
-1.  **Read the Context:** Look at all the provided quotes (there are many).
-2.  **Identify the Answer:** Even if the answer is spread across multiple quotes, piece it together.
-3.  **Natural Tone:** Write in normal, comfortable paragraphs.
-4.  **NO CITATIONS:** Do NOT use parenthetical citations like (54, 12). Just tell the truth of what is written.
-5.  **Accuracy:** Do not add a different meaning. Stick strictly to what the quotes say.
-6.  **Correction Logic:** If the context contains a later correction (e.g., from the Seven Seals), prioritize that explanation.
 CONTEXT:
 {context_str}
@@ -152,7 +184,7 @@ ANSWER:
     chain = RetrievalQA.from_chain_type(
         llm=llm,
-        chain_type="stuff", # "Stuff" puts all 100 docs in at once.
         retriever=SmartRetriever(),
         return_source_documents=True,
         chain_type_kwargs={"prompt": PROMPT, "document_variable_name": "context_str"},

 # --- SEARCH ENGINE (PURE LOCAL - NO VECTORS) ---
 def search_archives(query):
     status_log = []
     results = []
             status_log.append(f"📂 Scanning {len(chunks)} local paragraphs...")
             query_lower = query.lower().strip()
+            # STRATEGY 1: FILENAME MATCH (Priority)
+            # If query is "First Seal", grab paragraphs from "63-0318 The First Seal.pdf"
+            filename_matches = [
+                doc for doc in chunks
+                if query_lower in doc.metadata.get('source', '').lower()
+            ]
+            if filename_matches:
+                status_log.append(f"📼 Found {len(filename_matches)} chunks from specific Tape(s).")
+                results.extend(filename_matches)
+            # STRATEGY 2: CONTENT MATCH
+            # Also grab exact text matches
+            content_matches = [
+                doc for doc in chunks
+                if query_lower in doc.page_content.lower()
+            ]
+            results.extend(content_matches)
+            # Deduplicate
+            unique_results = []
+            seen_ids = set()
+            for doc in results:
+                # Create a unique signature for the doc
+                sig = doc.page_content[:50]
+                if sig not in seen_ids:
+                    unique_results.append(doc)
+                    seen_ids.add(sig)
             # Safety Check
+            total_found = len(unique_results)
             if total_found > 1000:
+                unique_results = unique_results[:1000]
+                status_log.append(f"⚠️ Found {total_found} matches! Showing first 1000.")
             else:
+                status_log.append(f"✅ Found {total_found} unique matches.")
+            return unique_results, status_log
         except Exception as e:
             status_log.append(f"❌ Local Load Error: {e}")
         status_log.append("❌ Pickle file missing. Cannot search.")
         return [], status_log
+# --- RAG CHAIN (The Chat Tool - SERMON AWARE) ---
 def get_rag_chain():
     class SmartRetriever(BaseRetriever):
             final_docs = []
             seen_content = set()
             if os.path.exists(CHUNKS_FILE):
                 try:
                     with open(CHUNKS_FILE, "rb") as f:
                         chunks = pickle.load(f)
+                    query_lower = query.lower()
+                    # --- PRIORITY 1: IS IT A SERMON TITLE? ---
+                    # If the user asks about "The First Seal", we want chunks FROM that tape.
+                    title_matches = [
+                        doc for doc in chunks
+                        if query_lower in doc.metadata.get('source', '').lower()
+                    ]
+                    if title_matches:
+                        print(f"📼 Identified Sermon Title Match! Added {len(title_matches)} chunks from the specific tape.")
+                        # Add up to 40 chunks from the matched sermon.
+                        # NOTE: the teaching is usually in the *middle* chunks of a tape, but
+                        # for simplicity we take the first 40 found (not a spread across the sermon).
+                        for doc in title_matches[:40]:
+                            if doc.page_content not in seen_content:
+                                final_docs.append(doc)
+                                seen_content.add(doc.page_content)
+                    # --- PRIORITY 2: BM25 KEYWORD SEARCH ---
+                    # We still run this to find cross-references in other tapes
                     keyword_retriever = BM25Retriever.from_documents(chunks)
+                    keyword_retriever.k = 40
                     local_matches = keyword_retriever.invoke(query)
                     for doc in local_matches:
                         if doc.page_content not in seen_content:
                             final_docs.append(doc)
                             seen_content.add(doc.page_content)
                 except Exception as e:
                     print(f"⚠️ Local Search Warning: {e}")
+            # --- PRIORITY 3: CLOUD LOOKUP ---
             print("☁️ Checking Cloud...")
             try:
                 embeddings = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004")
                 vector_store = PineconeVectorStore(index_name=INDEX_NAME, embedding=embeddings)
+                retriever = vector_store.as_retriever(search_kwargs={"k": 20})
                 cloud_docs = retriever.invoke(query)
                 for doc in cloud_docs:
                     if doc.page_content not in seen_content:
                         final_docs.append(doc)
                         seen_content.add(doc.page_content)
             except Exception as e:
                 print(f"❌ Cloud Error: {e}")
             return final_docs
+    # 2. SETUP LLM
     google_key = os.environ.get("GOOGLE_API_KEY") or st.secrets.get("GOOGLE_API_KEY")
     os.environ["GOOGLE_API_KEY"] = google_key
     llm = ChatGoogleGenerativeAI(
         model="gemini-2.5-flash",
         temperature=0.3,
         convert_system_message_to_human=True
     )
+    # 3. PROMPT
     template = """
 You are a doctrinal study assistant for William Branham's Message teachings.
 INSTRUCTIONS:
+1.  **Read the Context:** I have provided quotes from the sermons.
+2.  **Sermon Focus:** If the User asks for a summary of a SPECIFIC sermon (e.g., "The First Seal"), focus on the TEACHING of that sermon (the symbols, the meaning, the revelation), not just the introduction or history of it.
+3.  **Synthesis:** Combine the information into a smooth, easy-to-read explanation.
+4.  **NO CITATIONS:** Do NOT use parenthetical citations like (54, 12).
+5.  **Accuracy:** Stick strictly to what the quotes say.
 CONTEXT:
 {context_str}
     chain = RetrievalQA.from_chain_type(
         llm=llm,
+        chain_type="stuff",
         retriever=SmartRetriever(),
         return_source_documents=True,
         chain_type_kwargs={"prompt": PROMPT, "document_variable_name": "context_str"},