Update src/app.py
src/app.py CHANGED (+171 -46)

@@ -1,79 +1,204 @@
Before:

 import os
-import
 import streamlit as st
 from dotenv import load_dotenv

-#
-from langchain_google_genai import
 from langchain_pinecone import PineconeVectorStore
 from langchain_core.prompts import PromptTemplate
 from langchain.chains import RetrievalQA

 load_dotenv()

-# --- CONFIGURATION ---
 INDEX_NAME = "branham-index"

-
-
-# Robust check: Looks for keys in HF Environment Variables first, then Streamlit Secrets
-pinecone_key = os.environ.get("PINECONE_API_KEY") or st.secrets.get("PINECONE_API_KEY")
-google_key = os.environ.get("GOOGLE_API_KEY") or st.secrets.get("GOOGLE_API_KEY")

-
-

-
-
-os.environ["GOOGLE_API_KEY"] = google_key

-
-
-
-

-
-
-
-

-
-

-
-
 llm = ChatGoogleGenerativeAI(
-    model="gemini-
-    temperature=0.3,
     convert_system_message_to_human=True
 )

-#
-
-template = """You are William Marion Branham.

 INSTRUCTIONS:
-- Answer
--
--
--
--

 CONTEXT:
-{

-
-
-BROTHER BRANHAM'S REPLY:"""
-
-PROMPT = PromptTemplate(template=template, input_variables=["context", "question"])

 chain = RetrievalQA.from_chain_type(
     llm=llm,
     chain_type="stuff",
-    retriever=
     return_source_documents=True,
-    chain_type_kwargs={"prompt": PROMPT}
 )
-
-    return chain
After:

 import os
+import pickle
 import streamlit as st
+from typing import List
 from dotenv import load_dotenv

+# LangChain Imports
+from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
 from langchain_pinecone import PineconeVectorStore
 from langchain_core.prompts import PromptTemplate
 from langchain.chains import RetrievalQA
+from langchain_core.documents import Document
+from langchain_core.retrievers import BaseRetriever
+from langchain_core.callbacks import CallbackManagerForRetrieverRun
+from langchain_community.retrievers import BM25Retriever

 load_dotenv()

+# --- CONFIGURATION (PATH FIX) ---
 INDEX_NAME = "branham-index"

+# 1. Get the directory where THIS file is (src/)
+CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))

+# 2. Get the Parent Directory (Root/)
+ROOT_DIR = os.path.dirname(CURRENT_DIR)

+# 3. Look for the file in the Root
+CHUNKS_FILE = os.path.join(ROOT_DIR, "sermon_chunks.pkl")

+# Fallback: If not in root, check current folder (just in case)
+if not os.path.exists(CHUNKS_FILE):
+    CHUNKS_FILE = os.path.join(CURRENT_DIR, "sermon_chunks.pkl")
+
+# Verify
+if not os.path.exists(CHUNKS_FILE):
+    print(f"⚠️ WARNING: Pickle file not found at: {CHUNKS_FILE}")
+else:
+    print(f"✅ SUCCESS: Pickle file found at: {CHUNKS_FILE}")
+
+# --- SEARCH ENGINE (SMART MATCHING) ---
+def search_archives(query):
+    """
+    Search Mode: Scans local file.
+    Features: Unlimited results, Exact filename matching.
+    """
+    status_log = []
+    results = []

+    if os.path.exists(CHUNKS_FILE):
+        try:
+            with open(CHUNKS_FILE, "rb") as f:
+                chunks = pickle.load(f)
+
+            status_log.append(f"📂 Scanning {len(chunks)} local paragraphs...")
+            query_lower = query.lower().strip()
+
+            # STRATEGY 1: FILENAME MATCH (Ignore Underscores)
+            filename_matches = []
+            for doc in chunks:
+                fname_clean = doc.metadata.get('source', '').lower().replace('_', ' ').replace('-', ' ')
+                if query_lower in fname_clean:
+                    filename_matches.append(doc)
+
+            if filename_matches:
+                status_log.append(f"📼 Found {len(filename_matches)} chunks from specific Tape(s).")
+                results.extend(filename_matches)
+
+            # STRATEGY 2: CONTENT MATCH (Standard)
+            content_matches = [
+                doc for doc in chunks
+                if query_lower in doc.page_content.lower()
+            ]
+            results.extend(content_matches)
+
+            # Deduplicate
+            unique_results = []
+            seen_ids = set()
+            for doc in results:
+                sig = doc.page_content[:50]
+                if sig not in seen_ids:
+                    unique_results.append(doc)
+                    seen_ids.add(sig)
+
+            # Safety Check
+            total_found = len(unique_results)
+            if total_found > 1000:
+                unique_results = unique_results[:1000]
+                status_log.append(f"⚠️ Found {total_found} matches! Showing first 1000.")
+            else:
+                status_log.append(f"✅ Found {total_found} unique matches.")
+
+            return unique_results, status_log
+
+        except Exception as e:
+            status_log.append(f"❌ Local Load Error: {e}")
+            return [], status_log
+    else:
+        status_log.append("❌ Pickle file missing. Cannot search.")
+        return [], status_log
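A minimal sketch of how search_archives might be wired into the Streamlit UI, assuming the module is importable as src.app (the page title, widget, and result limit below are illustrative, not part of this commit):

# Hypothetical Streamlit front end for search_archives (illustrative only).
import streamlit as st
from src.app import search_archives  # assumes src/ is on the import path

st.title("Sermon Archive Search")
query = st.text_input("Search the archives")

if query:
    results, status_log = search_archives(query)
    for line in status_log:
        st.caption(line)              # scan/progress messages
    for doc in results[:50]:          # render a manageable first page
        st.markdown(f"**{doc.metadata.get('source', 'unknown')}**")
        st.write(doc.page_content)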
+
+# --- RAG CHAIN (SMART RETRIEVER) ---
+def get_rag_chain():

+    class SmartRetriever(BaseRetriever):
+        def _get_relevant_documents(
+            self, query: str, *, run_manager: CallbackManagerForRetrieverRun = None
+        ) -> List[Document]:
+            print(f"🧠 Chat is thinking about: '{query}'")
+            final_docs = []
+            seen_content = set()
+
+            if os.path.exists(CHUNKS_FILE):
+                try:
+                    with open(CHUNKS_FILE, "rb") as f:
+                        chunks = pickle.load(f)
+
+                    query_clean = query.lower().strip()
+
+                    # --- PRIORITY 1: SMART FILENAME MATCH ---
+                    title_matches = []
+                    for doc in chunks:
+                        fname_clean = doc.metadata.get('source', '').lower().replace('_', ' ').replace('-', ' ')
+                        if query_clean in fname_clean:
+                            title_matches.append(doc)
+
+                    if title_matches:
+                        print(f"📼 Sermon Title Match! Added {len(title_matches)} chunks.")
+                        # Increase to 80 chunks to get the FULL sermon depth for teaching
+                        for doc in title_matches[:80]:
+                            if doc.page_content not in seen_content:
+                                final_docs.append(doc)
+                                seen_content.add(doc.page_content)

+                    # --- PRIORITY 2: BM25 SEARCH ---
+                    keyword_retriever = BM25Retriever.from_documents(chunks)
+                    keyword_retriever.k = 40
+                    local_matches = keyword_retriever.invoke(query)
+
+                    for doc in local_matches:
+                        if doc.page_content not in seen_content:
+                            final_docs.append(doc)
+                            seen_content.add(doc.page_content)
+
+                except Exception as e:
+                    print(f"⚠️ Local Search Warning: {e}")
+
+            # --- PRIORITY 3: CLOUD ---
+            try:
+                embeddings = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004")
+                vector_store = PineconeVectorStore(index_name=INDEX_NAME, embedding=embeddings)
+                retriever = vector_store.as_retriever(search_kwargs={"k": 20})
+                cloud_docs = retriever.invoke(query)
+                for doc in cloud_docs:
+                    if doc.page_content not in seen_content:
+                        final_docs.append(doc)
+                        seen_content.add(doc.page_content)
+            except Exception as e:
+                print(f"❌ Cloud Error: {e}")
+
+            return final_docs
+
+    # 2. SETUP LLM
+    google_key = os.environ.get("GOOGLE_API_KEY") or st.secrets.get("GOOGLE_API_KEY")
+    os.environ["GOOGLE_API_KEY"] = google_key
+
     llm = ChatGoogleGenerativeAI(
+        model="gemini-1.5-pro-latest",
+        temperature=0.3,
         convert_system_message_to_human=True
     )

+    # 3. PROMPT (STRUCTURED STUDY MODE)
+    template = """You are William Marion Branham ai.

 INSTRUCTIONS:
+- Answer as a Teacher and Evangelist.
+- **STRUCTURE IS MANDATORY:** Do not just write paragraphs. Break the answer down into **Key Elements** (e.g., The Symbol, The Identity, The Meaning).
+- Use **Bullet Points** to list specific details found in the text.
+- If the text describes a symbol (like a Horse, Rider, Beast), explicitly define what each represents based on the quotes.
+- Use a humble, 1950s Southern preaching tone, but keep it clear and organized.
+- Prioritize the **1963 Seven Seals** teaching if the topic is about the Seals.
+- IGNORE irrelevant noise (tape gaps, prayer lines).
+- **NO CITATIONS:** Do not use parenthetical numbers like (54).

 CONTEXT:
+{context_str}

+QUESTION: {question}

+ANSWER:
+"""
+
+    PROMPT = PromptTemplate(template=template, input_variables=["context_str", "question"])
+
     chain = RetrievalQA.from_chain_type(
         llm=llm,
         chain_type="stuff",
+        retriever=SmartRetriever(),
         return_source_documents=True,
+        chain_type_kwargs={"prompt": PROMPT, "document_variable_name": "context_str"},
+        input_key="question"
     )
+    return chain
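Because the chain is built with input_key="question" and return_source_documents=True, callers pass a {"question": ...} dict and get back both the generated answer and the supporting chunks. A minimal usage sketch (the query text is illustrative):

# Illustrative call site for the chain assembled above.
chain = get_rag_chain()
response = chain.invoke({"question": "What do the Seven Seals teach?"})

print(response["result"])                    # the generated answer
for doc in response["source_documents"]:     # the chunks the answer drew on
    print(doc.metadata.get("source", "unknown"))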
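One design note: as committed, SmartRetriever re-opens sermon_chunks.pkl and rebuilds the BM25 index on every question. A hedged sketch of how that loading could be memoized with Streamlit's resource cache (the helper names are hypothetical):

# Hypothetical caching layer: build the expensive objects once per process.
import pickle
import streamlit as st
from langchain_community.retrievers import BM25Retriever

@st.cache_resource
def load_chunks(path: str):
    # Deserialize the pre-chunked sermon documents a single time.
    with open(path, "rb") as f:
        return pickle.load(f)

@st.cache_resource
def build_bm25(path: str, k: int = 40):
    # Reuse one BM25 index across reruns instead of rebuilding it per query.
    retriever = BM25Retriever.from_documents(load_chunks(path))
    retriever.k = k
    return retriever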
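Also worth noting: if neither the environment nor st.secrets holds a key, google_key is None and the os.environ["GOOGLE_API_KEY"] = google_key assignment raises a TypeError. A small guard, assuming that stopping the app with a visible message is the desired behavior:

# Hypothetical guard: fail with a clear message instead of a TypeError
# when no Google API key is configured.
import os
import streamlit as st

google_key = os.environ.get("GOOGLE_API_KEY") or st.secrets.get("GOOGLE_API_KEY")
if not google_key:
    st.error("GOOGLE_API_KEY is not set in the environment or in Streamlit secrets.")
    st.stop()
os.environ["GOOGLE_API_KEY"] = google_key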