Files changed (1) hide show
  1. src/app.py +140 -0
src/app.py ADDED
@@ -0,0 +1,140 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import pickle
3
+ import zipfile
4
+ import sys
5
+ import streamlit as st
6
+ from dotenv import load_dotenv
7
+
8
+ # --- IMPORTS ---
9
+ from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
10
+ from langchain_community.retrievers import BM25Retriever
11
+ from langchain_pinecone import PineconeVectorStore
12
+ from langchain_core.prompts import PromptTemplate
13
+ from langchain.chains import RetrievalQA
14
+
15
+ # Robust Import for Hybrid Search (Handles different LangChain versions)
16
+ try:
17
+     from langchain.retrievers import EnsembleRetriever
18
+ except ImportError:
19
+     from langchain_community.retrievers import EnsembleRetriever
20
+
21
# Load environment variables from a local .env file (no-op if the file is absent).
load_dotenv()

# --- CONFIGURATION ---
INDEX_NAME = "branham-index"       # Pinecone index holding the sermon embeddings
CHUNKS_FILE = "sermon_chunks.pkl"  # pickled document chunks used by the BM25 keyword retriever
CHUNKS_ZIP = "sermon_chunks.zip"   # zipped copy of CHUNKS_FILE, kept under GitHub's 100MB limit
+
28
def setup_keyword_file():
    """Ensure the BM25 chunk file is present, extracting it from the zip if needed.

    GitHub rejects files over 100MB, so the repository ships the compressed
    ``sermon_chunks.zip``; this unpacks ``sermon_chunks.pkl`` into the working
    directory when the app starts. Failures are logged, not raised — the app
    can still run on Pinecone alone.
    """
    # Nothing to do if the pickle is already on disk.
    if os.path.exists(CHUNKS_FILE):
        return

    if not os.path.exists(CHUNKS_ZIP):
        print(f"⚠️ Warning: Neither {CHUNKS_FILE} nor {CHUNKS_ZIP} found.")
        return

    print(f"📦 Unzipping {CHUNKS_ZIP}...")
    try:
        with zipfile.ZipFile(CHUNKS_ZIP, 'r') as archive:
            archive.extractall(".")
    except Exception as e:  # best-effort: keyword search is optional
        print(f"❌ Error unzipping file: {e}")
    else:
        print("✅ Unzip complete.")
45
+
46
def _read_key(name):
    """Return the credential *name* from Streamlit secrets, else the environment.

    BUG FIX: ``st.secrets`` raises (StreamlitSecretNotFoundError /
    FileNotFoundError) when no ``secrets.toml`` exists — the normal situation
    for local development with only a ``.env`` file — so a bare
    ``st.secrets.get(...)`` crashes before the ``or os.getenv(...)`` fallback
    can run. Swallow that failure and fall through to the environment.
    """
    try:
        value = st.secrets.get(name)
    except Exception:
        value = None
    return value or os.getenv(name)


def _load_keyword_retriever():
    """Build the local BM25 keyword retriever, or return None if unavailable.

    BM25 finds exact matches (e.g. searching for 'E-53' finds exactly E-53),
    complementing Pinecone's concept-level vector search.
    """
    print("🔌 Loading Keyword Search...")
    try:
        if not os.path.exists(CHUNKS_FILE):
            print("⚠️ Keyword file missing. Running on Pinecone only.")
            return None
        # NOTE(security): pickle.load executes arbitrary code on load. This is
        # acceptable only because the file ships with the app itself — never
        # point CHUNKS_FILE at untrusted data.
        with open(CHUNKS_FILE, "rb") as f:
            chunks = pickle.load(f)
        retriever = BM25Retriever.from_documents(chunks)
        retriever.k = 5  # match the vector retriever's top-k
        return retriever
    except Exception as e:
        print(f"❌ Failed to load keyword file: {e}")
        return None


def get_rag_chain():
    """Initialize and return the RetrievalQA chain (the "brain" of the app).

    Steps:
        1. Ensure the local keyword file exists (unzip if needed).
        2. Resolve API keys from Streamlit secrets (Cloud) or .env (local).
        3. Connect the Pinecone vector retriever (semantic search).
        4. Load the BM25 keyword retriever (exact-match search), if available.
        5. Merge both into a weighted hybrid EnsembleRetriever.
        6. Wire Gemini + the persona prompt into a RetrievalQA chain.

    Returns:
        A RetrievalQA chain that returns source documents with each answer.

    Raises:
        ValueError: if either PINECONE_API_KEY or GOOGLE_API_KEY is missing.
    """
    # 1. SETUP & KEYS
    setup_keyword_file()

    # Check Streamlit Secrets first (Cloud), then .env (Local).
    pinecone_key = _read_key("PINECONE_API_KEY")
    google_key = _read_key("GOOGLE_API_KEY")

    if not pinecone_key or not google_key:
        raise ValueError("❌ Missing API Keys. Please set PINECONE_API_KEY and GOOGLE_API_KEY in Secrets.")

    # Export so LangChain's integrations pick the keys up automatically.
    os.environ["PINECONE_API_KEY"] = pinecone_key
    os.environ["GOOGLE_API_KEY"] = google_key

    # 2. CLOUD VECTOR SEARCH (Pinecone)
    # Finds "concepts" (e.g. searching for 'marriage' also finds 'wedding').
    print("🔌 Connecting to Pinecone...")
    embeddings = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004")

    vector_store = PineconeVectorStore(
        index_name=INDEX_NAME,
        embedding=embeddings
    )
    vector_retriever = vector_store.as_retriever(search_kwargs={"k": 5})

    # 3. LOCAL KEYWORD SEARCH (BM25)
    keyword_retriever = _load_keyword_retriever()

    # 4. HYBRID RETRIEVER (the merge); fall back to vector-only if BM25 failed.
    if keyword_retriever:
        print("🔗 Linking Hybrid System...")
        final_retriever = EnsembleRetriever(
            retrievers=[vector_retriever, keyword_retriever],
            weights=[0.7, 0.3]  # 70% Vector, 30% Keyword
        )
    else:
        final_retriever = vector_retriever

    # 5. THE MODEL (Gemini)
    llm = ChatGoogleGenerativeAI(
        model="gemini-1.5-flash",
        temperature=0.3,  # low temperature: stay close to the source texts
        convert_system_message_to_human=True
    )

    # 6. THE PERSONA PROMPT
    template = """You are William Marion Branham.

INSTRUCTIONS:
- Answer the user's question based ONLY on the context provided below.
- Speak in the first person ("I said," "The Lord showed me").
- Use a humble, 1950s Southern preaching dialect.
- If the answer is not in the text, say: "Brother, I don't recall preaching specifically on that detail in these messages."

CONTEXT:
{context}

USER QUESTION: {question}

BROTHER BRANHAM'S REPLY:"""

    PROMPT = PromptTemplate(template=template, input_variables=["context", "question"])

    chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",  # stuff all retrieved chunks into one prompt
        retriever=final_retriever,
        return_source_documents=True,
        chain_type_kwargs={"prompt": PROMPT}
    )

    return chain
139
+
140
+