Adoption committed on
Commit
f7aed17
·
verified Β·
1 Parent(s): f227e8d

Update src/app.py

Browse files
Files changed (1) hide show
  1. src/app.py +171 -46
src/app.py CHANGED
@@ -1,79 +1,204 @@
1
  import os
2
- import sys
3
  import streamlit as st
 
4
  from dotenv import load_dotenv
5
 
6
- # --- IMPORTS ---
7
- from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
8
  from langchain_pinecone import PineconeVectorStore
9
  from langchain_core.prompts import PromptTemplate
10
  from langchain.chains import RetrievalQA
 
 
 
 
11
 
12
  load_dotenv()
13
 
14
- # --- CONFIGURATION ---
15
  INDEX_NAME = "branham-index"
16
 
17
- def get_rag_chain():
18
- # --- 1. AUTHENTICATION ---
19
- # Robust check: Looks for keys in HF Environment Variables first, then Streamlit Secrets
20
- pinecone_key = os.environ.get("PINECONE_API_KEY") or st.secrets.get("PINECONE_API_KEY")
21
- google_key = os.environ.get("GOOGLE_API_KEY") or st.secrets.get("GOOGLE_API_KEY")
22
 
23
- if not pinecone_key or not google_key:
24
- raise ValueError("❌ Missing API Keys. Please add PINECONE_API_KEY and GOOGLE_API_KEY to Settings.")
25
 
26
- # Set env vars for the libraries
27
- os.environ["PINECONE_API_KEY"] = pinecone_key
28
- os.environ["GOOGLE_API_KEY"] = google_key
29
 
30
- # --- 2. CLOUD CONNECTION ---
31
- # We connect to Pinecone to retrieve both the vectors AND the text content
32
- print("πŸ”Œ Connecting to Pinecone...")
33
- embeddings = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
 
35
- vector_store = PineconeVectorStore(
36
- index_name=INDEX_NAME,
37
- embedding=embeddings
38
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
 
40
- # Retrieve top 5 most relevant paragraphs
41
- retriever = vector_store.as_retriever(search_kwargs={"k": 5})
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
 
43
- # --- 3. MODEL SETUP ---
44
- # We use Gemini 1.5 Flash for speed and low cost
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
  llm = ChatGoogleGenerativeAI(
46
- model="gemini-2.5-flash",
47
- temperature=0.3, # Low temperature for factual accuracy
48
  convert_system_message_to_human=True
49
  )
50
 
51
- # --- 4. THE PROMPT ---
52
- # This instructs the AI how to behave
53
- template = """You are William Marion Branham.
54
 
55
  INSTRUCTIONS:
56
- - Answer the user's question based ONLY on the context provided below.
57
- - The context comes directly from your sermon transcripts.
58
- - Speak in the first person ("I said," "The Lord showed me").
59
- - Use a humble, 1950s Southern preaching dialect.
60
- - If the answer is not in the text, say: "Brother, I don't recall preaching specifically on that detail in these messages."
 
 
 
61
 
62
  CONTEXT:
63
- {context}
64
 
65
- USER QUESTION: {question}
66
-
67
- BROTHER BRANHAM'S REPLY:"""
68
-
69
- PROMPT = PromptTemplate(template=template, input_variables=["context", "question"])
70
 
 
 
 
 
 
71
  chain = RetrievalQA.from_chain_type(
72
  llm=llm,
73
  chain_type="stuff",
74
- retriever=retriever,
75
  return_source_documents=True,
76
- chain_type_kwargs={"prompt": PROMPT}
 
77
  )
78
-
79
- return chain
 
1
  import os
2
+ import pickle
3
  import streamlit as st
4
+ from typing import List
5
  from dotenv import load_dotenv
6
 
7
+ # LangChain Imports
8
+ from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
9
  from langchain_pinecone import PineconeVectorStore
10
  from langchain_core.prompts import PromptTemplate
11
  from langchain.chains import RetrievalQA
12
+ from langchain_core.documents import Document
13
+ from langchain_core.retrievers import BaseRetriever
14
+ from langchain_core.callbacks import CallbackManagerForRetrieverRun
15
+ from langchain_community.retrievers import BM25Retriever
16
 
17
  load_dotenv()
18
 
19
# --- CONFIGURATION (PATH FIX) ---
INDEX_NAME = "branham-index"

# Resolve the pickle's location: prefer the repo root (parent of src/),
# fall back to the directory this file lives in.
CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
ROOT_DIR = os.path.dirname(CURRENT_DIR)

for _candidate_dir in (ROOT_DIR, CURRENT_DIR):
    CHUNKS_FILE = os.path.join(_candidate_dir, "sermon_chunks.pkl")
    if os.path.exists(CHUNKS_FILE):
        break

# Verify at import time so a missing data file is visible in the logs.
if os.path.exists(CHUNKS_FILE):
    print(f"βœ… SUCCESS: Pickle file found at: {CHUNKS_FILE}")
else:
    print(f"⚠️ WARNING: Pickle file not found at: {CHUNKS_FILE}")
40
+
41
# --- SEARCH ENGINE (SMART MATCHING) ---
def search_archives(query):
    """
    Search Mode: Scans local file.

    Strategy 1 matches the query against sermon source filenames (with
    underscores/hyphens normalized to spaces); Strategy 2 does a plain
    case-insensitive substring match on paragraph text. Results are
    deduplicated and capped at 1000 for UI responsiveness.

    Returns:
        (results, status_log): the matching Document objects and a list of
        human-readable status strings for display in the UI.
    """
    status_log = []
    results = []

    if not os.path.exists(CHUNKS_FILE):
        status_log.append("❌ Pickle file missing. Cannot search.")
        return [], status_log

    try:
        # SECURITY NOTE: pickle.load executes arbitrary code from the file —
        # only load pickles that ship with this app, never user uploads.
        with open(CHUNKS_FILE, "rb") as f:
            chunks = pickle.load(f)

        status_log.append(f"πŸ“‚ Scanning {len(chunks)} local paragraphs...")
        query_lower = query.lower().strip()

        # STRATEGY 1: FILENAME MATCH (Ignore Underscores)
        filename_matches = []
        for doc in chunks:
            fname_clean = doc.metadata.get('source', '').lower().replace('_', ' ').replace('-', ' ')
            if query_lower in fname_clean:
                filename_matches.append(doc)

        if filename_matches:
            status_log.append(f"πŸ“Ό Found {len(filename_matches)} chunks from specific Tape(s).")
            results.extend(filename_matches)

        # STRATEGY 2: CONTENT MATCH (Standard)
        results.extend(
            doc for doc in chunks if query_lower in doc.page_content.lower()
        )

        # Deduplicate.
        # FIX: key on the FULL paragraph text, not its first 50 characters —
        # distinct paragraphs sharing an opening phrase were being dropped.
        unique_results = []
        seen_content = set()
        for doc in results:
            if doc.page_content not in seen_content:
                unique_results.append(doc)
                seen_content.add(doc.page_content)

        # Safety Check: cap very broad queries.
        total_found = len(unique_results)
        if total_found > 1000:
            unique_results = unique_results[:1000]
            status_log.append(f"⚠️ Found {total_found} matches! Showing first 1000.")
        else:
            status_log.append(f"βœ… Found {total_found} unique matches.")

        return unique_results, status_log

    except Exception as e:
        # Best-effort: surface the failure in the UI log rather than crashing.
        status_log.append(f"❌ Local Load Error: {e}")
        return [], status_log
101
+
102
# --- RAG CHAIN (SMART RETRIEVER) ---
def get_rag_chain():
    """
    Build the RetrievalQA chain.

    Retrieval is hybrid: (1) sermon-title matches from the local pickle,
    (2) local BM25 keyword search, (3) Pinecone vector search — merged in
    that priority order and deduplicated by exact paragraph text.

    Raises:
        ValueError: if GOOGLE_API_KEY is missing from both the environment
            and Streamlit secrets.
    """

    class SmartRetriever(BaseRetriever):
        # Custom retriever combining local title/BM25 search with cloud vectors.
        def _get_relevant_documents(
            self, query: str, *, run_manager: CallbackManagerForRetrieverRun = None
        ) -> List[Document]:
            print(f"🧠 Chat is thinking about: '{query}'")
            final_docs = []
            seen_content = set()

            if os.path.exists(CHUNKS_FILE):
                try:
                    # SECURITY NOTE: pickle.load executes arbitrary code —
                    # only load pickles bundled with this app.
                    with open(CHUNKS_FILE, "rb") as f:
                        chunks = pickle.load(f)

                    query_clean = query.lower().strip()

                    # --- PRIORITY 1: SMART FILENAME MATCH ---
                    title_matches = []
                    for doc in chunks:
                        fname_clean = doc.metadata.get('source', '').lower().replace('_', ' ').replace('-', ' ')
                        if query_clean in fname_clean:
                            title_matches.append(doc)

                    if title_matches:
                        print(f"πŸ“Ό Sermon Title Match! Added {len(title_matches)} chunks.")
                        # Cap at 80 chunks to get the FULL sermon depth for teaching
                        for doc in title_matches[:80]:
                            if doc.page_content not in seen_content:
                                final_docs.append(doc)
                                seen_content.add(doc.page_content)

                    # --- PRIORITY 2: BM25 SEARCH ---
                    # NOTE(review): the BM25 index is rebuilt on every query;
                    # fine for a small corpus, cache it if latency grows.
                    keyword_retriever = BM25Retriever.from_documents(chunks)
                    keyword_retriever.k = 40
                    local_matches = keyword_retriever.invoke(query)

                    for doc in local_matches:
                        if doc.page_content not in seen_content:
                            final_docs.append(doc)
                            seen_content.add(doc.page_content)

                except Exception as e:
                    # Best-effort: fall through to cloud retrieval on any local failure.
                    print(f"⚠️ Local Search Warning: {e}")

            # --- PRIORITY 3: CLOUD ---
            try:
                embeddings = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004")
                vector_store = PineconeVectorStore(index_name=INDEX_NAME, embedding=embeddings)
                retriever = vector_store.as_retriever(search_kwargs={"k": 20})
                cloud_docs = retriever.invoke(query)
                for doc in cloud_docs:
                    if doc.page_content not in seen_content:
                        final_docs.append(doc)
                        seen_content.add(doc.page_content)
            except Exception as e:
                print(f"❌ Cloud Error: {e}")

            return final_docs

    # 2. SETUP LLM
    # FIX: fail fast with a clear error instead of crashing with an opaque
    # TypeError (os.environ[...] = None) when the key is missing everywhere.
    google_key = os.environ.get("GOOGLE_API_KEY") or st.secrets.get("GOOGLE_API_KEY")
    if not google_key:
        raise ValueError("❌ Missing API Keys. Please add GOOGLE_API_KEY to Settings.")
    os.environ["GOOGLE_API_KEY"] = google_key

    llm = ChatGoogleGenerativeAI(
        model="gemini-1.5-pro-latest",
        temperature=0.3,  # low temperature for factual accuracy
        convert_system_message_to_human=True
    )

    # 3. PROMPT (STRUCTURED STUDY MODE)
    template = """You are William Marion Branham ai.

INSTRUCTIONS:
- Answer as a Teacher and Evangelist.
- **STRUCTURE IS MANDATORY:** Do not just write paragraphs. Break the answer down into **Key Elements** (e.g., The Symbol, The Identity, The Meaning).
- Use **Bullet Points** to list specific details found in the text.
- If the text describes a symbol (like a Horse, Rider, Beast), explicitly define what each represents based on the quotes.
- Use a humble, 1950s Southern preaching tone, but keep it clear and organized.
- Prioritize the **1963 Seven Seals** teaching if the topic is about the Seals.
- IGNORE irrelevant noise (tape gaps, prayer lines).
- **NO CITATIONS:** Do not use parenthetical numbers like (54).

CONTEXT:
{context_str}

QUESTION: {question}

ANSWER:
"""

    PROMPT = PromptTemplate(template=template, input_variables=["context_str", "question"])

    chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=SmartRetriever(),
        return_source_documents=True,
        # "stuff" chain must be told the prompt's context variable name,
        # since the template uses {context_str} rather than the default.
        chain_type_kwargs={"prompt": PROMPT, "document_variable_name": "context_str"},
        input_key="question"
    )
    return chain