Adoption committed on
Commit
4fafa21
·
verified Β·
1 Parent(s): 25c058b

Update src/app.py

Browse files
Files changed (1) hide show
  1. src/app.py +67 -35
src/app.py CHANGED
@@ -29,10 +29,6 @@ else:
29
 
30
  # --- SEARCH ENGINE (PURE LOCAL - NO VECTORS) ---
31
  def search_archives(query):
32
- """
33
- Search Mode: Scans local files strictly.
34
- Returns ALL matches found (up to 1000).
35
- """
36
  status_log = []
37
  results = []
38
 
@@ -44,18 +40,44 @@ def search_archives(query):
44
  status_log.append(f"📂 Scanning {len(chunks)} local paragraphs...")
45
  query_lower = query.lower().strip()
46
 
47
- # Find ALL Matches (No Limit)
48
- results = [doc for doc in chunks if query_lower in doc.page_content.lower()]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
 
50
  # Safety Check
51
- total_found = len(results)
52
  if total_found > 1000:
53
- results = results[:1000]
54
- status_log.append(f"⚠️ Found {total_found} matches! Showing first 1000 to prevent crash.")
55
  else:
56
- status_log.append(f"✅ Found {total_found} exact matches.")
57
 
58
- return results, status_log
59
 
60
  except Exception as e:
61
  status_log.append(f"❌ Local Load Error: {e}")
@@ -64,7 +86,7 @@ def search_archives(query):
64
  status_log.append("❌ Pickle file missing. Cannot search.")
65
  return [], status_log
66
 
67
- # --- RAG CHAIN (The Chat Tool - BIG CONTEXT MODE) ---
68
  def get_rag_chain():
69
 
70
  class SmartRetriever(BaseRetriever):
@@ -75,70 +97,80 @@ def get_rag_chain():
75
  final_docs = []
76
  seen_content = set()
77
 
78
- # --- PHASE A: LOCAL LOOKUP (BM25 - TOP 60) ---
79
  if os.path.exists(CHUNKS_FILE):
80
  try:
81
  with open(CHUNKS_FILE, "rb") as f:
82
  chunks = pickle.load(f)
83
 
84
- # BM25 is better than simple keywords. It finds "First Seal" even if you type "list seals".
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85
  keyword_retriever = BM25Retriever.from_documents(chunks)
86
- keyword_retriever.k = 60 # GRAB 60 CHUNKS!
87
  local_matches = keyword_retriever.invoke(query)
88
 
89
  for doc in local_matches:
90
  if doc.page_content not in seen_content:
91
  final_docs.append(doc)
92
  seen_content.add(doc.page_content)
 
93
  except Exception as e:
94
  print(f"⚠️ Local Search Warning: {e}")
95
 
96
- # --- PHASE B: CLOUD LOOKUP (TOP 40 - NO FILTERS) ---
97
  print("☁️ Checking Cloud...")
98
  try:
99
  embeddings = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004")
100
  vector_store = PineconeVectorStore(index_name=INDEX_NAME, embedding=embeddings)
101
-
102
- # We removed the year filter so it finds everything relevant.
103
- retriever = vector_store.as_retriever(search_kwargs={"k": 40})
104
  cloud_docs = retriever.invoke(query)
105
-
106
  for doc in cloud_docs:
107
  if doc.page_content not in seen_content:
108
  final_docs.append(doc)
109
  seen_content.add(doc.page_content)
110
-
111
- print(f"✅ Added {len(cloud_docs)} cloud matches.")
112
  except Exception as e:
113
  print(f"❌ Cloud Error: {e}")
114
 
115
- # NO RERANKING. Just send all 100 docs to the AI.
116
  return final_docs
117
 
118
- # 2. SETUP LLM (Gemini 1.5 Pro)
119
  google_key = os.environ.get("GOOGLE_API_KEY") or st.secrets.get("GOOGLE_API_KEY")
120
  os.environ["GOOGLE_API_KEY"] = google_key
121
 
122
- # "gemini-1.5-pro-latest" has a huge context window. It can handle this load easily.
123
  llm = ChatGoogleGenerativeAI(
124
  model="gemini-2.5-flash",
125
  temperature=0.3,
126
  convert_system_message_to_human=True
127
  )
128
 
129
- # 3. PROMPT (NATURAL & ACCURATE)
130
  template = """
131
  You are a doctrinal study assistant for William Branham's Message teachings.
132
 
133
- Your goal is to answer the user's question by synthesizing the provided CONTEXT into a smooth, easy-to-read explanation.
134
-
135
  INSTRUCTIONS:
136
- 1. **Read the Context:** Look at all the provided quotes (there are many).
137
- 2. **Identify the Answer:** Even if the answer is spread across multiple quotes, piece it together.
138
- 3. **Natural Tone:** Write in normal, comfortable paragraphs.
139
- 4. **NO CITATIONS:** Do NOT use parenthetical citations like (54, 12). Just tell the truth of what is written.
140
- 5. **Accuracy:** Do not add a different meaning. Stick strictly to what the quotes say.
141
- 6. **Correction Logic:** If the context contains a later correction (e.g., from the Seven Seals), prioritize that explanation.
142
 
143
  CONTEXT:
144
  {context_str}
@@ -152,7 +184,7 @@ ANSWER:
152
 
153
  chain = RetrievalQA.from_chain_type(
154
  llm=llm,
155
- chain_type="stuff", # "Stuff" puts all 100 docs in at once.
156
  retriever=SmartRetriever(),
157
  return_source_documents=True,
158
  chain_type_kwargs={"prompt": PROMPT, "document_variable_name": "context_str"},
 
29
 
30
  # --- SEARCH ENGINE (PURE LOCAL - NO VECTORS) ---
31
  def search_archives(query):
 
 
 
 
32
  status_log = []
33
  results = []
34
 
 
40
  status_log.append(f"📂 Scanning {len(chunks)} local paragraphs...")
41
  query_lower = query.lower().strip()
42
 
43
+ # STRATEGY 1: FILENAME MATCH (Priority)
44
+ # If query is "First Seal", grab paragraphs from "63-0318 The First Seal.pdf"
45
+ filename_matches = [
46
+ doc for doc in chunks
47
+ if query_lower in doc.metadata.get('source', '').lower()
48
+ ]
49
+
50
+ if filename_matches:
51
+ status_log.append(f"📼 Found {len(filename_matches)} chunks from specific Tape(s).")
52
+ results.extend(filename_matches)
53
+
54
+ # STRATEGY 2: CONTENT MATCH
55
+ # Also grab exact text matches
56
+ content_matches = [
57
+ doc for doc in chunks
58
+ if query_lower in doc.page_content.lower()
59
+ ]
60
+ results.extend(content_matches)
61
+
62
+ # Deduplicate
63
+ unique_results = []
64
+ seen_ids = set()
65
+ for doc in results:
66
+ # Create a unique signature for the doc
67
+ sig = doc.page_content[:50]
68
+ if sig not in seen_ids:
69
+ unique_results.append(doc)
70
+ seen_ids.add(sig)
71
 
72
  # Safety Check
73
+ total_found = len(unique_results)
74
  if total_found > 1000:
75
+ unique_results = unique_results[:1000]
76
+ status_log.append(f"⚠️ Found {total_found} matches! Showing first 1000.")
77
  else:
78
+ status_log.append(f"✅ Found {total_found} unique matches.")
79
 
80
+ return unique_results, status_log
81
 
82
  except Exception as e:
83
  status_log.append(f"❌ Local Load Error: {e}")
 
86
  status_log.append("❌ Pickle file missing. Cannot search.")
87
  return [], status_log
88
 
89
+ # --- RAG CHAIN (The Chat Tool - SERMON AWARE) ---
90
  def get_rag_chain():
91
 
92
  class SmartRetriever(BaseRetriever):
 
97
  final_docs = []
98
  seen_content = set()
99
 
 
100
  if os.path.exists(CHUNKS_FILE):
101
  try:
102
  with open(CHUNKS_FILE, "rb") as f:
103
  chunks = pickle.load(f)
104
 
105
+ query_lower = query.lower()
106
+
107
+ # --- PRIORITY 1: IS IT A SERMON TITLE? ---
108
+ # If the user asks about "The First Seal", we want chunks FROM that tape.
109
+ title_matches = [
110
+ doc for doc in chunks
111
+ if query_lower in doc.metadata.get('source', '').lower()
112
+ ]
113
+
114
+ if title_matches:
115
+ print(f"📼 Identified Sermon Title Match! Added {len(title_matches)} chunks from the specific tape.")
116
+ # Add a good spread of chunks from the sermon (up to 40)
117
+ # We take the *middle* chunks usually, as that's where the teaching is.
118
+ # For simplicity, we take the first 40 found.
119
+ for doc in title_matches[:40]:
120
+ if doc.page_content not in seen_content:
121
+ final_docs.append(doc)
122
+ seen_content.add(doc.page_content)
123
+
124
+ # --- PRIORITY 2: BM25 KEYWORD SEARCH ---
125
+ # We still run this to find cross-references in other tapes
126
  keyword_retriever = BM25Retriever.from_documents(chunks)
127
+ keyword_retriever.k = 40
128
  local_matches = keyword_retriever.invoke(query)
129
 
130
  for doc in local_matches:
131
  if doc.page_content not in seen_content:
132
  final_docs.append(doc)
133
  seen_content.add(doc.page_content)
134
+
135
  except Exception as e:
136
  print(f"⚠️ Local Search Warning: {e}")
137
 
138
+ # --- PRIORITY 3: CLOUD LOOKUP ---
139
  print("☁️ Checking Cloud...")
140
  try:
141
  embeddings = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004")
142
  vector_store = PineconeVectorStore(index_name=INDEX_NAME, embedding=embeddings)
143
+ retriever = vector_store.as_retriever(search_kwargs={"k": 20})
 
 
144
  cloud_docs = retriever.invoke(query)
 
145
  for doc in cloud_docs:
146
  if doc.page_content not in seen_content:
147
  final_docs.append(doc)
148
  seen_content.add(doc.page_content)
 
 
149
  except Exception as e:
150
  print(f"❌ Cloud Error: {e}")
151
 
 
152
  return final_docs
153
 
154
+ # 2. SETUP LLM
155
  google_key = os.environ.get("GOOGLE_API_KEY") or st.secrets.get("GOOGLE_API_KEY")
156
  os.environ["GOOGLE_API_KEY"] = google_key
157
 
 
158
  llm = ChatGoogleGenerativeAI(
159
  model="gemini-2.5-flash",
160
  temperature=0.3,
161
  convert_system_message_to_human=True
162
  )
163
 
164
+ # 3. PROMPT
165
  template = """
166
  You are a doctrinal study assistant for William Branham's Message teachings.
167
 
 
 
168
  INSTRUCTIONS:
169
+ 1. **Read the Context:** I have provided quotes from the sermons.
170
+ 2. **Sermon Focus:** If the User asks for a summary of a SPECIFIC sermon (e.g., "The First Seal"), focus on the TEACHING of that sermon (the symbols, the meaning, the revelation), not just the introduction or history of it.
171
+ 3. **Synthesis:** Combine the information into a smooth, easy-to-read explanation.
172
+ 4. **NO CITATIONS:** Do NOT use parenthetical citations like (54, 12).
173
+ 5. **Accuracy:** Stick strictly to what the quotes say.
 
174
 
175
  CONTEXT:
176
  {context_str}
 
184
 
185
  chain = RetrievalQA.from_chain_type(
186
  llm=llm,
187
+ chain_type="stuff",
188
  retriever=SmartRetriever(),
189
  return_source_documents=True,
190
  chain_type_kwargs={"prompt": PROMPT, "document_variable_name": "context_str"},