Adoption committed on
Commit
73a13e0
·
verified ·
1 Parent(s): 4fafa21

Update src/app.py

Browse files
Files changed (1) hide show
  1. src/app.py +31 -31
src/app.py CHANGED
@@ -21,13 +21,12 @@ INDEX_NAME = "branham-index"
21
  BASE_DIR = os.path.dirname(os.path.abspath(__file__))
22
  CHUNKS_FILE = os.path.join(BASE_DIR, "sermon_chunks.pkl")
23
 
24
- # Verify Local File
25
  if not os.path.exists(CHUNKS_FILE):
26
  print(f"⚠️ WARNING: Pickle file not found at: {CHUNKS_FILE}")
27
  else:
28
  print(f"✅ SUCCESS: Pickle file found at: {CHUNKS_FILE}")
29
 
30
- # --- SEARCH ENGINE (PURE LOCAL - NO VECTORS) ---
31
  def search_archives(query):
32
  status_log = []
33
  results = []
@@ -38,21 +37,25 @@ def search_archives(query):
38
  chunks = pickle.load(f)
39
 
40
  status_log.append(f"📂 Scanning {len(chunks)} local paragraphs...")
41
- query_lower = query.lower().strip()
42
 
43
- # STRATEGY 1: FILENAME MATCH (Priority)
44
- # If query is "First Seal", grab paragraphs from "63-0318 The First Seal.pdf"
45
- filename_matches = [
46
- doc for doc in chunks
47
- if query_lower in doc.metadata.get('source', '').lower()
48
- ]
49
 
 
 
 
 
 
 
 
 
 
 
50
  if filename_matches:
51
  status_log.append(f"📼 Found {len(filename_matches)} chunks from specific Tape(s).")
52
  results.extend(filename_matches)
53
 
54
- # STRATEGY 2: CONTENT MATCH
55
- # Also grab exact text matches
56
  content_matches = [
57
  doc for doc in chunks
58
  if query_lower in doc.page_content.lower()
@@ -63,7 +66,6 @@ def search_archives(query):
63
  unique_results = []
64
  seen_ids = set()
65
  for doc in results:
66
- # Create a unique signature for the doc
67
  sig = doc.page_content[:50]
68
  if sig not in seen_ids:
69
  unique_results.append(doc)
@@ -86,7 +88,7 @@ def search_archives(query):
86
  status_log.append("โŒ Pickle file missing. Cannot search.")
87
  return [], status_log
88
 
89
- # --- RAG CHAIN (The Chat Tool - SERMON AWARE) ---
90
  def get_rag_chain():
91
 
92
  class SmartRetriever(BaseRetriever):
@@ -102,27 +104,26 @@ def get_rag_chain():
102
  with open(CHUNKS_FILE, "rb") as f:
103
  chunks = pickle.load(f)
104
 
105
- query_lower = query.lower()
106
-
107
- # --- PRIORITY 1: IS IT A SERMON TITLE? ---
108
- # If the user asks about "The First Seal", we want chunks FROM that tape.
109
- title_matches = [
110
- doc for doc in chunks
111
- if query_lower in doc.metadata.get('source', '').lower()
112
- ]
 
113
 
114
  if title_matches:
115
- print(f"📼 Identified Sermon Title Match! Added {len(title_matches)} chunks from the specific tape.")
116
- # Add a good spread of chunks from the sermon (up to 40)
117
- # We take the *middle* chunks usually, as that's where the teaching is.
118
- # For simplicity, we take the first 40 found.
119
- for doc in title_matches[:40]:
120
  if doc.page_content not in seen_content:
121
  final_docs.append(doc)
122
  seen_content.add(doc.page_content)
123
 
124
- # --- PRIORITY 2: BM25 KEYWORD SEARCH ---
125
- # We still run this to find cross-references in other tapes
126
  keyword_retriever = BM25Retriever.from_documents(chunks)
127
  keyword_retriever.k = 40
128
  local_matches = keyword_retriever.invoke(query)
@@ -135,8 +136,7 @@ def get_rag_chain():
135
  except Exception as e:
136
  print(f"⚠️ Local Search Warning: {e}")
137
 
138
- # --- PRIORITY 3: CLOUD LOOKUP ---
139
- print("☁️ Checking Cloud...")
140
  try:
141
  embeddings = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004")
142
  vector_store = PineconeVectorStore(index_name=INDEX_NAME, embedding=embeddings)
@@ -167,7 +167,7 @@ You are a doctrinal study assistant for William Branham's Message teachings.
167
 
168
  INSTRUCTIONS:
169
  1. **Read the Context:** I have provided quotes from the sermons.
170
- 2. **Sermon Focus:** If the User asks for a summary of a SPECIFIC sermon (e.g., "The First Seal"), focus on the TEACHING of that sermon (the symbols, the meaning, the revelation), not just the introduction or history of it.
171
  3. **Synthesis:** Combine the information into a smooth, easy-to-read explanation.
172
  4. **NO CITATIONS:** Do NOT use parenthetical citations like (54, 12).
173
  5. **Accuracy:** Stick strictly to what the quotes say.
 
21
  BASE_DIR = os.path.dirname(os.path.abspath(__file__))
22
  CHUNKS_FILE = os.path.join(BASE_DIR, "sermon_chunks.pkl")
23
 
 
24
  if not os.path.exists(CHUNKS_FILE):
25
  print(f"⚠️ WARNING: Pickle file not found at: {CHUNKS_FILE}")
26
  else:
27
  print(f"✅ SUCCESS: Pickle file found at: {CHUNKS_FILE}")
28
 
29
+ # --- SEARCH ENGINE (SMART MATCHING) ---
30
  def search_archives(query):
31
  status_log = []
32
  results = []
 
37
  chunks = pickle.load(f)
38
 
39
  status_log.append(f"📂 Scanning {len(chunks)} local paragraphs...")
 
40
 
41
+ # NORMALIZE QUERY: "The First Seal" -> "the first seal"
42
+ query_lower = query.lower().strip()
 
 
 
 
43
 
44
+ # STRATEGY 1: FILENAME MATCH (Ignore Underscores)
45
+ filename_matches = []
46
+ for doc in chunks:
47
+ # Get filename, lowercase it, replace underscores with spaces
48
+ # "63_0318_The_First_Seal.pdf" -> "63 0318 the first seal pdf"
49
+ fname_clean = doc.metadata.get('source', '').lower().replace('_', ' ').replace('-', ' ')
50
+
51
+ if query_lower in fname_clean:
52
+ filename_matches.append(doc)
53
+
54
  if filename_matches:
55
  status_log.append(f"📼 Found {len(filename_matches)} chunks from specific Tape(s).")
56
  results.extend(filename_matches)
57
 
58
+ # STRATEGY 2: CONTENT MATCH (Standard)
 
59
  content_matches = [
60
  doc for doc in chunks
61
  if query_lower in doc.page_content.lower()
 
66
  unique_results = []
67
  seen_ids = set()
68
  for doc in results:
 
69
  sig = doc.page_content[:50]
70
  if sig not in seen_ids:
71
  unique_results.append(doc)
 
88
  status_log.append("โŒ Pickle file missing. Cannot search.")
89
  return [], status_log
90
 
91
+ # --- RAG CHAIN (SMART RETRIEVER) ---
92
  def get_rag_chain():
93
 
94
  class SmartRetriever(BaseRetriever):
 
104
  with open(CHUNKS_FILE, "rb") as f:
105
  chunks = pickle.load(f)
106
 
107
+ query_clean = query.lower().strip()
108
+
109
+ # --- PRIORITY 1: SMART FILENAME MATCH ---
110
+ # Replaces underscores so "First Seal" matches "First_Seal"
111
+ title_matches = []
112
+ for doc in chunks:
113
+ fname_clean = doc.metadata.get('source', '').lower().replace('_', ' ').replace('-', ' ')
114
+ if query_clean in fname_clean:
115
+ title_matches.append(doc)
116
 
117
  if title_matches:
118
+ print(f"📼 Sermon Title Match! Added {len(title_matches)} chunks.")
119
+ # If the user asked for a specific tape, GIVE THEM THE TAPE.
120
+ # We add up to 60 chunks from that specific tape to ensure the AI reads the whole thing.
121
+ for doc in title_matches[:60]:
 
122
  if doc.page_content not in seen_content:
123
  final_docs.append(doc)
124
  seen_content.add(doc.page_content)
125
 
126
+ # --- PRIORITY 2: BM25 SEARCH ---
 
127
  keyword_retriever = BM25Retriever.from_documents(chunks)
128
  keyword_retriever.k = 40
129
  local_matches = keyword_retriever.invoke(query)
 
136
  except Exception as e:
137
  print(f"⚠️ Local Search Warning: {e}")
138
 
139
+ # --- PRIORITY 3: CLOUD ---
 
140
  try:
141
  embeddings = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004")
142
  vector_store = PineconeVectorStore(index_name=INDEX_NAME, embedding=embeddings)
 
167
 
168
  INSTRUCTIONS:
169
  1. **Read the Context:** I have provided quotes from the sermons.
170
+ 2. **Sermon Focus:** If the quotes come from a SPECIFIC sermon the user asked about (e.g., "The First Seal"), summarize the MAIN TEACHING of that sermon (the symbols, the revelation), not just the intro.
171
  3. **Synthesis:** Combine the information into a smooth, easy-to-read explanation.
172
  4. **NO CITATIONS:** Do NOT use parenthetical citations like (54, 12).
173
  5. **Accuracy:** Stick strictly to what the quotes say.