Spaces:
Sleeping
Sleeping
Update src/app.py
Browse files- src/app.py +31 -31
src/app.py
CHANGED
|
@@ -21,13 +21,12 @@ INDEX_NAME = "branham-index"
|
|
| 21 |
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
|
| 22 |
CHUNKS_FILE = os.path.join(BASE_DIR, "sermon_chunks.pkl")
|
| 23 |
|
| 24 |
-
# Verify Local File
|
| 25 |
if not os.path.exists(CHUNKS_FILE):
|
| 26 |
print(f"⚠️ WARNING: Pickle file not found at: {CHUNKS_FILE}")
|
| 27 |
else:
|
| 28 |
print(f"✅ SUCCESS: Pickle file found at: {CHUNKS_FILE}")
|
| 29 |
|
| 30 |
-
# --- SEARCH ENGINE (
|
| 31 |
def search_archives(query):
|
| 32 |
status_log = []
|
| 33 |
results = []
|
|
@@ -38,21 +37,25 @@ def search_archives(query):
|
|
| 38 |
chunks = pickle.load(f)
|
| 39 |
|
| 40 |
status_log.append(f"🔍 Scanning {len(chunks)} local paragraphs...")
|
| 41 |
-
query_lower = query.lower().strip()
|
| 42 |
|
| 43 |
-
#
|
| 44 |
-
|
| 45 |
-
filename_matches = [
|
| 46 |
-
doc for doc in chunks
|
| 47 |
-
if query_lower in doc.metadata.get('source', '').lower()
|
| 48 |
-
]
|
| 49 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 50 |
if filename_matches:
|
| 51 |
status_log.append(f"📼 Found {len(filename_matches)} chunks from specific Tape(s).")
|
| 52 |
results.extend(filename_matches)
|
| 53 |
|
| 54 |
-
# STRATEGY 2: CONTENT MATCH
|
| 55 |
-
# Also grab exact text matches
|
| 56 |
content_matches = [
|
| 57 |
doc for doc in chunks
|
| 58 |
if query_lower in doc.page_content.lower()
|
|
@@ -63,7 +66,6 @@ def search_archives(query):
|
|
| 63 |
unique_results = []
|
| 64 |
seen_ids = set()
|
| 65 |
for doc in results:
|
| 66 |
-
# Create a unique signature for the doc
|
| 67 |
sig = doc.page_content[:50]
|
| 68 |
if sig not in seen_ids:
|
| 69 |
unique_results.append(doc)
|
|
@@ -86,7 +88,7 @@ def search_archives(query):
|
|
| 86 |
status_log.append("❌ Pickle file missing. Cannot search.")
|
| 87 |
return [], status_log
|
| 88 |
|
| 89 |
-
# --- RAG CHAIN (
|
| 90 |
def get_rag_chain():
|
| 91 |
|
| 92 |
class SmartRetriever(BaseRetriever):
|
|
@@ -102,27 +104,26 @@ def get_rag_chain():
|
|
| 102 |
with open(CHUNKS_FILE, "rb") as f:
|
| 103 |
chunks = pickle.load(f)
|
| 104 |
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
# --- PRIORITY 1:
|
| 108 |
-
#
|
| 109 |
-
title_matches = [
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
|
|
|
| 113 |
|
| 114 |
if title_matches:
|
| 115 |
-
print(f"📼
|
| 116 |
-
#
|
| 117 |
-
# We
|
| 118 |
-
|
| 119 |
-
for doc in title_matches[:40]:
|
| 120 |
if doc.page_content not in seen_content:
|
| 121 |
final_docs.append(doc)
|
| 122 |
seen_content.add(doc.page_content)
|
| 123 |
|
| 124 |
-
# --- PRIORITY 2: BM25
|
| 125 |
-
# We still run this to find cross-references in other tapes
|
| 126 |
keyword_retriever = BM25Retriever.from_documents(chunks)
|
| 127 |
keyword_retriever.k = 40
|
| 128 |
local_matches = keyword_retriever.invoke(query)
|
|
@@ -135,8 +136,7 @@ def get_rag_chain():
|
|
| 135 |
except Exception as e:
|
| 136 |
print(f"⚠️ Local Search Warning: {e}")
|
| 137 |
|
| 138 |
-
# --- PRIORITY 3: CLOUD
|
| 139 |
-
print("☁️ Checking Cloud...")
|
| 140 |
try:
|
| 141 |
embeddings = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004")
|
| 142 |
vector_store = PineconeVectorStore(index_name=INDEX_NAME, embedding=embeddings)
|
|
@@ -167,7 +167,7 @@ You are a doctrinal study assistant for William Branham's Message teachings.
|
|
| 167 |
|
| 168 |
INSTRUCTIONS:
|
| 169 |
1. **Read the Context:** I have provided quotes from the sermons.
|
| 170 |
-
2. **Sermon Focus:** If the
|
| 171 |
3. **Synthesis:** Combine the information into a smooth, easy-to-read explanation.
|
| 172 |
4. **NO CITATIONS:** Do NOT use parenthetical citations like (54, 12).
|
| 173 |
5. **Accuracy:** Stick strictly to what the quotes say.
|
|
|
|
| 21 |
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
|
| 22 |
CHUNKS_FILE = os.path.join(BASE_DIR, "sermon_chunks.pkl")
|
| 23 |
|
|
|
|
| 24 |
if not os.path.exists(CHUNKS_FILE):
|
| 25 |
print(f"⚠️ WARNING: Pickle file not found at: {CHUNKS_FILE}")
|
| 26 |
else:
|
| 27 |
print(f"✅ SUCCESS: Pickle file found at: {CHUNKS_FILE}")
|
| 28 |
|
| 29 |
+
# --- SEARCH ENGINE (SMART MATCHING) ---
|
| 30 |
def search_archives(query):
|
| 31 |
status_log = []
|
| 32 |
results = []
|
|
|
|
| 37 |
chunks = pickle.load(f)
|
| 38 |
|
| 39 |
status_log.append(f"🔍 Scanning {len(chunks)} local paragraphs...")
|
|
|
|
| 40 |
|
| 41 |
+
# NORMALIZE QUERY: "The First Seal" -> "the first seal"
|
| 42 |
+
query_lower = query.lower().strip()
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
|
| 44 |
+
# STRATEGY 1: FILENAME MATCH (Ignore Underscores)
|
| 45 |
+
filename_matches = []
|
| 46 |
+
for doc in chunks:
|
| 47 |
+
# Get filename, lowercase it, replace underscores with spaces
|
| 48 |
+
# "63_0318_The_First_Seal.pdf" -> "63 0318 the first seal pdf"
|
| 49 |
+
fname_clean = doc.metadata.get('source', '').lower().replace('_', ' ').replace('-', ' ')
|
| 50 |
+
|
| 51 |
+
if query_lower in fname_clean:
|
| 52 |
+
filename_matches.append(doc)
|
| 53 |
+
|
| 54 |
if filename_matches:
|
| 55 |
status_log.append(f"📼 Found {len(filename_matches)} chunks from specific Tape(s).")
|
| 56 |
results.extend(filename_matches)
|
| 57 |
|
| 58 |
+
# STRATEGY 2: CONTENT MATCH (Standard)
|
|
|
|
| 59 |
content_matches = [
|
| 60 |
doc for doc in chunks
|
| 61 |
if query_lower in doc.page_content.lower()
|
|
|
|
| 66 |
unique_results = []
|
| 67 |
seen_ids = set()
|
| 68 |
for doc in results:
|
|
|
|
| 69 |
sig = doc.page_content[:50]
|
| 70 |
if sig not in seen_ids:
|
| 71 |
unique_results.append(doc)
|
|
|
|
| 88 |
status_log.append("❌ Pickle file missing. Cannot search.")
|
| 89 |
return [], status_log
|
| 90 |
|
| 91 |
+
# --- RAG CHAIN (SMART RETRIEVER) ---
|
| 92 |
def get_rag_chain():
|
| 93 |
|
| 94 |
class SmartRetriever(BaseRetriever):
|
|
|
|
| 104 |
with open(CHUNKS_FILE, "rb") as f:
|
| 105 |
chunks = pickle.load(f)
|
| 106 |
|
| 107 |
+
query_clean = query.lower().strip()
|
| 108 |
+
|
| 109 |
+
# --- PRIORITY 1: SMART FILENAME MATCH ---
|
| 110 |
+
# Replaces underscores so "First Seal" matches "First_Seal"
|
| 111 |
+
title_matches = []
|
| 112 |
+
for doc in chunks:
|
| 113 |
+
fname_clean = doc.metadata.get('source', '').lower().replace('_', ' ').replace('-', ' ')
|
| 114 |
+
if query_clean in fname_clean:
|
| 115 |
+
title_matches.append(doc)
|
| 116 |
|
| 117 |
if title_matches:
|
| 118 |
+
print(f"📼 Sermon Title Match! Added {len(title_matches)} chunks.")
|
| 119 |
+
# If the user asked for a specific tape, GIVE THEM THE TAPE.
|
| 120 |
+
# We add up to 60 chunks from that specific tape to ensure the AI reads the whole thing.
|
| 121 |
+
for doc in title_matches[:60]:
|
|
|
|
| 122 |
if doc.page_content not in seen_content:
|
| 123 |
final_docs.append(doc)
|
| 124 |
seen_content.add(doc.page_content)
|
| 125 |
|
| 126 |
+
# --- PRIORITY 2: BM25 SEARCH ---
|
|
|
|
| 127 |
keyword_retriever = BM25Retriever.from_documents(chunks)
|
| 128 |
keyword_retriever.k = 40
|
| 129 |
local_matches = keyword_retriever.invoke(query)
|
|
|
|
| 136 |
except Exception as e:
|
| 137 |
print(f"⚠️ Local Search Warning: {e}")
|
| 138 |
|
| 139 |
+
# --- PRIORITY 3: CLOUD ---
|
|
|
|
| 140 |
try:
|
| 141 |
embeddings = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004")
|
| 142 |
vector_store = PineconeVectorStore(index_name=INDEX_NAME, embedding=embeddings)
|
|
|
|
| 167 |
|
| 168 |
INSTRUCTIONS:
|
| 169 |
1. **Read the Context:** I have provided quotes from the sermons.
|
| 170 |
+
2. **Sermon Focus:** If the quotes come from a SPECIFIC sermon the user asked about (e.g., "The First Seal"), summarize the MAIN TEACHING of that sermon (the symbols, the revelation), not just the intro.
|
| 171 |
3. **Synthesis:** Combine the information into a smooth, easy-to-read explanation.
|
| 172 |
4. **NO CITATIONS:** Do NOT use parenthetical citations like (54, 12).
|
| 173 |
5. **Accuracy:** Stick strictly to what the quotes say.
|