Spaces:

Adoption
/

the_seventh_handle

Sleeping

App Files Community

Adoption committed on Nov 27, 2025

Commit

538f28d

verified ·

1 Parent(s): d8fce4e

Update src/app.py

Browse files

Files changed (1) hide show

src/app.py +68 -40

src/app.py CHANGED Viewed

@@ -2,9 +2,10 @@ import os
 import pickle
 import sys
 import zipfile
 from dotenv import load_dotenv
-# --- CLOUD FIX ---
 try:
     __import__('pysqlite3')
     import sys
@@ -12,84 +13,111 @@ try:
 except ImportError:
     pass
-# --- STANDARD IMPORTS ---
 from langchain_core.documents import Document
 from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
 from langchain_google_genai import HarmBlockThreshold, HarmCategory
 from langchain_community.retrievers import BM25Retriever
-from langchain.retrievers import EnsembleRetriever
 from langchain_chroma import Chroma
 from langchain.prompts import PromptTemplate
 from langchain.chains import RetrievalQA
 load_dotenv()
-# --- PATH SETUP ---
-BASE_DIR = os.path.dirname(os.path.abspath(__file__))
-DB_PATH = os.path.join(BASE_DIR, "branham_db")
-CHUNKS_PATH = os.path.join(BASE_DIR, "sermon_chunks.pkl")
-def check_and_unzip():
-    """Unzips files only if they are missing."""
-    # 1. Unzip Database
-    if os.path.exists("branham_db.zip") and not os.path.exists("branham_db"):
-        print("📂 Unzipping Database (branham_db.zip)...")
-        with zipfile.ZipFile("branham_db.zip", 'r') as zip_ref:
-            zip_ref.extractall(".")
-        print("✅ Database unzipped.")
-    # 2. Unzip Chunks
-    if os.path.exists("sermon_chunks.zip") and not os.path.exists("sermon_chunks.pkl"):
-        print("📂 Unzipping Chunks (sermon_chunks.zip)...")
-        with zipfile.ZipFile("sermon_chunks.zip", 'r') as zip_ref:
-            zip_ref.extractall(".")
-        print("✅ Chunks unzipped.")
 def get_rag_chain():
     """Initializes the RAG system."""
-    # --- CRITICAL: RUN UNZIP HERE, NOT AT TOP LEVEL ---
-    check_and_unzip()
-    # --------------------------------------------------
     api_key = os.getenv("GOOGLE_API_KEY")
     if not api_key:
         raise ValueError("GOOGLE_API_KEY missing. Please set it in Settings > Secrets.")
-    # 1. Load Vector DB
     embeddings = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004")
-    if not os.path.exists(DB_PATH):
-         raise FileNotFoundError(f"Database folder 'branham_db' not found. Unzip failed.")
     vector_db = Chroma(
-        persist_directory=DB_PATH,
         embedding_function=embeddings,
         collection_name="branham_sermons"
     )
     vector_retriever = vector_db.as_retriever(search_kwargs={"k": 4})
-    # 2. Load Keyword Retriever
-    if not os.path.exists(CHUNKS_PATH):
-        raise FileNotFoundError(f"File not found: {CHUNKS_PATH}")
     try:
-        with open(CHUNKS_PATH, "rb") as f:
             chunks = pickle.load(f)
         keyword_retriever = BM25Retriever.from_documents(chunks)
         keyword_retriever.k = 4
     except Exception as e:
-        raise RuntimeError(f"Failed to load sermon_chunks.pkl. Error: {e}")
-    # 3. Hybrid Search
     ensemble_retriever = EnsembleRetriever(
         retrievers=[vector_retriever, keyword_retriever],
         weights=[0.6, 0.4]
     )
-    # 4. Gemini Model
     llm = ChatGoogleGenerativeAI(
-        model="gemini-1.5-flash", # Using stable flash for speed
         temperature=0.3,
         google_api_key=api_key,
         safety_settings={
@@ -100,7 +128,7 @@ def get_rag_chain():
         }
     )
-    # 5. The Persona Prompt
     template = """You are William Marion Branham. You are answering a question based ONLY on the sermon excerpts provided below.
 INSTRUCTIONS:

 import pickle
 import sys
 import zipfile
+import shutil
 from dotenv import load_dotenv
+# --- 1. CLOUD DEPLOYMENT FIX (SQLITE) ---
 try:
     __import__('pysqlite3')
     import sys
 except ImportError:
     pass
+# --- 2. ROBUST UNZIPPER (Runs inside get_rag_chain) ---
+BASE_DIR = os.path.dirname(os.path.abspath(__file__))
+DB_FOLDER_NAME = "branham_db"
+DB_ZIP_NAME = "branham_db.zip"
+CHUNKS_FILE_NAME = "sermon_chunks.pkl"
+CHUNKS_ZIP_NAME = "sermon_chunks.zip"
+def setup_files():
+    """Ensures database and chunk files are ready."""
+    print(f"📂 Setup: Checking files in {BASE_DIR}")
+    # A. Handle Database
+    db_path = os.path.join(BASE_DIR, DB_FOLDER_NAME)
+    zip_path = os.path.join(BASE_DIR, DB_ZIP_NAME)
+    if not os.path.exists(db_path):
+        if os.path.exists(zip_path):
+            print(f"🚀 Found {DB_ZIP_NAME}. Unzipping...")
+            with zipfile.ZipFile(zip_path, 'r') as zip_ref:
+                zip_ref.extractall(BASE_DIR)
+            print("✅ Database unzipped.")
+        else:
+            print(f"⚠️ WARNING: Neither '{DB_FOLDER_NAME}' folder nor '{DB_ZIP_NAME}' found.")
+            # Fallback check: Did you verify the zip name on Hugging Face?
+            print(f"Files available: {os.listdir(BASE_DIR)}")
+    # B. Handle Chunks
+    chunks_path = os.path.join(BASE_DIR, CHUNKS_FILE_NAME)
+    chunks_zip_path = os.path.join(BASE_DIR, CHUNKS_ZIP_NAME)
+    if not os.path.exists(chunks_path):
+        if os.path.exists(chunks_zip_path):
+            print(f"🚀 Found {CHUNKS_ZIP_NAME}. Unzipping...")
+            with zipfile.ZipFile(chunks_zip_path, 'r') as zip_ref:
+                zip_ref.extractall(BASE_DIR)
+            print("✅ Chunks unzipped.")
+        else:
+            print(f"⚠️ WARNING: '{CHUNKS_ZIP_NAME}' not found.")
+# --- 3. STANDARD IMPORTS ---
 from langchain_core.documents import Document
 from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
 from langchain_google_genai import HarmBlockThreshold, HarmCategory
+# LangChain Import Fix (Handles Version 0.2 vs 0.3)
+try:
+    from langchain.retrievers import EnsembleRetriever
+except ImportError:
+    from langchain_community.retrievers import EnsembleRetriever
 from langchain_community.retrievers import BM25Retriever
 from langchain_chroma import Chroma
 from langchain.prompts import PromptTemplate
 from langchain.chains import RetrievalQA
 load_dotenv()
 def get_rag_chain():
     """Initializes the RAG system."""
+    # 1. Run Setup (Unzip files if needed)
+    setup_files()
     api_key = os.getenv("GOOGLE_API_KEY")
     if not api_key:
         raise ValueError("GOOGLE_API_KEY missing. Please set it in Settings > Secrets.")
+    # 2. Load Vector DB
     embeddings = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004")
+    db_full_path = os.path.join(BASE_DIR, DB_FOLDER_NAME)
+    if not os.path.exists(db_full_path):
+         # Detailed error for debugging
+         raise FileNotFoundError(f"Database folder '{DB_FOLDER_NAME}' not found. Zip extraction might have failed or created a nested folder. Files in root: {os.listdir(BASE_DIR)}")
     vector_db = Chroma(
+        persist_directory=db_full_path,
         embedding_function=embeddings,
         collection_name="branham_sermons"
     )
     vector_retriever = vector_db.as_retriever(search_kwargs={"k": 4})
+    # 3. Load Keyword Retriever
+    chunks_full_path = os.path.join(BASE_DIR, CHUNKS_FILE_NAME)
+    if not os.path.exists(chunks_full_path):
+        raise FileNotFoundError(f"File not found: {CHUNKS_FILE_NAME}. Did '{CHUNKS_ZIP_NAME}' unzip correctly?")
     try:
+        with open(chunks_full_path, "rb") as f:
             chunks = pickle.load(f)
         keyword_retriever = BM25Retriever.from_documents(chunks)
         keyword_retriever.k = 4
     except Exception as e:
+        raise RuntimeError(f"Failed to load {CHUNKS_FILE_NAME}. Error: {e}")
+    # 4. Hybrid Search
     ensemble_retriever = EnsembleRetriever(
         retrievers=[vector_retriever, keyword_retriever],
         weights=[0.6, 0.4]
     )
+    # 5. Gemini Model
     llm = ChatGoogleGenerativeAI(
+        model="gemini-1.5-flash",
         temperature=0.3,
         google_api_key=api_key,
         safety_settings={
         }
     )
+    # 6. The Persona Prompt
     template = """You are William Marion Branham. You are answering a question based ONLY on the sermon excerpts provided below.
 INSTRUCTIONS: