# Adoption's picture
# Update src/app.py
# 538f28d verified
import os
import pickle
import sys
import zipfile
import shutil
from dotenv import load_dotenv
# --- 1. CLOUD DEPLOYMENT FIX (SQLITE) ---
# Some cloud hosts ship a system sqlite3 that is too old for Chroma
# (presumably why this shim exists — confirm against the deployment docs).
# If the pysqlite3 wheel is available, alias it in as the stdlib `sqlite3`
# module BEFORE anything imports sqlite3. `sys` is already imported above,
# so no inner `import sys` is needed.
try:
    __import__('pysqlite3')
    sys.modules['sqlite3'] = sys.modules.pop('pysqlite3')
except ImportError:
    # pysqlite3 not installed (e.g. local dev with a modern sqlite) —
    # silently fall back to the stdlib module.
    pass
# --- 2. ROBUST UNZIPPER (Runs inside get_rag_chain) ---
# All data files are resolved relative to this source file's directory,
# not the process CWD, so the app works regardless of where it is launched.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
DB_FOLDER_NAME = "branham_db"          # unpacked Chroma persist directory
DB_ZIP_NAME = "branham_db.zip"         # shipped archive of the Chroma DB
CHUNKS_FILE_NAME = "sermon_chunks.pkl"  # pickled Document chunks for BM25
CHUNKS_ZIP_NAME = "sermon_chunks.zip"   # shipped archive of the pickle
def _extract_archive(zip_path, zip_name, success_msg):
    """Unzip *zip_path* into BASE_DIR, printing progress.

    NOTE(review): extractall() trusts the archive's member paths (zip-slip);
    these archives ship with the app, so that is acceptable here — do not
    reuse this helper for user-supplied zips.
    """
    print(f"🚀 Found {zip_name}. Unzipping...")
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(BASE_DIR)
    print(success_msg)

def setup_files():
    """Ensures database and chunk files are ready.

    Extracts the Chroma database folder and the pickled sermon chunks
    from their zip archives into BASE_DIR when the unpacked versions
    are not already present. Missing archives only print a warning —
    the hard failure is raised later by get_rag_chain().
    """
    print(f"📂 Setup: Checking files in {BASE_DIR}")
    # A. Handle Database
    db_path = os.path.join(BASE_DIR, DB_FOLDER_NAME)
    zip_path = os.path.join(BASE_DIR, DB_ZIP_NAME)
    if not os.path.exists(db_path):
        if os.path.exists(zip_path):
            _extract_archive(zip_path, DB_ZIP_NAME, "✅ Database unzipped.")
        else:
            print(f"⚠️ WARNING: Neither '{DB_FOLDER_NAME}' folder nor '{DB_ZIP_NAME}' found.")
            # Help debugging on the deployment host: show what actually shipped.
            print(f"Files available: {os.listdir(BASE_DIR)}")
    # B. Handle Chunks
    chunks_path = os.path.join(BASE_DIR, CHUNKS_FILE_NAME)
    chunks_zip_path = os.path.join(BASE_DIR, CHUNKS_ZIP_NAME)
    if not os.path.exists(chunks_path):
        if os.path.exists(chunks_zip_path):
            _extract_archive(chunks_zip_path, CHUNKS_ZIP_NAME, "✅ Chunks unzipped.")
        else:
            print(f"⚠️ WARNING: '{CHUNKS_ZIP_NAME}' not found.")
# --- 3. STANDARD IMPORTS ---
from langchain_core.documents import Document
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from langchain_google_genai import HarmBlockThreshold, HarmCategory
# LangChain Import Fix (Handles Version 0.2 vs 0.3)
try:
from langchain.retrievers import EnsembleRetriever
except ImportError:
from langchain_community.retrievers import EnsembleRetriever
from langchain_community.retrievers import BM25Retriever
from langchain_chroma import Chroma
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
load_dotenv()  # pull GOOGLE_API_KEY (and any other secrets) from a local .env, if present
def get_rag_chain():
    """Initializes the RAG system.

    Builds and returns a RetrievalQA chain that answers questions in the
    persona defined by the prompt template, using a hybrid retriever
    (Chroma vector search + BM25 keyword search) over the sermon corpus.

    Returns:
        A RetrievalQA chain; invoking it yields the answer plus the
        source documents (return_source_documents=True).

    Raises:
        ValueError: GOOGLE_API_KEY is not set in the environment.
        FileNotFoundError: the unpacked DB folder or chunks pickle is missing.
        RuntimeError: the chunks pickle exists but cannot be loaded.
    """
    # 1. Run Setup (Unzip files if needed)
    setup_files()
    api_key = os.getenv("GOOGLE_API_KEY")
    if not api_key:
        raise ValueError("GOOGLE_API_KEY missing. Please set it in Settings > Secrets.")
    # 2. Load Vector DB
    embeddings = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004")
    db_full_path = os.path.join(BASE_DIR, DB_FOLDER_NAME)
    if not os.path.exists(db_full_path):
        # Detailed error for debugging
        raise FileNotFoundError(f"Database folder '{DB_FOLDER_NAME}' not found. Zip extraction might have failed or created a nested folder. Files in root: {os.listdir(BASE_DIR)}")
    vector_db = Chroma(
        persist_directory=db_full_path,
        embedding_function=embeddings,
        collection_name="branham_sermons"
    )
    vector_retriever = vector_db.as_retriever(search_kwargs={"k": 4})
    # 3. Load Keyword Retriever
    chunks_full_path = os.path.join(BASE_DIR, CHUNKS_FILE_NAME)
    if not os.path.exists(chunks_full_path):
        raise FileNotFoundError(f"File not found: {CHUNKS_FILE_NAME}. Did '{CHUNKS_ZIP_NAME}' unzip correctly?")
    try:
        # SECURITY: pickle.load executes arbitrary code from the file.
        # Acceptable only because this pickle ships with the app — never
        # point this at user-supplied data.
        with open(chunks_full_path, "rb") as f:
            chunks = pickle.load(f)
        keyword_retriever = BM25Retriever.from_documents(chunks)
        keyword_retriever.k = 4
    except Exception as e:
        # Chain the cause so the original traceback is preserved.
        raise RuntimeError(f"Failed to load {CHUNKS_FILE_NAME}. Error: {e}") from e
    # 4. Hybrid Search: blend semantic (0.6) and keyword (0.4) rankings.
    ensemble_retriever = EnsembleRetriever(
        retrievers=[vector_retriever, keyword_retriever],
        weights=[0.6, 0.4]
    )
    # 5. Gemini Model — relaxed safety thresholds so sermon language
    # (religious themes) is not over-blocked; only high-severity content
    # is still filtered.
    llm = ChatGoogleGenerativeAI(
        model="gemini-1.5-flash",
        temperature=0.3,
        google_api_key=api_key,
        safety_settings={
            HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_ONLY_HIGH,
            HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_ONLY_HIGH,
            HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_ONLY_HIGH,
            HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_ONLY_HIGH,
        }
    )
    # 6. The Persona Prompt
    template = """You are William Marion Branham. You are answering a question based ONLY on the sermon excerpts provided below.
INSTRUCTIONS:
- Speak in the first person ("I said," "The Lord showed me").
- Use a humble, 1950s Southern preaching dialect.
- If the answer is not in the text, say: "Brother, I don't recall preaching specifically on that detail in these messages."
- Always refer to the Bible as the absolute authority.
CONTEXT MESSAGES:
{context}
USER QUESTION: {question}
BROTHER BRANHAM'S REPLY:"""
    PROMPT = PromptTemplate(template=template, input_variables=["context", "question"])
    # "stuff" chain type: all retrieved chunks are concatenated into one prompt.
    chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=ensemble_retriever,
        return_source_documents=True,
        chain_type_kwargs={"prompt": PROMPT}
    )
    return chain