import os
import pickle
import sys
import zipfile
import shutil
from dotenv import load_dotenv

# --- 1. CLOUD DEPLOYMENT FIX (SQLITE) ---
# Chroma needs a newer sqlite3 than many cloud images ship. When the
# pysqlite3 wheel is installed, alias it in as the stdlib module before
# anything imports sqlite3; otherwise fall back to the system one.
try:
    __import__('pysqlite3')
    sys.modules['sqlite3'] = sys.modules.pop('pysqlite3')
except ImportError:
    pass

# --- 2. ROBUST UNZIPPER (Runs inside get_rag_chain) ---
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
DB_FOLDER_NAME = "branham_db"        # persisted Chroma collection directory
DB_ZIP_NAME = "branham_db.zip"       # shipped archive of the DB folder
CHUNKS_FILE_NAME = "sermon_chunks.pkl"  # pickled documents for BM25
CHUNKS_ZIP_NAME = "sermon_chunks.zip"   # shipped archive of the pickle


def _extract_if_missing(target_path, zip_path, done_message):
    """Extract *zip_path* beside itself when *target_path* does not exist.

    Returns True when the target is already present or the archive was
    extracted; False when both target and archive are absent (caller
    decides how to warn).
    """
    if os.path.exists(target_path):
        return True
    if os.path.exists(zip_path):
        print(f"🚀 Found {os.path.basename(zip_path)}. Unzipping...")
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            # Extract into the archive's own directory so relative
            # entries land next to the zip (i.e. in BASE_DIR).
            zip_ref.extractall(os.path.dirname(zip_path) or ".")
        print(done_message)
        return True
    return False


def setup_files():
    """Ensures database and chunk files are ready.

    Checks for the Chroma DB folder and the pickled chunks file under
    BASE_DIR, extracting their zip archives when only the archives
    shipped. Prints warnings (does not raise) when neither is found.
    """
    print(f"📂 Setup: Checking files in {BASE_DIR}")

    # A. Handle Database
    if not _extract_if_missing(
        os.path.join(BASE_DIR, DB_FOLDER_NAME),
        os.path.join(BASE_DIR, DB_ZIP_NAME),
        "✅ Database unzipped.",
    ):
        print(f"⚠️ WARNING: Neither '{DB_FOLDER_NAME}' folder nor '{DB_ZIP_NAME}' found.")
        # Fallback check: Did you verify the zip name on Hugging Face?
        print(f"Files available: {os.listdir(BASE_DIR)}")

    # B. Handle Chunks
    if not _extract_if_missing(
        os.path.join(BASE_DIR, CHUNKS_FILE_NAME),
        os.path.join(BASE_DIR, CHUNKS_ZIP_NAME),
        "✅ Chunks unzipped.",
    ):
        print(f"⚠️ WARNING: '{CHUNKS_ZIP_NAME}' not found.")

# --- 3. STANDARD IMPORTS ---
from langchain_core.documents import Document
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from langchain_google_genai import HarmBlockThreshold, HarmCategory

# LangChain Import Fix (Handles Version 0.2 vs 0.3)
try:
    from langchain.retrievers import EnsembleRetriever
except ImportError:
    from langchain_community.retrievers import EnsembleRetriever

from langchain_community.retrievers import BM25Retriever
from langchain_chroma import Chroma
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA

load_dotenv()


def _load_vector_retriever(embeddings):
    """Open the persisted Chroma collection and wrap it as a top-4 retriever.

    Raises FileNotFoundError (with a directory listing for debugging) when
    the DB folder is absent — usually a failed or nested zip extraction.
    """
    db_full_path = os.path.join(BASE_DIR, DB_FOLDER_NAME)
    if not os.path.exists(db_full_path):
        # Detailed error for debugging
        raise FileNotFoundError(f"Database folder '{DB_FOLDER_NAME}' not found. Zip extraction might have failed or created a nested folder. Files in root: {os.listdir(BASE_DIR)}")
    vector_db = Chroma(
        persist_directory=db_full_path,
        embedding_function=embeddings,
        collection_name="branham_sermons",
    )
    return vector_db.as_retriever(search_kwargs={"k": 4})


def _load_keyword_retriever():
    """Build a top-4 BM25 keyword retriever from the pickled sermon chunks.

    Raises FileNotFoundError when the pickle is missing, RuntimeError when
    it exists but cannot be loaded/indexed.
    """
    chunks_full_path = os.path.join(BASE_DIR, CHUNKS_FILE_NAME)
    if not os.path.exists(chunks_full_path):
        raise FileNotFoundError(f"File not found: {CHUNKS_FILE_NAME}. Did '{CHUNKS_ZIP_NAME}' unzip correctly?")
    try:
        # SECURITY NOTE: pickle.load is only acceptable because this file
        # ships with the app; never load pickles from untrusted sources.
        with open(chunks_full_path, "rb") as f:
            chunks = pickle.load(f)
        keyword_retriever = BM25Retriever.from_documents(chunks)
        keyword_retriever.k = 4
        return keyword_retriever
    except Exception as e:
        # Chain the original cause so the real failure isn't lost.
        raise RuntimeError(f"Failed to load {CHUNKS_FILE_NAME}. Error: {e}") from e


def get_rag_chain():
    """Initializes the RAG system.

    Builds a RetrievalQA chain that answers in William Branham's voice,
    grounded only in retrieved sermon excerpts, using a hybrid retriever
    (dense vectors + BM25) over Gemini 1.5 Flash.

    Returns:
        A RetrievalQA chain; invoking it returns the answer plus source
        documents (return_source_documents=True).

    Raises:
        ValueError: when GOOGLE_API_KEY is not configured.
        FileNotFoundError: when the DB folder or chunks pickle is missing.
        RuntimeError: when the chunks pickle cannot be loaded.
    """
    # 1. Run Setup (Unzip files if needed)
    setup_files()

    api_key = os.getenv("GOOGLE_API_KEY")
    if not api_key:
        raise ValueError("GOOGLE_API_KEY missing. Please set it in Settings > Secrets.")

    # 2-4. Hybrid search: semantic (vector) hits weighted 60%, exact
    # keyword (BM25) hits weighted 40%.
    embeddings = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004")
    ensemble_retriever = EnsembleRetriever(
        retrievers=[_load_vector_retriever(embeddings), _load_keyword_retriever()],
        weights=[0.6, 0.4],
    )

    # 5. Gemini Model — low temperature keeps replies close to the sources;
    # safety thresholds are relaxed to BLOCK_ONLY_HIGH, presumably because
    # sermon text can trip the default filters (confirm if tuning).
    llm = ChatGoogleGenerativeAI(
        model="gemini-1.5-flash",
        temperature=0.3,
        google_api_key=api_key,
        safety_settings={
            HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_ONLY_HIGH,
            HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_ONLY_HIGH,
            HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_ONLY_HIGH,
            HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_ONLY_HIGH,
        },
    )

    # 6. The Persona Prompt
    template = """You are William Marion Branham. You are answering a question based ONLY on the sermon excerpts provided below. INSTRUCTIONS: - Speak in the first person ("I said," "The Lord showed me"). - Use a humble, 1950s Southern preaching dialect. - If the answer is not in the text, say: "Brother, I don't recall preaching specifically on that detail in these messages." - Always refer to the Bible as the absolute authority. CONTEXT MESSAGES: {context} USER QUESTION: {question} BROTHER BRANHAM'S REPLY:"""
    PROMPT = PromptTemplate(template=template, input_variables=["context", "question"])

    chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=ensemble_retriever,
        return_source_documents=True,
        chain_type_kwargs={"prompt": PROMPT},
    )
    return chain