# NOTE: "Spaces: Sleeping" banner residue from a Hugging Face page capture;
# kept as a comment so this file remains valid Python.
| import os | |
| import pickle | |
| import sys | |
| import zipfile | |
| import shutil | |
| from dotenv import load_dotenv | |
# --- 1. CLOUD DEPLOYMENT FIX (SQLITE) ---
# Chroma requires a newer sqlite3 than some cloud base images ship. When the
# pysqlite3-binary wheel is installed, alias it under the stdlib name so every
# later `import sqlite3` (including Chroma's) resolves to it.
try:
    __import__('pysqlite3')
    sys.modules['sqlite3'] = sys.modules.pop('pysqlite3')
except ImportError:
    # pysqlite3 not installed (e.g. local dev): fall back to stdlib sqlite3.
    pass
# --- 2. ROBUST UNZIPPER (Runs inside get_rag_chain) ---
# All data paths are resolved relative to this file so the app works
# regardless of the process working directory (important on Spaces).
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
# On-disk Chroma vector store directory and its zipped distribution artifact.
DB_FOLDER_NAME = "branham_db"
DB_ZIP_NAME = "branham_db.zip"
# Pickled sermon-chunk list (BM25 source docs) and its zipped artifact.
CHUNKS_FILE_NAME = "sermon_chunks.pkl"
CHUNKS_ZIP_NAME = "sermon_chunks.zip"
def _extract_if_missing(target_path, zip_path):
    """Ensure *target_path* exists, extracting *zip_path* into BASE_DIR if needed.

    Returns True once the target exists (already present, or just extracted);
    False when neither the target nor its zip archive is present.
    """
    if os.path.exists(target_path):
        return True
    if not os.path.exists(zip_path):
        return False
    print(f"📦 Found {os.path.basename(zip_path)}. Unzipping...")
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(BASE_DIR)
    print(f"✅ Extracted {os.path.basename(zip_path)}.")
    return True


def setup_files():
    """Ensures database and chunk files are ready.

    Large artifacts are shipped to the Space as zip files; when the unpacked
    copies are missing they are extracted next to this file. Missing artifacts
    only warn here — get_rag_chain() raises hard errors later.
    """
    print(f"🔍 Setup: Checking files in {BASE_DIR}")

    # A. Handle Database (Chroma persist directory).
    if not _extract_if_missing(os.path.join(BASE_DIR, DB_FOLDER_NAME),
                               os.path.join(BASE_DIR, DB_ZIP_NAME)):
        print(f"⚠️ WARNING: Neither '{DB_FOLDER_NAME}' folder nor '{DB_ZIP_NAME}' found.")
        # Debugging aid: show what actually shipped with the app.
        print(f"Files available: {os.listdir(BASE_DIR)}")

    # B. Handle Chunks (BM25 source documents).
    if not _extract_if_missing(os.path.join(BASE_DIR, CHUNKS_FILE_NAME),
                               os.path.join(BASE_DIR, CHUNKS_ZIP_NAME)):
        print(f"⚠️ WARNING: '{CHUNKS_ZIP_NAME}' not found.")
# --- 3. STANDARD IMPORTS --- (LangChain / Google Generative AI stack)
from langchain_core.documents import Document
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from langchain_google_genai import HarmBlockThreshold, HarmCategory
# LangChain Import Fix (Handles Version 0.2 vs 0.3): EnsembleRetriever moved
# from langchain to langchain_community between releases.
try:
    from langchain.retrievers import EnsembleRetriever
except ImportError:
    from langchain_community.retrievers import EnsembleRetriever
from langchain_community.retrievers import BM25Retriever
from langchain_chroma import Chroma
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
# Load GOOGLE_API_KEY (and any other secrets) from a local .env, if present.
load_dotenv()
def get_rag_chain():
    """Initializes the RAG system.

    Builds a hybrid retriever (Chroma dense vectors + BM25 keywords) over the
    sermon corpus and wires it to a Gemini model behind a persona prompt.

    Returns:
        A RetrievalQA chain; invoke with {"query": ...} and read "result"
        and "source_documents".

    Raises:
        ValueError: GOOGLE_API_KEY is not configured.
        FileNotFoundError: the DB folder or the chunks pickle is missing.
        RuntimeError: the chunks pickle could not be loaded/parsed.
    """
    # 1. Run Setup (Unzip files if needed)
    setup_files()

    api_key = os.getenv("GOOGLE_API_KEY")
    if not api_key:
        raise ValueError("GOOGLE_API_KEY missing. Please set it in Settings > Secrets.")

    # 2. Load Vector DB
    embeddings = GoogleGenerativeAIEmbeddings(
        model="models/text-embedding-004",
        google_api_key=api_key,  # pass explicitly, same as the LLM below
    )
    db_full_path = os.path.join(BASE_DIR, DB_FOLDER_NAME)
    if not os.path.exists(db_full_path):
        # Detailed error for debugging failed/nested zip extractions.
        raise FileNotFoundError(
            f"Database folder '{DB_FOLDER_NAME}' not found. Zip extraction might "
            f"have failed or created a nested folder. Files in root: {os.listdir(BASE_DIR)}"
        )
    vector_db = Chroma(
        persist_directory=db_full_path,
        embedding_function=embeddings,
        collection_name="branham_sermons",
    )
    vector_retriever = vector_db.as_retriever(search_kwargs={"k": 4})

    # 3. Load Keyword Retriever (BM25 over the pickled chunk list)
    chunks_full_path = os.path.join(BASE_DIR, CHUNKS_FILE_NAME)
    if not os.path.exists(chunks_full_path):
        raise FileNotFoundError(
            f"File not found: {CHUNKS_FILE_NAME}. Did '{CHUNKS_ZIP_NAME}' unzip correctly?"
        )
    try:
        # NOTE(review): pickle is only acceptable because this file ships with
        # the app itself — never pickle.load untrusted data.
        with open(chunks_full_path, "rb") as f:
            chunks = pickle.load(f)
        keyword_retriever = BM25Retriever.from_documents(chunks)
        keyword_retriever.k = 4
    except Exception as e:
        # Chain the cause so the original traceback is preserved.
        raise RuntimeError(f"Failed to load {CHUNKS_FILE_NAME}. Error: {e}") from e

    # 4. Hybrid Search: blend dense (semantic) and sparse (keyword) hits.
    ensemble_retriever = EnsembleRetriever(
        retrievers=[vector_retriever, keyword_retriever],
        weights=[0.6, 0.4],
    )

    # 5. Gemini Model. BLOCK_ONLY_HIGH loosens the default safety thresholds
    # (presumably so 1950s preaching language is not over-blocked — confirm).
    llm = ChatGoogleGenerativeAI(
        model="gemini-1.5-flash",
        temperature=0.3,
        google_api_key=api_key,
        safety_settings={
            HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_ONLY_HIGH,
            HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_ONLY_HIGH,
            HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_ONLY_HIGH,
            HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_ONLY_HIGH,
        },
    )

    # 6. The Persona Prompt (runtime string — kept verbatim).
    template = """You are William Marion Branham. You are answering a question based ONLY on the sermon excerpts provided below.
INSTRUCTIONS:
- Speak in the first person ("I said," "The Lord showed me").
- Use a humble, 1950s Southern preaching dialect.
- If the answer is not in the text, say: "Brother, I don't recall preaching specifically on that detail in these messages."
- Always refer to the Bible as the absolute authority.
CONTEXT MESSAGES:
{context}
USER QUESTION: {question}
BROTHER BRANHAM'S REPLY:"""
    PROMPT = PromptTemplate(template=template, input_variables=["context", "question"])

    chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=ensemble_retriever,
        return_source_documents=True,
        chain_type_kwargs={"prompt": PROMPT},
    )
    return chain