Spaces:
Sleeping
Sleeping
File size: 5,560 Bytes
71b568d 0e8ff58 538f28d 71b568d 538f28d 71b568d 538f28d 71b568d 538f28d 71b568d 0e8ff58 538f28d 71b568d 538f28d 71b568d 538f28d 0e8ff58 538f28d 71b568d 538f28d 71b568d 538f28d 71b568d 538f28d 71b568d 538f28d 71b568d 538f28d 71b568d 538f28d 71b568d 538f28d 71b568d 538f28d 3918812 71b568d 3918812 71b568d 3918812 71b568d 0e8ff58 593385d | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 | import os
import pickle
import sys
import zipfile
import shutil
from dotenv import load_dotenv
# --- 1. CLOUD DEPLOYMENT FIX (SQLITE) ---
# When the optional `pysqlite3` package is installed, register it under the
# name `sqlite3` so that every later `import sqlite3` resolves to it.
# When it is not installed, the standard-library `sqlite3` is left alone.
try:
    __import__("pysqlite3")
except ImportError:
    # Best-effort shim: nothing to swap in, keep the stock module.
    pass
else:
    sys.modules["sqlite3"] = sys.modules.pop("pysqlite3")
# --- 2. ROBUST UNZIPPER (Runs inside get_rag_chain) ---
# Absolute path of the directory that contains this source file; every data
# file below is resolved relative to it.
BASE_DIR = os.path.split(os.path.abspath(__file__))[0]

# Vector database: extracted folder name and the archive it is shipped in.
DB_FOLDER_NAME = "branham_db"
DB_ZIP_NAME = f"{DB_FOLDER_NAME}.zip"

# Pickled sermon chunks: extracted file name and the archive it is shipped in.
CHUNKS_FILE_NAME = "sermon_chunks.pkl"
CHUNKS_ZIP_NAME = "sermon_chunks.zip"
def _unzip_if_missing(target_name, archive_name, done_message):
    """Extract ``archive_name`` into ``BASE_DIR`` when ``target_name`` is absent.

    Args:
        target_name: File or folder name (relative to ``BASE_DIR``) that the
            archive is expected to produce.
        archive_name: Zip file name (relative to ``BASE_DIR``) to extract.
        done_message: Text printed after a successful extraction.

    Returns:
        ``True`` when the target already exists or the archive was found and
        extracted; ``False`` when neither the target nor the archive exists.
    """
    if os.path.exists(os.path.join(BASE_DIR, target_name)):
        return True

    archive_path = os.path.join(BASE_DIR, archive_name)
    if not os.path.exists(archive_path):
        return False

    # NOTE(review): the leading "π" looks like a mis-decoded emoji whose
    # original cannot be recovered from this text, so it is kept unchanged.
    print(f"π Found {archive_name}. Unzipping...")
    with zipfile.ZipFile(archive_path, 'r') as zip_ref:
        zip_ref.extractall(BASE_DIR)
    print(done_message)
    return True


def setup_files():
    """Ensures database and chunk files are ready.

    For the vector database folder and for the pickled chunks file, checks
    whether the item already exists in ``BASE_DIR``; if not, extracts the
    matching zip archive from ``BASE_DIR``. Missing archives only produce a
    printed warning; this function does not raise for them (the caller
    performs its own existence checks afterwards).
    """
    print(f"π Setup: Checking files in {BASE_DIR}")

    # A. Handle Database
    if not _unzip_if_missing(DB_FOLDER_NAME, DB_ZIP_NAME, "✅ Database unzipped."):
        print(f"⚠️ WARNING: Neither '{DB_FOLDER_NAME}' folder nor '{DB_ZIP_NAME}' found.")
        # Fallback check: Did you verify the zip name on Hugging Face?
        print(f"Files available: {os.listdir(BASE_DIR)}")

    # B. Handle Chunks
    if not _unzip_if_missing(CHUNKS_FILE_NAME, CHUNKS_ZIP_NAME, "✅ Chunks unzipped."):
        print(f"⚠️ WARNING: '{CHUNKS_ZIP_NAME}' not found.")
# --- 3. STANDARD IMPORTS ---
from langchain_core.documents import Document
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from langchain_google_genai import HarmBlockThreshold, HarmCategory
# LangChain Import Fix (Handles Version 0.2 vs 0.3)
try:
from langchain.retrievers import EnsembleRetriever
except ImportError:
from langchain_community.retrievers import EnsembleRetriever
from langchain_community.retrievers import BM25Retriever
from langchain_chroma import Chroma
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
# Load environment variables via python-dotenv at import time, before
# get_rag_chain() reads GOOGLE_API_KEY with os.getenv().
load_dotenv()
def get_rag_chain():
    """Initializes the RAG system and returns the question-answering chain.

    Runs ``setup_files()`` first, then builds a Chroma vector retriever and a
    BM25 keyword retriever, combines them in an ``EnsembleRetriever``
    (weights 0.6 / 0.4), and wires the result to a Gemini chat model with a
    persona prompt.

    Returns:
        The chain produced by ``RetrievalQA.from_chain_type`` with
        ``return_source_documents=True``.

    Raises:
        ValueError: If the ``GOOGLE_API_KEY`` environment variable is unset
            or empty.
        FileNotFoundError: If the database folder or the chunks file is still
            missing after ``setup_files()`` has run.
        RuntimeError: If the chunks file cannot be unpickled or the BM25
            retriever cannot be built from it; the original error is attached
            as ``__cause__``.
    """
    # 1. Run Setup (Unzip files if needed)
    setup_files()

    api_key = os.getenv("GOOGLE_API_KEY")
    if not api_key:
        raise ValueError("GOOGLE_API_KEY missing. Please set it in Settings > Secrets.")

    # 2. Load Vector DB
    embeddings = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004")
    db_full_path = os.path.join(BASE_DIR, DB_FOLDER_NAME)
    if not os.path.exists(db_full_path):
        # Detailed error for debugging
        raise FileNotFoundError(f"Database folder '{DB_FOLDER_NAME}' not found. Zip extraction might have failed or created a nested folder. Files in root: {os.listdir(BASE_DIR)}")

    vector_db = Chroma(
        persist_directory=db_full_path,
        embedding_function=embeddings,
        collection_name="branham_sermons"
    )
    vector_retriever = vector_db.as_retriever(search_kwargs={"k": 4})

    # 3. Load Keyword Retriever
    chunks_full_path = os.path.join(BASE_DIR, CHUNKS_FILE_NAME)
    if not os.path.exists(chunks_full_path):
        raise FileNotFoundError(f"File not found: {CHUNKS_FILE_NAME}. Did '{CHUNKS_ZIP_NAME}' unzip correctly?")

    try:
        with open(chunks_full_path, "rb") as f:
            # SECURITY: unpickling can execute arbitrary code embedded in the
            # file. Only deploy a chunks file/archive that you built yourself.
            chunks = pickle.load(f)
        keyword_retriever = BM25Retriever.from_documents(chunks)
        keyword_retriever.k = 4
    except Exception as e:
        # Chain explicitly so the root cause stays attached to the wrapper.
        raise RuntimeError(f"Failed to load {CHUNKS_FILE_NAME}. Error: {e}") from e

    # 4. Hybrid Search
    ensemble_retriever = EnsembleRetriever(
        retrievers=[vector_retriever, keyword_retriever],
        weights=[0.6, 0.4]
    )

    # 5. Gemini Model
    llm = ChatGoogleGenerativeAI(
        model="gemini-1.5-flash",
        temperature=0.3,
        google_api_key=api_key,
        safety_settings={
            HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_ONLY_HIGH,
            HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_ONLY_HIGH,
            HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_ONLY_HIGH,
            HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_ONLY_HIGH,
        }
    )

    # 6. The Persona Prompt
    # The template text is sent to the model verbatim, so its lines stay at
    # column 0 inside the triple-quoted literal.
    template = """You are William Marion Branham. You are answering a question based ONLY on the sermon excerpts provided below.
INSTRUCTIONS:
- Speak in the first person ("I said," "The Lord showed me").
- Use a humble, 1950s Southern preaching dialect.
- If the answer is not in the text, say: "Brother, I don't recall preaching specifically on that detail in these messages."
- Always refer to the Bible as the absolute authority.
CONTEXT MESSAGES:
{context}
USER QUESTION: {question}
BROTHER BRANHAM'S REPLY:"""
    PROMPT = PromptTemplate(template=template, input_variables=["context", "question"])

    chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=ensemble_retriever,
        return_source_documents=True,
        chain_type_kwargs={"prompt": PROMPT}
    )
    return chain