# Viewer metadata recovered from the page this file was copied from (not part of the module):
#   file size: 5,560 bytes
#   short hashes shown beside the code (apparently per-line revision ids):
#   71b568d, 0e8ff58, 538f28d, 3918812, 593385d
import os
import pickle
import sys
import zipfile
import shutil
from dotenv import load_dotenv

# --- 1. CLOUD DEPLOYMENT FIX (SQLITE) ---
# Best-effort swap: when the optional `pysqlite3` package can be imported,
# register it under the name 'sqlite3' so that any later `import sqlite3`
# resolves to it. When it is not installed, the stdlib module is left alone.
# (`sys` is already imported at the top of this file.)
try:
    __import__('pysqlite3')
except ImportError:
    # Optional dependency is absent; keep the standard-library sqlite3.
    pass
else:
    sys.modules['sqlite3'] = sys.modules.pop('pysqlite3')

# --- 2. ROBUST UNZIPPER (Runs inside get_rag_chain) ---
# Absolute directory that contains this module; all data files are looked up
# relative to it.
BASE_DIR = os.path.abspath(os.path.dirname(__file__))

# Vector-database folder and the archive it is unpacked from.
DB_FOLDER_NAME = "branham_db"
DB_ZIP_NAME = f"{DB_FOLDER_NAME}.zip"

# Pickled sermon chunks and the archive they are unpacked from.
CHUNKS_FILE_NAME = "sermon_chunks.pkl"
CHUNKS_ZIP_NAME = "sermon_chunks.zip"

def _extract_if_missing(target_name, zip_name, label, missing_warning):
    """Unzip ``zip_name`` into BASE_DIR when ``target_name`` is not there yet.

    Args:
        target_name: File or folder name expected directly under BASE_DIR.
        zip_name: Archive name (under BASE_DIR) that should contain the target.
        label: Human-readable name used in the success message.
        missing_warning: Message printed when neither target nor archive exists.

    Returns:
        True when ``target_name`` exists under BASE_DIR on return, else False.

    Raises:
        zipfile.BadZipFile: Propagated unchanged if the archive is corrupt, so
            a partially extracted target is never silently accepted.
    """
    target_path = os.path.join(BASE_DIR, target_name)
    if os.path.exists(target_path):
        # Already extracted (or shipped unpacked); nothing to do.
        return True

    zip_path = os.path.join(BASE_DIR, zip_name)
    if not os.path.exists(zip_path):
        print(missing_warning)
        return False

    print(f"🚀 Found {zip_name}. Unzipping...")
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(BASE_DIR)

    # The archive may unpack to a different name or a nested folder; only
    # report success when the expected path really appeared.
    if not os.path.exists(target_path):
        print(
            f"⚠️ WARNING: {zip_name} was extracted but '{target_name}' is still "
            f"missing (nested folder inside the zip?)."
        )
        return False

    print(f"✅ {label} unzipped.")
    return True


def setup_files():
    """Ensures database and chunk files are ready.

    For the vector-database folder and for the pickled chunks file, checks
    whether the target exists under BASE_DIR and, if not, extracts the
    matching zip archive into BASE_DIR. Missing inputs are reported with
    printed warnings rather than exceptions; the caller is expected to
    verify the paths afterwards.

    Returns:
        None.

    Raises:
        zipfile.BadZipFile: If an archive that is present cannot be read.
    """
    print(f"📂 Setup: Checking files in {BASE_DIR}")

    # A. Handle Database
    db_ready = _extract_if_missing(
        DB_FOLDER_NAME,
        DB_ZIP_NAME,
        "Database",
        f"⚠️ WARNING: Neither '{DB_FOLDER_NAME}' folder nor '{DB_ZIP_NAME}' found.",
    )
    if not db_ready:
        # Fallback check: Did you verify the zip name on Hugging Face?
        print(f"Files available: {os.listdir(BASE_DIR)}")

    # B. Handle Chunks
    _extract_if_missing(
        CHUNKS_FILE_NAME,
        CHUNKS_ZIP_NAME,
        "Chunks",
        f"⚠️ WARNING: '{CHUNKS_ZIP_NAME}' not found.",
    )

# --- 3. STANDARD IMPORTS ---
from langchain_core.documents import Document
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from langchain_google_genai import HarmBlockThreshold, HarmCategory

# LangChain Import Fix (Handles Version 0.2 vs 0.3)
try:
    from langchain.retrievers import EnsembleRetriever
except ImportError:
    from langchain_community.retrievers import EnsembleRetriever

from langchain_community.retrievers import BM25Retriever
from langchain_chroma import Chroma
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA

load_dotenv()

def _load_vector_retriever(api_key):
    """Open the persisted Chroma collection and expose it as a k=4 retriever."""
    # Pass the key explicitly so the embeddings client and the chat model are
    # configured the same way instead of one relying on the environment.
    embeddings = GoogleGenerativeAIEmbeddings(
        model="models/text-embedding-004",
        google_api_key=api_key,
    )
    db_full_path = os.path.join(BASE_DIR, DB_FOLDER_NAME)

    if not os.path.exists(db_full_path):
        # Detailed error for debugging
        raise FileNotFoundError(f"Database folder '{DB_FOLDER_NAME}' not found. Zip extraction might have failed or created a nested folder. Files in root: {os.listdir(BASE_DIR)}")

    vector_db = Chroma(
        persist_directory=db_full_path,
        embedding_function=embeddings,
        collection_name="branham_sermons",
    )
    return vector_db.as_retriever(search_kwargs={"k": 4})


def _load_keyword_retriever():
    """Build a k=4 BM25 retriever from the pickled sermon chunks."""
    chunks_full_path = os.path.join(BASE_DIR, CHUNKS_FILE_NAME)

    if not os.path.exists(chunks_full_path):
        raise FileNotFoundError(f"File not found: {CHUNKS_FILE_NAME}. Did '{CHUNKS_ZIP_NAME}' unzip correctly?")

    try:
        # SECURITY: pickle.load can execute arbitrary code from the file.
        # NOTE(review): this assumes the chunks file is an artefact shipped
        # with the app and never user-supplied -- confirm.
        with open(chunks_full_path, "rb") as f:
            chunks = pickle.load(f)
        keyword_retriever = BM25Retriever.from_documents(chunks)
        keyword_retriever.k = 4
    except Exception as e:
        # Re-raise with the file name for context, keeping the original
        # exception attached as the explicit cause.
        raise RuntimeError(f"Failed to load {CHUNKS_FILE_NAME}. Error: {e}") from e
    return keyword_retriever


def _build_llm(api_key):
    """Create the Gemini chat model with BLOCK_ONLY_HIGH safety thresholds."""
    return ChatGoogleGenerativeAI(
        model="gemini-1.5-flash",
        temperature=0.3,
        google_api_key=api_key,
        safety_settings={
            HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_ONLY_HIGH,
            HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_ONLY_HIGH,
            HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_ONLY_HIGH,
            HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_ONLY_HIGH,
        },
    )


def _build_prompt():
    """Return the persona prompt with ``context`` and ``question`` slots."""
    template = """You are William Marion Branham. You are answering a question based ONLY on the sermon excerpts provided below.

INSTRUCTIONS:
- Speak in the first person ("I said," "The Lord showed me").
- Use a humble, 1950s Southern preaching dialect.
- If the answer is not in the text, say: "Brother, I don't recall preaching specifically on that detail in these messages."
- Always refer to the Bible as the absolute authority.

CONTEXT MESSAGES:
{context}

USER QUESTION: {question}

BROTHER BRANHAM'S REPLY:"""

    return PromptTemplate(template=template, input_variables=["context", "question"])


def get_rag_chain():
    """Initializes the RAG system.

    Steps, in order:
      1. Run ``setup_files()`` so zipped data is extracted if needed.
      2. Read ``GOOGLE_API_KEY`` from the environment.
      3. Open the Chroma collection "branham_sermons" under
         ``BASE_DIR/DB_FOLDER_NAME`` as a vector retriever (k=4).
      4. Load the pickled chunks and build a BM25 keyword retriever (k=4).
      5. Combine both in an EnsembleRetriever weighted 0.6 (vector) / 0.4
         (keyword).
      6. Wire the retriever, the Gemini chat model and the persona prompt
         into a "stuff" RetrievalQA chain with
         ``return_source_documents=True``.

    Returns:
        The RetrievalQA chain produced by ``RetrievalQA.from_chain_type``.

    Raises:
        ValueError: If ``GOOGLE_API_KEY`` is unset or empty.
        FileNotFoundError: If the database folder or the chunks file is
            missing after setup.
        RuntimeError: If the chunks file cannot be unpickled or indexed.
    """
    # 1. Run Setup (Unzip files if needed)
    setup_files()

    api_key = os.getenv("GOOGLE_API_KEY")
    if not api_key:
        raise ValueError("GOOGLE_API_KEY missing. Please set it in Settings > Secrets.")

    # 2. Load Vector DB
    vector_retriever = _load_vector_retriever(api_key)

    # 3. Load Keyword Retriever
    keyword_retriever = _load_keyword_retriever()

    # 4. Hybrid Search
    ensemble_retriever = EnsembleRetriever(
        retrievers=[vector_retriever, keyword_retriever],
        weights=[0.6, 0.4],
    )

    # 5. Gemini Model
    llm = _build_llm(api_key)

    # 6. The Persona Prompt
    prompt = _build_prompt()

    return RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=ensemble_retriever,
        return_source_documents=True,
        chain_type_kwargs={"prompt": prompt},
    )