# Streamlit RAG app: sermon Q&A in Branham's persona using Pinecone vector
# search merged with local BM25 keyword search (hybrid retrieval).
| import os | |
| import pickle | |
| import zipfile | |
| import sys | |
| import streamlit as st | |
| from dotenv import load_dotenv | |
| # --- IMPORTS --- | |
| from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings | |
| from langchain_community.retrievers import BM25Retriever | |
| from langchain_pinecone import PineconeVectorStore | |
| from langchain_core.prompts import PromptTemplate | |
| from langchain.chains import RetrievalQA | |
| # Robust Import for Hybrid Search (Handles different LangChain versions) | |
| try: | |
| from langchain.retrievers import EnsembleRetriever | |
| except ImportError: | |
| from langchain_community.retrievers import EnsembleRetriever | |
# Load PINECONE_API_KEY / GOOGLE_API_KEY from a local .env file, if one exists.
load_dotenv()
# --- CONFIGURATION ---
INDEX_NAME = "branham-index"       # Pinecone index holding the sermon embeddings
CHUNKS_FILE = "sermon_chunks.pkl"  # pickled chunks used to build the BM25 keyword retriever
CHUNKS_ZIP = "sermon_chunks.zip"   # compressed copy committed to git (GitHub 100MB limit workaround)
def setup_keyword_file(chunks_file=None, chunks_zip=None):
    """Ensure the BM25 chunks pickle exists, extracting it from a zip if needed.

    GitHub has a 100MB file limit, so the repo ships the compressed archive
    instead of the raw pickle; this extracts the pickle when the app starts.

    Args:
        chunks_file: Path of the pickle to look for. Defaults to CHUNKS_FILE.
        chunks_zip: Path of the archive to extract from. Defaults to CHUNKS_ZIP.
    """
    if chunks_file is None:
        chunks_file = CHUNKS_FILE
    if chunks_zip is None:
        chunks_zip = CHUNKS_ZIP
    if os.path.exists(chunks_file):
        return  # Already extracted; nothing to do.
    if not os.path.exists(chunks_zip):
        print(f"⚠️ Warning: Neither {chunks_file} nor {chunks_zip} found.")
        return
    print(f"📦 Unzipping {chunks_zip}...")
    try:
        # Extract next to the target file ("." for the default bare filename,
        # matching the original behavior of extractall(".")).
        dest_dir = os.path.dirname(chunks_file) or "."
        with zipfile.ZipFile(chunks_zip, "r") as zip_ref:
            zip_ref.extractall(dest_dir)
        print("✅ Unzip complete.")
    except Exception as e:
        # Best-effort: on failure the caller degrades to Pinecone-only search.
        print(f"❌ Error unzipping file: {e}")
def _resolve_api_keys():
    """Return (pinecone_key, google_key), preferring Streamlit secrets over .env.

    Raises:
        ValueError: If either key is missing from both sources.
    """
    try:
        # NOTE(review): accessing st.secrets with no secrets.toml present
        # raises on local runs (StreamlitSecretNotFoundError, a
        # FileNotFoundError subclass in current Streamlit) — confirm against
        # the deployed Streamlit version. Guarded so the .env fallback below
        # actually gets a chance to run.
        pinecone_key = st.secrets.get("PINECONE_API_KEY")
        google_key = st.secrets.get("GOOGLE_API_KEY")
    except FileNotFoundError:
        pinecone_key = google_key = None
    pinecone_key = pinecone_key or os.getenv("PINECONE_API_KEY")
    google_key = google_key or os.getenv("GOOGLE_API_KEY")
    if not pinecone_key or not google_key:
        raise ValueError("❌ Missing API Keys. Please set PINECONE_API_KEY and GOOGLE_API_KEY in Secrets.")
    return pinecone_key, google_key


def _build_keyword_retriever():
    """Load the local BM25 keyword retriever, or return None if unavailable.

    BM25 finds exact matches (e.g. searching for 'E-53' finds exactly E-53),
    complementing Pinecone's semantic search.
    """
    try:
        if not os.path.exists(CHUNKS_FILE):
            print("⚠️ Keyword file missing. Running on Pinecone only.")
            return None
        with open(CHUNKS_FILE, "rb") as f:
            chunks = pickle.load(f)
        retriever = BM25Retriever.from_documents(chunks)
        retriever.k = 5
        return retriever
    except Exception as e:
        # Best-effort: a corrupt/unreadable pickle degrades to Pinecone-only.
        print(f"❌ Failed to load keyword file: {e}")
        return None


def get_rag_chain():
    """Initialize the RAG 'brain': hybrid Pinecone + BM25 retrieval into Gemini.

    1. Connects to Pinecone (cloud vector search)
    2. Loads BM25 keywords (local exact-match search)
    3. Merges them into a hybrid retriever feeding a persona-prompted chain

    Returns:
        A RetrievalQA chain that also returns its source documents.

    Raises:
        ValueError: If the Pinecone or Google API key cannot be found.
    """
    # 1. SETUP & KEYS
    setup_keyword_file()
    pinecone_key, google_key = _resolve_api_keys()
    # Export so LangChain integrations pick the keys up automatically.
    os.environ["PINECONE_API_KEY"] = pinecone_key
    os.environ["GOOGLE_API_KEY"] = google_key

    # 2. CLOUD VECTOR SEARCH (Pinecone)
    # Finds "concepts" (e.g. searching for 'marriage' finds 'wedding').
    print("🔌 Connecting to Pinecone...")
    embeddings = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004")
    vector_store = PineconeVectorStore(index_name=INDEX_NAME, embedding=embeddings)
    vector_retriever = vector_store.as_retriever(search_kwargs={"k": 5})

    # 3. LOCAL KEYWORD SEARCH (BM25)
    print("🔌 Loading Keyword Search...")
    keyword_retriever = _build_keyword_retriever()

    # 4. HYBRID RETRIEVER (the merge)
    if keyword_retriever:
        print("🔗 Linking Hybrid System...")
        final_retriever = EnsembleRetriever(
            retrievers=[vector_retriever, keyword_retriever],
            weights=[0.7, 0.3],  # 70% Vector, 30% Keyword
        )
    else:
        final_retriever = vector_retriever

    # 5. THE MODEL (Gemini)
    llm = ChatGoogleGenerativeAI(
        model="gemini-1.5-flash",
        temperature=0.3,
        convert_system_message_to_human=True,
    )

    # 6. THE PERSONA PROMPT
    template = """You are William Marion Branham.
INSTRUCTIONS:
- Answer the user's question based ONLY on the context provided below.
- Speak in the first person ("I said," "The Lord showed me").
- Use a humble, 1950s Southern preaching dialect.
- If the answer is not in the text, say: "Brother, I don't recall preaching specifically on that detail in these messages."
CONTEXT:
{context}
USER QUESTION: {question}
BROTHER BRANHAM'S REPLY:"""
    prompt = PromptTemplate(template=template, input_variables=["context", "question"])

    return RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=final_retriever,
        return_source_documents=True,
        chain_type_kwargs={"prompt": prompt},
    )