# Adoption's picture
# Update src/app.py
# 538f28d verified
import os
import pickle
import sys
import zipfile
import shutil
from dotenv import load_dotenv
# --- 1. CLOUD DEPLOYMENT FIX (SQLITE) ---
# Some cloud hosts ship a system sqlite3 that is too old for Chroma
# (presumably why this shim exists — confirm against the deployment docs).
# If the pysqlite3 wheel is available, alias it in as the stdlib `sqlite3`
# module BEFORE anything imports sqlite3. `sys` is already imported above,
# so no inner `import sys` is needed.
try:
    __import__('pysqlite3')
    sys.modules['sqlite3'] = sys.modules.pop('pysqlite3')
except ImportError:
    # pysqlite3 not installed (e.g. local dev with a modern sqlite) —
    # silently fall back to the stdlib module.
    pass
# --- 2. ROBUST UNZIPPER (Runs inside get_rag_chain) ---
# All data files are resolved relative to this source file's directory,
# not the process CWD, so the app works regardless of where it is launched.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
DB_FOLDER_NAME = "branham_db"          # unpacked Chroma persist directory
DB_ZIP_NAME = "branham_db.zip"         # shipped archive of the Chroma DB
CHUNKS_FILE_NAME = "sermon_chunks.pkl"  # pickled Document chunks for BM25
CHUNKS_ZIP_NAME = "sermon_chunks.zip"   # shipped archive of the pickle
def _extract_archive(zip_path, zip_name, success_msg):
    """Unzip *zip_path* into BASE_DIR, printing progress.

    NOTE(review): extractall() trusts the archive's member paths (zip-slip);
    these archives ship with the app, so that is acceptable here — do not
    reuse this helper for user-supplied zips.
    """
    print(f"🚀 Found {zip_name}. Unzipping...")
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(BASE_DIR)
    print(success_msg)

def setup_files():
    """Ensures database and chunk files are ready.

    Extracts the Chroma database folder and the pickled sermon chunks
    from their zip archives into BASE_DIR when the unpacked versions
    are not already present. Missing archives only print a warning —
    the hard failure is raised later by get_rag_chain().
    """
    print(f"📂 Setup: Checking files in {BASE_DIR}")
    # A. Handle Database
    db_path = os.path.join(BASE_DIR, DB_FOLDER_NAME)
    zip_path = os.path.join(BASE_DIR, DB_ZIP_NAME)
    if not os.path.exists(db_path):
        if os.path.exists(zip_path):
            _extract_archive(zip_path, DB_ZIP_NAME, "✅ Database unzipped.")
        else:
            print(f"⚠️ WARNING: Neither '{DB_FOLDER_NAME}' folder nor '{DB_ZIP_NAME}' found.")
            # Help debugging on the deployment host: show what actually shipped.
            print(f"Files available: {os.listdir(BASE_DIR)}")
    # B. Handle Chunks
    chunks_path = os.path.join(BASE_DIR, CHUNKS_FILE_NAME)
    chunks_zip_path = os.path.join(BASE_DIR, CHUNKS_ZIP_NAME)
    if not os.path.exists(chunks_path):
        if os.path.exists(chunks_zip_path):
            _extract_archive(chunks_zip_path, CHUNKS_ZIP_NAME, "✅ Chunks unzipped.")
        else:
            print(f"⚠️ WARNING: '{CHUNKS_ZIP_NAME}' not found.")
# --- 3. STANDARD IMPORTS ---
from langchain_core.documents import Document
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from langchain_google_genai import HarmBlockThreshold, HarmCategory
# LangChain Import Fix (Handles Version 0.2 vs 0.3)
try:
from langchain.retrievers import EnsembleRetriever
except ImportError:
from langchain_community.retrievers import EnsembleRetriever
from langchain_community.retrievers import BM25Retriever
from langchain_chroma import Chroma
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
load_dotenv()  # pull GOOGLE_API_KEY (and any other secrets) from a local .env, if present
def get_rag_chain():
    """Initializes the RAG system.

    Builds and returns a RetrievalQA chain that answers questions in the
    persona defined by the prompt template, using a hybrid retriever
    (Chroma vector search + BM25 keyword search) over the sermon corpus.

    Returns:
        A RetrievalQA chain; invoking it yields the answer plus the
        source documents (return_source_documents=True).

    Raises:
        ValueError: GOOGLE_API_KEY is not set in the environment.
        FileNotFoundError: the unpacked DB folder or chunks pickle is missing.
        RuntimeError: the chunks pickle exists but cannot be loaded.
    """
    # 1. Run Setup (Unzip files if needed)
    setup_files()
    api_key = os.getenv("GOOGLE_API_KEY")
    if not api_key:
        raise ValueError("GOOGLE_API_KEY missing. Please set it in Settings > Secrets.")
    # 2. Load Vector DB
    embeddings = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004")
    db_full_path = os.path.join(BASE_DIR, DB_FOLDER_NAME)
    if not os.path.exists(db_full_path):
        # Detailed error for debugging
        raise FileNotFoundError(f"Database folder '{DB_FOLDER_NAME}' not found. Zip extraction might have failed or created a nested folder. Files in root: {os.listdir(BASE_DIR)}")
    vector_db = Chroma(
        persist_directory=db_full_path,
        embedding_function=embeddings,
        collection_name="branham_sermons"
    )
    vector_retriever = vector_db.as_retriever(search_kwargs={"k": 4})
    # 3. Load Keyword Retriever
    chunks_full_path = os.path.join(BASE_DIR, CHUNKS_FILE_NAME)
    if not os.path.exists(chunks_full_path):
        raise FileNotFoundError(f"File not found: {CHUNKS_FILE_NAME}. Did '{CHUNKS_ZIP_NAME}' unzip correctly?")
    try:
        # SECURITY: pickle.load executes arbitrary code from the file.
        # Acceptable only because this pickle ships with the app — never
        # point this at user-supplied data.
        with open(chunks_full_path, "rb") as f:
            chunks = pickle.load(f)
        keyword_retriever = BM25Retriever.from_documents(chunks)
        keyword_retriever.k = 4
    except Exception as e:
        # Chain the cause so the original traceback is preserved.
        raise RuntimeError(f"Failed to load {CHUNKS_FILE_NAME}. Error: {e}") from e
    # 4. Hybrid Search: blend semantic (0.6) and keyword (0.4) rankings.
    ensemble_retriever = EnsembleRetriever(
        retrievers=[vector_retriever, keyword_retriever],
        weights=[0.6, 0.4]
    )
    # 5. Gemini Model — relaxed safety thresholds so sermon language
    # (religious themes) is not over-blocked; only high-severity content
    # is still filtered.
    llm = ChatGoogleGenerativeAI(
        model="gemini-1.5-flash",
        temperature=0.3,
        google_api_key=api_key,
        safety_settings={
            HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_ONLY_HIGH,
            HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_ONLY_HIGH,
            HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_ONLY_HIGH,
            HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_ONLY_HIGH,
        }
    )
    # 6. The Persona Prompt
    template = """You are William Marion Branham. You are answering a question based ONLY on the sermon excerpts provided below.
INSTRUCTIONS:
- Speak in the first person ("I said," "The Lord showed me").
- Use a humble, 1950s Southern preaching dialect.
- If the answer is not in the text, say: "Brother, I don't recall preaching specifically on that detail in these messages."
- Always refer to the Bible as the absolute authority.
CONTEXT MESSAGES:
{context}
USER QUESTION: {question}
BROTHER BRANHAM'S REPLY:"""
    PROMPT = PromptTemplate(template=template, input_variables=["context", "question"])
    # "stuff" chain type: all retrieved chunks are concatenated into one prompt.
    chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=ensemble_retriever,
        return_source_documents=True,
        chain_type_kwargs={"prompt": PROMPT}
    )
    return chain