Adoption committed on
Commit
538f28d
·
verified ·
1 Parent(s): d8fce4e

Update src/app.py

Browse files
Files changed (1) hide show
  1. src/app.py +68 -40
src/app.py CHANGED
@@ -2,9 +2,10 @@ import os
2
  import pickle
3
  import sys
4
  import zipfile
 
5
  from dotenv import load_dotenv
6
 
7
- # --- CLOUD FIX ---
8
  try:
9
  __import__('pysqlite3')
10
  import sys
@@ -12,84 +13,111 @@ try:
12
  except ImportError:
13
  pass
14
 
15
- # --- STANDARD IMPORTS ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  from langchain_core.documents import Document
17
  from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
18
  from langchain_google_genai import HarmBlockThreshold, HarmCategory
 
 
 
 
 
 
 
19
  from langchain_community.retrievers import BM25Retriever
20
- from langchain.retrievers import EnsembleRetriever
21
  from langchain_chroma import Chroma
22
  from langchain.prompts import PromptTemplate
23
  from langchain.chains import RetrievalQA
24
 
25
  load_dotenv()
26
 
27
- # --- PATH SETUP ---
28
- BASE_DIR = os.path.dirname(os.path.abspath(__file__))
29
- DB_PATH = os.path.join(BASE_DIR, "branham_db")
30
- CHUNKS_PATH = os.path.join(BASE_DIR, "sermon_chunks.pkl")
31
-
32
- def check_and_unzip():
33
- """Unzips files only if they are missing."""
34
- # 1. Unzip Database
35
- if os.path.exists("branham_db.zip") and not os.path.exists("branham_db"):
36
- print("📂 Unzipping Database (branham_db.zip)...")
37
- with zipfile.ZipFile("branham_db.zip", 'r') as zip_ref:
38
- zip_ref.extractall(".")
39
- print("✅ Database unzipped.")
40
-
41
- # 2. Unzip Chunks
42
- if os.path.exists("sermon_chunks.zip") and not os.path.exists("sermon_chunks.pkl"):
43
- print("📂 Unzipping Chunks (sermon_chunks.zip)...")
44
- with zipfile.ZipFile("sermon_chunks.zip", 'r') as zip_ref:
45
- zip_ref.extractall(".")
46
- print("✅ Chunks unzipped.")
47
-
48
  def get_rag_chain():
49
  """Initializes the RAG system."""
50
 
51
- # --- CRITICAL: RUN UNZIP HERE, NOT AT TOP LEVEL ---
52
- check_and_unzip()
53
- # --------------------------------------------------
54
 
55
  api_key = os.getenv("GOOGLE_API_KEY")
56
  if not api_key:
57
  raise ValueError("GOOGLE_API_KEY missing. Please set it in Settings > Secrets.")
58
 
59
- # 1. Load Vector DB
60
  embeddings = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004")
 
61
 
62
- if not os.path.exists(DB_PATH):
63
- raise FileNotFoundError(f"Database folder 'branham_db' not found. Unzip failed.")
 
64
 
65
  vector_db = Chroma(
66
- persist_directory=DB_PATH,
67
  embedding_function=embeddings,
68
  collection_name="branham_sermons"
69
  )
70
  vector_retriever = vector_db.as_retriever(search_kwargs={"k": 4})
71
 
72
- # 2. Load Keyword Retriever
73
- if not os.path.exists(CHUNKS_PATH):
74
- raise FileNotFoundError(f"File not found: {CHUNKS_PATH}")
 
 
75
 
76
  try:
77
- with open(CHUNKS_PATH, "rb") as f:
78
  chunks = pickle.load(f)
79
  keyword_retriever = BM25Retriever.from_documents(chunks)
80
  keyword_retriever.k = 4
81
  except Exception as e:
82
- raise RuntimeError(f"Failed to load sermon_chunks.pkl. Error: {e}")
83
 
84
- # 3. Hybrid Search
85
  ensemble_retriever = EnsembleRetriever(
86
  retrievers=[vector_retriever, keyword_retriever],
87
  weights=[0.6, 0.4]
88
  )
89
 
90
- # 4. Gemini Model
91
  llm = ChatGoogleGenerativeAI(
92
- model="gemini-1.5-flash", # Using stable flash for speed
93
  temperature=0.3,
94
  google_api_key=api_key,
95
  safety_settings={
@@ -100,7 +128,7 @@ def get_rag_chain():
100
  }
101
  )
102
 
103
- # 5. The Persona Prompt
104
  template = """You are William Marion Branham. You are answering a question based ONLY on the sermon excerpts provided below.
105
 
106
  INSTRUCTIONS:
 
2
  import pickle
3
  import sys
4
  import zipfile
5
+ import shutil
6
  from dotenv import load_dotenv
7
 
8
+ # --- 1. CLOUD DEPLOYMENT FIX (SQLITE) ---
9
  try:
10
  __import__('pysqlite3')
11
  import sys
 
13
  except ImportError:
14
  pass
15
 
16
+ # --- 2. ROBUST UNZIPPER (Runs inside get_rag_chain) ---
17
+ BASE_DIR = os.path.dirname(os.path.abspath(__file__))
18
+ DB_FOLDER_NAME = "branham_db"
19
+ DB_ZIP_NAME = "branham_db.zip"
20
+ CHUNKS_FILE_NAME = "sermon_chunks.pkl"
21
+ CHUNKS_ZIP_NAME = "sermon_chunks.zip"
22
+
23
+ def setup_files():
24
+ """Ensures database and chunk files are ready."""
25
+ print(f"📂 Setup: Checking files in {BASE_DIR}")
26
+
27
+ # A. Handle Database
28
+ db_path = os.path.join(BASE_DIR, DB_FOLDER_NAME)
29
+ zip_path = os.path.join(BASE_DIR, DB_ZIP_NAME)
30
+
31
+ if not os.path.exists(db_path):
32
+ if os.path.exists(zip_path):
33
+ print(f"🚀 Found {DB_ZIP_NAME}. Unzipping...")
34
+ with zipfile.ZipFile(zip_path, 'r') as zip_ref:
35
+ zip_ref.extractall(BASE_DIR)
36
+ print("✅ Database unzipped.")
37
+ else:
38
+ print(f"⚠️ WARNING: Neither '{DB_FOLDER_NAME}' folder nor '{DB_ZIP_NAME}' found.")
39
+ # Fallback check: Did you verify the zip name on Hugging Face?
40
+ print(f"Files available: {os.listdir(BASE_DIR)}")
41
+
42
+ # B. Handle Chunks
43
+ chunks_path = os.path.join(BASE_DIR, CHUNKS_FILE_NAME)
44
+ chunks_zip_path = os.path.join(BASE_DIR, CHUNKS_ZIP_NAME)
45
+
46
+ if not os.path.exists(chunks_path):
47
+ if os.path.exists(chunks_zip_path):
48
+ print(f"🚀 Found {CHUNKS_ZIP_NAME}. Unzipping...")
49
+ with zipfile.ZipFile(chunks_zip_path, 'r') as zip_ref:
50
+ zip_ref.extractall(BASE_DIR)
51
+ print("✅ Chunks unzipped.")
52
+ else:
53
+ print(f"⚠️ WARNING: '{CHUNKS_ZIP_NAME}' not found.")
54
+
55
+ # --- 3. STANDARD IMPORTS ---
56
  from langchain_core.documents import Document
57
  from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
58
  from langchain_google_genai import HarmBlockThreshold, HarmCategory
59
+
60
+ # LangChain Import Fix (Handles Version 0.2 vs 0.3)
61
+ try:
62
+ from langchain.retrievers import EnsembleRetriever
63
+ except ImportError:
64
+ from langchain_community.retrievers import EnsembleRetriever
65
+
66
  from langchain_community.retrievers import BM25Retriever
 
67
  from langchain_chroma import Chroma
68
  from langchain.prompts import PromptTemplate
69
  from langchain.chains import RetrievalQA
70
 
71
  load_dotenv()
72
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73
  def get_rag_chain():
74
  """Initializes the RAG system."""
75
 
76
+ # 1. Run Setup (Unzip files if needed)
77
+ setup_files()
 
78
 
79
  api_key = os.getenv("GOOGLE_API_KEY")
80
  if not api_key:
81
  raise ValueError("GOOGLE_API_KEY missing. Please set it in Settings > Secrets.")
82
 
83
+ # 2. Load Vector DB
84
  embeddings = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004")
85
+ db_full_path = os.path.join(BASE_DIR, DB_FOLDER_NAME)
86
 
87
+ if not os.path.exists(db_full_path):
88
+ # Detailed error for debugging
89
+ raise FileNotFoundError(f"Database folder '{DB_FOLDER_NAME}' not found. Zip extraction might have failed or created a nested folder. Files in root: {os.listdir(BASE_DIR)}")
90
 
91
  vector_db = Chroma(
92
+ persist_directory=db_full_path,
93
  embedding_function=embeddings,
94
  collection_name="branham_sermons"
95
  )
96
  vector_retriever = vector_db.as_retriever(search_kwargs={"k": 4})
97
 
98
+ # 3. Load Keyword Retriever
99
+ chunks_full_path = os.path.join(BASE_DIR, CHUNKS_FILE_NAME)
100
+
101
+ if not os.path.exists(chunks_full_path):
102
+ raise FileNotFoundError(f"File not found: {CHUNKS_FILE_NAME}. Did '{CHUNKS_ZIP_NAME}' unzip correctly?")
103
 
104
  try:
105
+ with open(chunks_full_path, "rb") as f:
106
  chunks = pickle.load(f)
107
  keyword_retriever = BM25Retriever.from_documents(chunks)
108
  keyword_retriever.k = 4
109
  except Exception as e:
110
+ raise RuntimeError(f"Failed to load {CHUNKS_FILE_NAME}. Error: {e}")
111
 
112
+ # 4. Hybrid Search
113
  ensemble_retriever = EnsembleRetriever(
114
  retrievers=[vector_retriever, keyword_retriever],
115
  weights=[0.6, 0.4]
116
  )
117
 
118
+ # 5. Gemini Model
119
  llm = ChatGoogleGenerativeAI(
120
+ model="gemini-1.5-flash",
121
  temperature=0.3,
122
  google_api_key=api_key,
123
  safety_settings={
 
128
  }
129
  )
130
 
131
+ # 6. The Persona Prompt
132
  template = """You are William Marion Branham. You are answering a question based ONLY on the sermon excerpts provided below.
133
 
134
  INSTRUCTIONS: