stevafernandes committed on
Commit
a8538b1
·
verified ·
1 Parent(s): 60f48e3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +35 -114
app.py CHANGED
@@ -1,56 +1,16 @@
1
  import streamlit as st
2
- from PyPDF2 import PdfReader
3
- from io import BytesIO
4
  import os
5
- import tempfile
6
 
7
- # Updated imports for current LangChain
8
- from langchain_text_splitters import RecursiveCharacterTextSplitter
9
  from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
10
  from langchain_community.vectorstores import FAISS
11
  from langchain_core.prompts import PromptTemplate
12
  from langchain_core.output_parsers import StrOutputParser
13
- from langchain_core.runnables import RunnablePassthrough
14
 
15
  # --- Configuration ---
16
- # Get API key from Hugging Face Secrets
17
  GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY", "")
18
 
19
- # Path to your pre-uploaded PDF file in the Hugging Face Space
20
- # Place your PDF in the same directory as app.py or specify a subdirectory
21
- PDF_FILE_PATH = "Papal_Encyclicals.pdf" # Change this to your PDF filename
22
-
23
- # Use temporary directory for FAISS index
24
- TEMP_DIR = tempfile.gettempdir()
25
- FAISS_INDEX_PATH = os.path.join(TEMP_DIR, "faiss_index")
26
-
27
-
28
def get_pdf_text(pdf_path):
    """Extract and concatenate the text of every page in the PDF at *pdf_path*.

    Returns the joined page text, or "" if the file cannot be read (the
    error is surfaced to the user via st.error rather than raised).
    """
    try:
        reader = PdfReader(pdf_path)
        # extract_text() may return None for image-only pages; skip those.
        page_texts = [page.extract_text() for page in reader.pages]
        return "".join(t for t in page_texts if t)
    except Exception as exc:
        st.error(f"Error reading PDF: {str(exc)}")
        return ""
41
-
42
-
43
def get_text_chunks(text):
    """Break raw document text into large overlapping chunks for embedding."""
    return RecursiveCharacterTextSplitter(
        chunk_size=10000,
        chunk_overlap=1000,
    ).split_text(text)
47
-
48
-
49
def get_vector_store(text_chunks, api_key):
    """Embed *text_chunks* with Google embeddings and persist a FAISS index.

    The index is written to FAISS_INDEX_PATH on disk; nothing is returned.
    """
    embedder = GoogleGenerativeAIEmbeddings(
        model="models/embedding-001",
        google_api_key=api_key,
    )
    FAISS.from_texts(text_chunks, embedding=embedder).save_local(FAISS_INDEX_PATH)
54
 
55
 
56
  def get_conversational_chain(api_key):
@@ -72,7 +32,7 @@ def get_conversational_chain(api_key):
72
 
73
  Answer (based only on the context above):
74
  """
75
- model = ChatGoogleGenerativeAI(model="gemini-3-flash-preview", temperature=0, google_api_key=api_key)
76
  prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question"])
77
 
78
  chain = prompt | model | StrOutputParser()
@@ -84,11 +44,9 @@ def format_docs(docs):
84
  return "\n\n".join(doc.page_content for doc in docs)
85
 
86
 
87
- def user_input(user_question, api_key):
88
  """Process user question and return answer from the PDF context."""
89
- embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001", google_api_key=api_key)
90
- new_db = FAISS.load_local(FAISS_INDEX_PATH, embeddings, allow_dangerous_deserialization=True)
91
- docs = new_db.similarity_search(user_question)
92
 
93
  chain = get_conversational_chain(api_key)
94
  context = format_docs(docs)
@@ -97,21 +55,18 @@ def user_input(user_question, api_key):
97
 
98
 
99
  @st.cache_resource
100
- def initialize_vector_store(_api_key):
101
- """Initialize the vector store from the pre-uploaded PDF (cached)."""
102
- if not os.path.exists(PDF_FILE_PATH):
103
- return False, f"PDF file not found at: {PDF_FILE_PATH}"
104
-
105
- try:
106
- raw_text = get_pdf_text(PDF_FILE_PATH)
107
- if not raw_text.strip():
108
- return False, "No text could be extracted from the PDF."
109
-
110
- text_chunks = get_text_chunks(raw_text)
111
- get_vector_store(text_chunks, _api_key)
112
- return True, "PDF processed successfully!"
113
- except Exception as e:
114
- return False, f"Error processing PDF: {str(e)}"
115
 
116
 
117
  def main():
@@ -122,29 +77,25 @@ def main():
122
  initial_sidebar_state="collapsed"
123
  )
124
 
125
- # Custom CSS for clean, professional appearance
126
  st.markdown(
127
  """
128
  <style>
129
- /* Hide Streamlit header, footer, and menu */
130
  #MainMenu {visibility: hidden;}
131
  header {visibility: hidden;}
132
  footer {visibility: hidden;}
133
  .stDeployButton {display: none;}
134
 
135
- /* Remove top padding caused by hidden header */
136
  .block-container {
137
  padding-top: 2rem;
138
  padding-bottom: 2rem;
139
  max-width: 800px;
140
  }
141
 
142
- /* Clean white background */
143
  .stApp {
144
  background-color: #ffffff;
145
  }
146
 
147
- /* Typography */
148
  .main-title {
149
  font-size: 2.5rem;
150
  font-weight: 600;
@@ -161,21 +112,11 @@ def main():
161
  margin-bottom: 2rem;
162
  }
163
 
164
- /* Success message styling */
165
- .stSuccess {
166
- background-color: #f0f9f4;
167
- border: 1px solid #86efac;
168
- border-radius: 8px;
169
- padding: 0.75rem 1rem;
170
- }
171
-
172
- /* Input field styling */
173
  .stTextInput > div > div > input {
174
  border: 1px solid #e0e0e0;
175
  border-radius: 8px;
176
  padding: 0.75rem 1rem;
177
  font-size: 1rem;
178
- transition: border-color 0.2s ease;
179
  }
180
 
181
  .stTextInput > div > div > input:focus {
@@ -183,16 +124,6 @@ def main():
183
  box-shadow: 0 0 0 2px rgba(74, 144, 217, 0.1);
184
  }
185
 
186
- /* Section headers */
187
- .section-header {
188
- font-size: 1.1rem;
189
- font-weight: 500;
190
- color: #333333;
191
- margin-top: 1.5rem;
192
- margin-bottom: 1rem;
193
- }
194
-
195
- /* Answer box styling */
196
  .answer-container {
197
  background-color: #fafafa;
198
  border: 1px solid #e8e8e8;
@@ -216,14 +147,6 @@ def main():
216
  line-height: 1.7;
217
  }
218
 
219
- /* Divider */
220
- hr {
221
- border: none;
222
- border-top: 1px solid #eaeaea;
223
- margin: 1.5rem 0;
224
- }
225
-
226
- /* Status indicator */
227
  .status-badge {
228
  display: inline-flex;
229
  align-items: center;
@@ -244,7 +167,6 @@ def main():
244
  border-radius: 50%;
245
  }
246
 
247
- /* Hide label for cleaner look */
248
  .stTextInput label {
249
  font-size: 0.95rem;
250
  color: #444444;
@@ -261,33 +183,32 @@ def main():
261
  st.markdown('<p class="subtitle">Ask questions about papal encyclicals and get answers based on the source document</p>', unsafe_allow_html=True)
262
 
263
  # Check for API key
264
- api_key = GOOGLE_API_KEY
265
-
266
- if not api_key:
267
  st.error("Google API Key not found in environment variables.")
268
  st.info("Please add GOOGLE_API_KEY to your Hugging Face Space secrets.")
269
  st.stop()
270
 
271
- # Check if PDF exists
272
- if not os.path.exists(PDF_FILE_PATH):
273
- st.error(f"PDF file not found: {PDF_FILE_PATH}")
274
- st.info("Please upload your PDF file to the Hugging Face Space repository.")
 
275
  st.stop()
276
 
277
- # Initialize vector store (cached, runs only once)
278
- with st.spinner("Loading document..."):
279
- success, message = initialize_vector_store(api_key)
280
-
281
- if not success:
282
- st.error(message)
283
- st.stop()
284
 
285
- # Display status badge
286
  st.markdown(
287
- f'''
288
  <div class="status-badge">
289
  <span class="status-dot"></span>
290
- Document loaded: {PDF_FILE_PATH}
291
  </div>
292
  ''',
293
  unsafe_allow_html=True
@@ -304,7 +225,7 @@ def main():
304
  if user_question:
305
  with st.spinner("Searching for answer..."):
306
  try:
307
- answer = user_input(user_question, api_key)
308
  st.markdown(
309
  f'''
310
  <div class="answer-container">
 
1
  import streamlit as st
 
 
2
  import os
 
3
 
 
 
4
  from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
5
  from langchain_community.vectorstores import FAISS
6
  from langchain_core.prompts import PromptTemplate
7
  from langchain_core.output_parsers import StrOutputParser
 
8
 
9
  # --- Configuration ---
 
10
  GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY", "")
11
 
12
+ # Path to pre-built FAISS index in the repo
13
+ FAISS_INDEX_PATH = "faiss_index"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
 
15
 
16
  def get_conversational_chain(api_key):
 
32
 
33
  Answer (based only on the context above):
34
  """
35
+ model = ChatGoogleGenerativeAI(model="gemini-2.0-flash", temperature=0, google_api_key=api_key)
36
  prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question"])
37
 
38
  chain = prompt | model | StrOutputParser()
 
44
  return "\n\n".join(doc.page_content for doc in docs)
45
 
46
 
47
+ def user_input(user_question, vector_store, api_key):
48
  """Process user question and return answer from the PDF context."""
49
+ docs = vector_store.similarity_search(user_question)
 
 
50
 
51
  chain = get_conversational_chain(api_key)
52
  context = format_docs(docs)
 
55
 
56
 
57
@st.cache_resource
def load_vector_store(_api_key):
    """Load the pre-built FAISS index from disk (cached across reruns).

    NOTE: the leading underscore on `_api_key` tells Streamlit's cache not
    to hash this argument when building the cache key.
    """
    embedder = GoogleGenerativeAIEmbeddings(
        model="models/embedding-001",
        google_api_key=_api_key,
    )
    # The index is trusted repo content, hence the explicit opt-in to
    # pickle deserialization required by newer LangChain versions.
    return FAISS.load_local(
        FAISS_INDEX_PATH,
        embedder,
        allow_dangerous_deserialization=True,
    )
 
 
 
70
 
71
 
72
  def main():
 
77
  initial_sidebar_state="collapsed"
78
  )
79
 
80
+ # Custom CSS
81
  st.markdown(
82
  """
83
  <style>
 
84
  #MainMenu {visibility: hidden;}
85
  header {visibility: hidden;}
86
  footer {visibility: hidden;}
87
  .stDeployButton {display: none;}
88
 
 
89
  .block-container {
90
  padding-top: 2rem;
91
  padding-bottom: 2rem;
92
  max-width: 800px;
93
  }
94
 
 
95
  .stApp {
96
  background-color: #ffffff;
97
  }
98
 
 
99
  .main-title {
100
  font-size: 2.5rem;
101
  font-weight: 600;
 
112
  margin-bottom: 2rem;
113
  }
114
 
 
 
 
 
 
 
 
 
 
115
  .stTextInput > div > div > input {
116
  border: 1px solid #e0e0e0;
117
  border-radius: 8px;
118
  padding: 0.75rem 1rem;
119
  font-size: 1rem;
 
120
  }
121
 
122
  .stTextInput > div > div > input:focus {
 
124
  box-shadow: 0 0 0 2px rgba(74, 144, 217, 0.1);
125
  }
126
 
 
 
 
 
 
 
 
 
 
 
127
  .answer-container {
128
  background-color: #fafafa;
129
  border: 1px solid #e8e8e8;
 
147
  line-height: 1.7;
148
  }
149
 
 
 
 
 
 
 
 
 
150
  .status-badge {
151
  display: inline-flex;
152
  align-items: center;
 
167
  border-radius: 50%;
168
  }
169
 
 
170
  .stTextInput label {
171
  font-size: 0.95rem;
172
  color: #444444;
 
183
  st.markdown('<p class="subtitle">Ask questions about papal encyclicals and get answers based on the source document</p>', unsafe_allow_html=True)
184
 
185
  # Check for API key
186
+ if not GOOGLE_API_KEY:
 
 
187
  st.error("Google API Key not found in environment variables.")
188
  st.info("Please add GOOGLE_API_KEY to your Hugging Face Space secrets.")
189
  st.stop()
190
 
191
+ # Check if FAISS index exists
192
+ index_file = os.path.join(FAISS_INDEX_PATH, "index.faiss")
193
+ if not os.path.exists(index_file):
194
+ st.error(f"FAISS index not found at: {FAISS_INDEX_PATH}/")
195
+ st.info("Please upload index.faiss and index.pkl to the faiss_index folder.")
196
  st.stop()
197
 
198
+ # Load vector store (cached)
199
+ with st.spinner("Loading index..."):
200
+ try:
201
+ vector_store = load_vector_store(GOOGLE_API_KEY)
202
+ except Exception as e:
203
+ st.error(f"Error loading index: {str(e)}")
204
+ st.stop()
205
 
206
+ # Status badge
207
  st.markdown(
208
+ '''
209
  <div class="status-badge">
210
  <span class="status-dot"></span>
211
+ Document ready
212
  </div>
213
  ''',
214
  unsafe_allow_html=True
 
225
  if user_question:
226
  with st.spinner("Searching for answer..."):
227
  try:
228
+ answer = user_input(user_question, vector_store, GOOGLE_API_KEY)
229
  st.markdown(
230
  f'''
231
  <div class="answer-container">