msmaje committed on
Commit
08b18a5
·
verified ·
1 Parent(s): c28a344

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +455 -0
app.py ADDED
@@ -0,0 +1,455 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import os
3
+ import tempfile
4
+ import shutil
5
+ from pathlib import Path
6
+ import logging
7
+ import zipfile
8
+
9
# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# The LangChain stack is imported defensively: when it is missing the module
# still imports, and the processing functions report the problem to the user
# through LANGCHAIN_AVAILABLE instead of crashing at start-up.
try:
    from langchain_community.document_loaders import PyPDFDirectoryLoader, PyPDFLoader
    from langchain.text_splitter import RecursiveCharacterTextSplitter
    from langchain_community.embeddings import HuggingFaceEmbeddings
    from langchain_community.vectorstores import FAISS
    from langchain.prompts import PromptTemplate
    from langchain.chains import RetrievalQA
    from langchain_community.llms import HuggingFaceHub
    LANGCHAIN_AVAILABLE = True
except ImportError as e:
    logger.error(f"LangChain import error: {e}")
    LANGCHAIN_AVAILABLE = False

# Global variables for the RAG system (populated by the processing functions)
vectorstore = None
retrieval_qa = None
embedding_model = None

# Default folder for PDFs shipped with the space
PDF_FOLDER_PATH = "./pdfs"


def folder_has_pdfs(folder):
    """Return True when *folder* is a readable directory containing a ``.pdf`` file.

    A missing path, a path that is not a directory, or a directory that only
    holds other files (for example a ``.gitkeep`` placeholder) yields False.
    The suffix test is the same ``endswith('.pdf')`` check the rest of this
    file uses when it lists the folder.
    """
    try:
        return any(name.endswith(".pdf") for name in os.listdir(folder))
    except OSError:
        # Covers FileNotFoundError, NotADirectoryError and permission errors.
        return False


# Evaluated once at import; extract_zip_to_pdfs flips it to True after a
# successful extraction.
PRELOADED_PDFS = folder_has_pdfs(PDF_FOLDER_PATH)
34
+
35
def initialize_models():
    """Validate the Hugging Face token and create the shared embedding model.

    The token is checked *before* the embedding model is stored in the
    module-level ``embedding_model``. Callers only invoke this function while
    ``embedding_model`` is None, so storing the model first would let every
    later attempt skip the token check.

    The LLM itself is built by the processing functions, so no LLM is
    constructed here.

    Returns:
        tuple[bool, str]: ``(True, message)`` on success, ``(False, message)``
        when the token is missing or the embedding model cannot be created.
    """
    global embedding_model

    # Get HuggingFace token from environment
    hf_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")
    if not hf_token:
        return False, "❌ HuggingFace API token not found in environment variables"

    try:
        # Initialize embedding model (CPU only)
        embedding_model = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2",
            model_kwargs={"device": "cpu"},
        )
    except Exception as e:
        logger.error(f"Model initialization error: {e}")
        return False, f"❌ Error initializing models: {str(e)}"

    return True, "✅ Models initialized successfully"
63
+
64
def load_preloaded_pdfs(chunk_size=1000, chunk_overlap=200):
    """Index the PDFs in ``PDF_FOLDER_PATH`` and build the question-answering chain.

    Args:
        chunk_size: Maximum characters per text chunk (converted with ``int``).
        chunk_overlap: Characters shared between consecutive chunks
            (converted with ``int``); must not exceed ``chunk_size``.

    Returns:
        str: A status message for the UI. Failures are reported in the
        message rather than raised.

    Side effects:
        On success replaces the module-level ``vectorstore`` and
        ``retrieval_qa``. Both are assigned only after the whole pipeline has
        been built, so a failure part-way through leaves the previous working
        pair untouched.
    """
    global vectorstore, retrieval_qa, embedding_model

    if not LANGCHAIN_AVAILABLE:
        return "❌ LangChain is not available. Please check the installation."

    if not PRELOADED_PDFS:
        return "❌ No pre-loaded PDFs found in ./pdfs folder."

    try:
        chunk_size = int(chunk_size)
        chunk_overlap = int(chunk_overlap)
        # The UI sliders allow overlap (up to 500) above chunk size (down to
        # 200); report that clearly instead of relying on a splitter error.
        if chunk_overlap > chunk_size:
            return "❌ Chunk overlap cannot be larger than chunk size."

        # Checked on every call: initialize_models() is skipped once the
        # embedding model is cached, so its token check cannot be relied on.
        hf_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")
        if not hf_token:
            return "❌ HuggingFace API token not found in environment variables"

        # Initialize models if not already done
        if embedding_model is None:
            success, message = initialize_models()
            if not success:
                return message

        # Load documents from pre-existing folder
        documents = PyPDFDirectoryLoader(PDF_FOLDER_PATH).load()
        if not documents:
            return "❌ No documents were loaded from the PDFs folder."

        # Split documents into chunks
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
        )
        chunks = text_splitter.split_documents(documents)
        if not chunks:
            # Nothing to embed; building an index from an empty list would fail.
            return "❌ No text could be extracted from the PDFs."

        # Build everything in locals first; globals are published at the end.
        new_vectorstore = FAISS.from_documents(chunks, embedding_model)
        retriever = new_vectorstore.as_retriever(search_kwargs={"k": 5})

        # Setup prompt template
        prompt_template = (
            "\n"
            'Use the following context to answer the question. If you cannot find the answer in the context, say "I don\'t have enough information to answer this question."\n'
            "\n"
            "Context:\n"
            "{context}\n"
            "\n"
            "Question: {question}\n"
            "\n"
            "Helpful Answer:\n"
        )
        prompt = PromptTemplate(
            input_variables=["context", "question"],
            template=prompt_template,
        )

        # Initialize LLM
        llm = HuggingFaceHub(
            repo_id="google/flan-t5-base",
            model_kwargs={"temperature": 0.7, "max_new_tokens": 512},
            huggingfacehub_api_token=hf_token,
        )

        # Create RetrievalQA chain
        new_retrieval_qa = RetrievalQA.from_chain_type(
            llm=llm,
            chain_type="stuff",
            retriever=retriever,
            return_source_documents=True,
            chain_type_kwargs={"prompt": prompt},
        )

        vectorstore = new_vectorstore
        retrieval_qa = new_retrieval_qa

        pdf_count = sum(1 for f in os.listdir(PDF_FOLDER_PATH) if f.endswith('.pdf'))
        return f"✅ Successfully processed {len(documents)} documents from {pdf_count} PDF files into {len(chunks)} chunks. Ready for questions!"

    except Exception as e:
        logger.error(f"Pre-loaded PDF processing error: {e}")
        return f"❌ Error processing pre-loaded PDFs: {str(e)}"
138
+
139
def extract_zip_to_pdfs(zip_file):
    """Copy every PDF inside an uploaded ZIP archive into ``PDF_FOLDER_PATH``.

    Each PDF member is streamed directly to ``PDF_FOLDER_PATH/<base name>``,
    so the archive's folder structure is flattened. Member paths are never
    joined onto the destination, which means no nested directories are left
    behind and a crafted member name cannot write outside the PDFs folder.

    Skipped members: directory entries, files whose suffix is not ``.pdf``
    (case-insensitive), hidden files (leading ``.``) and anything under a
    ``__MACOSX`` folder. The last two are macOS archive metadata, not real
    PDFs.

    The stored suffix is normalised to lowercase ``.pdf``.
    NOTE(review): this assumes the directory loader's default glob only
    matches a lowercase ``.pdf`` suffix on case-sensitive filesystems; confirm
    against the installed loader version.

    If two members share a base name, the later one overwrites the earlier.

    Args:
        zip_file: The uploaded file. Either an object exposing the path as
            ``.name`` or a plain path string.

    Returns:
        str: A status message for the UI. Failures are reported in the
        message rather than raised.

    Side effects:
        Creates ``PDF_FOLDER_PATH`` if needed and sets the module-level
        ``PRELOADED_PDFS`` to True after at least one PDF was written.
    """
    global PRELOADED_PDFS

    if not zip_file:
        return "❌ Please upload a ZIP file."

    # Accept both upload objects (path in .name) and bare path strings.
    zip_path = getattr(zip_file, "name", zip_file)

    try:
        # Create PDFs directory if it doesn't exist
        os.makedirs(PDF_FOLDER_PATH, exist_ok=True)

        written = set()
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            for member in zip_ref.infolist():
                if member.is_dir():
                    continue

                # ZIP member names use "/", but tolerate "\" from odd tools.
                parts = member.filename.replace("\\", "/").split("/")
                base_name = parts[-1]
                stem, ext = os.path.splitext(base_name)

                if ext.lower() != ".pdf" or not stem:
                    continue
                if base_name.startswith(".") or "__MACOSX" in parts:
                    continue

                target_name = stem + ".pdf"
                target_path = os.path.join(PDF_FOLDER_PATH, target_name)
                with zip_ref.open(member) as src, open(target_path, "wb") as dst:
                    shutil.copyfileobj(src, dst)
                written.add(target_name)

        if not written:
            return "❌ No PDF files found in the ZIP archive."

        PRELOADED_PDFS = True

        return f"✅ Successfully extracted {len(written)} PDF files. Now click 'Load Pre-existing PDFs' to process them."

    except Exception as e:
        logger.error(f"ZIP extraction error: {e}")
        return f"❌ Error extracting ZIP file: {str(e)}"
178
def process_pdfs(pdf_files, chunk_size, chunk_overlap):
    """Index uploaded PDF files and build the question-answering chain.

    Args:
        pdf_files: Uploaded files. Each item is either an object exposing its
            path as ``.name`` or a plain path string; ``None`` items are
            skipped.
        chunk_size: Maximum characters per text chunk (converted with ``int``).
        chunk_overlap: Characters shared between consecutive chunks
            (converted with ``int``); must not exceed ``chunk_size``.

    Returns:
        str: A status message for the UI. Failures are reported in the
        message rather than raised.

    Side effects:
        On success replaces the module-level ``vectorstore`` and
        ``retrieval_qa``. Both are assigned only after the whole pipeline has
        been built, so a failure part-way through leaves the previous working
        pair untouched. The uploads are staged in a temporary directory that
        is removed on every exit path.
    """
    global vectorstore, retrieval_qa, embedding_model

    if not LANGCHAIN_AVAILABLE:
        return "❌ LangChain is not available. Please check the installation."

    if not pdf_files:
        return "❌ Please upload at least one PDF file or use pre-loaded PDFs."

    try:
        chunk_size = int(chunk_size)
        chunk_overlap = int(chunk_overlap)
        # The UI sliders allow overlap (up to 500) above chunk size (down to
        # 200); report that clearly instead of relying on a splitter error.
        if chunk_overlap > chunk_size:
            return "❌ Chunk overlap cannot be larger than chunk size."

        # Checked on every call: initialize_models() is skipped once the
        # embedding model is cached, so its token check cannot be relied on.
        hf_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")
        if not hf_token:
            return "❌ HuggingFace API token not found in environment variables"

        # Initialize models if not already done
        if embedding_model is None:
            success, message = initialize_models()
            if not success:
                return message

        # The context manager removes the staging directory even on early
        # return or exception.
        with tempfile.TemporaryDirectory() as temp_dir:
            for pdf_file in pdf_files:
                if pdf_file is None:
                    continue
                source_path = getattr(pdf_file, "name", pdf_file)
                file_name = os.path.basename(source_path)
                stem, ext = os.path.splitext(file_name)
                # NOTE(review): lowercase the suffix on the assumption that the
                # directory loader's default glob only matches ".pdf" on
                # case-sensitive filesystems; confirm against the loader.
                if ext.lower() == ".pdf":
                    file_name = stem + ".pdf"
                shutil.copy2(source_path, os.path.join(temp_dir, file_name))

            # Load documents while the staged files still exist
            documents = PyPDFDirectoryLoader(temp_dir).load()

        if not documents:
            return "❌ No documents were loaded. Please check your PDF files."

        # Split documents into chunks
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
        )
        chunks = text_splitter.split_documents(documents)
        if not chunks:
            # Nothing to embed; building an index from an empty list would fail.
            return "❌ No text could be extracted from the PDFs."

        # Build everything in locals first; globals are published at the end.
        new_vectorstore = FAISS.from_documents(chunks, embedding_model)
        retriever = new_vectorstore.as_retriever(search_kwargs={"k": 5})

        # Setup prompt template
        prompt_template = (
            "\n"
            'Use the following context to answer the question. If you cannot find the answer in the context, say "I don\'t have enough information to answer this question."\n'
            "\n"
            "Context:\n"
            "{context}\n"
            "\n"
            "Question: {question}\n"
            "\n"
            "Helpful Answer:\n"
        )
        prompt = PromptTemplate(
            input_variables=["context", "question"],
            template=prompt_template,
        )

        # Initialize LLM
        llm = HuggingFaceHub(
            repo_id="google/flan-t5-base",
            model_kwargs={"temperature": 0.7, "max_new_tokens": 512},
            huggingfacehub_api_token=hf_token,
        )

        # Create RetrievalQA chain
        new_retrieval_qa = RetrievalQA.from_chain_type(
            llm=llm,
            chain_type="stuff",
            retriever=retriever,
            return_source_documents=True,
            chain_type_kwargs={"prompt": prompt},
        )

        vectorstore = new_vectorstore
        retrieval_qa = new_retrieval_qa

        return f"✅ Successfully processed {len(documents)} documents into {len(chunks)} chunks. Ready for questions!"

    except Exception as e:
        logger.error(f"PDF processing error: {e}")
        return f"❌ Error processing PDFs: {str(e)}"
263
+
264
def answer_question(question):
    """Answer a question with the current RAG chain and list its sources.

    Args:
        question: The user's question. ``None``, empty and whitespace-only
            values are rejected with a prompt to enter a question.

    Returns:
        tuple[str, str]: ``(answer, sources_text)``. For validation problems
        or runtime errors the first element is an error message and the
        second is an empty string.
    """
    # Guard against None as well as blank text so the check cannot raise.
    if not question or not question.strip():
        return "❌ Please enter a question.", ""

    if retrieval_qa is None:
        return "❌ Please upload and process PDF files first.", ""

    try:
        # Get answer from RAG system (``invoke`` replaces the deprecated
        # direct call on the chain object).
        result = retrieval_qa.invoke({"query": question})

        answer = result["result"]

        # Format source documents
        sources = []
        for i, doc in enumerate(result.get("source_documents", []), 1):
            source = doc.metadata.get("source", "Unknown")
            page = doc.metadata.get("page", "Unknown")
            text = doc.page_content
            # Keep previews short: first 200 characters plus an ellipsis.
            content_preview = text[:200] + "..." if len(text) > 200 else text

            sources.append(
                f"**Source {i}:**\n- File: {Path(source).name}\n- Page: {page}\n- Preview: {content_preview}\n"
            )

        sources_text = "\n".join(sources) if sources else "No sources found."

        return answer, sources_text

    except Exception as e:
        logger.error(f"Question answering error: {e}")
        return f"❌ Error answering question: {str(e)}", ""
296
+
297
def create_interface():
    """Build the Gradio Blocks UI and wire it to the processing functions.

    Layout: a left column with three input tabs (individual PDFs, ZIP
    upload, pre-loaded folder), the chunking sliders and a status box; a
    right column with the question box, answer and sources.

    The "Load Pre-existing PDFs" button starts enabled only when
    ``PRELOADED_PDFS`` is true at build time. After each ZIP extraction its
    ``interactive`` state is refreshed from the current ``PRELOADED_PDFS``,
    so the button the extraction message points to becomes clickable.
    The file list in the "Pre-loaded" tab is rendered once at build time and
    is not refreshed.

    Returns:
        The assembled ``gr.Blocks`` app (not yet launched).
    """
    with gr.Blocks(title="PDF RAG System", theme=gr.themes.Soft()) as demo:
        gr.Markdown("""
        # 📚 PDF Question Answering System

        Upload your PDF documents and ask questions about their content!

        **Instructions:**
        1. **Option A**: Upload individual PDF files and click "Process PDFs"
        2. **Option B**: Upload a ZIP file containing PDFs and extract them
        3. **Option C**: Use pre-loaded PDFs (if available in ./pdfs folder)
        4. Ask questions about your documents
        """)

        # Check for pre-loaded PDFs
        if PRELOADED_PDFS:
            gr.Markdown("🎉 **Pre-loaded PDFs detected!** You can use the 'Load Pre-existing PDFs' button.")

        with gr.Row():
            with gr.Column(scale=1):
                gr.Markdown("### 📄 Upload & Settings")

                with gr.Tabs():
                    with gr.TabItem("📁 Individual PDFs"):
                        pdf_files = gr.File(
                            label="Upload PDF Files",
                            file_count="multiple",
                            file_types=[".pdf"],
                            height=150,
                        )
                        process_btn = gr.Button("🔄 Process PDFs", variant="primary")

                    with gr.TabItem("🗂️ ZIP Upload"):
                        zip_file = gr.File(
                            label="Upload ZIP File (containing PDFs)",
                            file_count="single",
                            file_types=[".zip"],
                            height=100,
                        )
                        extract_btn = gr.Button("📦 Extract ZIP to PDFs Folder", variant="secondary")
                        extract_output = gr.Textbox(label="Extraction Status", lines=2)

                    with gr.TabItem("💾 Pre-loaded"):
                        if PRELOADED_PDFS:
                            pdf_list = [f for f in os.listdir(PDF_FOLDER_PATH) if f.endswith('.pdf')]
                            gr.Markdown(f"**Found {len(pdf_list)} PDF files:**")
                            for pdf in pdf_list[:10]:  # Show first 10
                                gr.Markdown(f"- {pdf}")
                            if len(pdf_list) > 10:
                                gr.Markdown(f"... and {len(pdf_list) - 10} more files")
                        else:
                            gr.Markdown("No pre-loaded PDFs found. Place PDF files in `./pdfs/` folder.")

                        preload_btn = gr.Button(
                            "📚 Load Pre-existing PDFs",
                            variant="primary",
                            interactive=PRELOADED_PDFS,
                        )

                with gr.Row():
                    chunk_size = gr.Slider(
                        minimum=200,
                        maximum=2000,
                        value=1000,
                        step=100,
                        label="Chunk Size",
                    )

                    chunk_overlap = gr.Slider(
                        minimum=0,
                        maximum=500,
                        value=200,
                        step=50,
                        label="Chunk Overlap",
                    )

                process_output = gr.Textbox(label="Processing Status", lines=4)

            with gr.Column(scale=2):
                gr.Markdown("### ❓ Ask Questions")

                question_input = gr.Textbox(
                    label="Your Question",
                    placeholder="What would you like to know about your documents?",
                    lines=2,
                )

                ask_btn = gr.Button("🤔 Ask Question", variant="secondary")

                with gr.Row():
                    with gr.Column():
                        answer_output = gr.Textbox(
                            label="Answer",
                            lines=8,
                            max_lines=15,
                        )

                    with gr.Column():
                        sources_output = gr.Textbox(
                            label="Sources",
                            lines=8,
                            max_lines=15,
                        )

        # Event handlers
        process_btn.click(
            fn=process_pdfs,
            inputs=[pdf_files, chunk_size, chunk_overlap],
            outputs=[process_output],
        )

        preload_btn.click(
            fn=load_preloaded_pdfs,
            inputs=[chunk_size, chunk_overlap],
            outputs=[process_output],
        )

        # After extraction, re-read PRELOADED_PDFS (set by extract_zip_to_pdfs)
        # so a preload button that started disabled becomes clickable.
        extract_btn.click(
            fn=extract_zip_to_pdfs,
            inputs=[zip_file],
            outputs=[extract_output],
        ).then(
            fn=lambda: gr.update(interactive=bool(PRELOADED_PDFS)),
            inputs=None,
            outputs=[preload_btn],
        )

        ask_btn.click(
            fn=answer_question,
            inputs=[question_input],
            outputs=[answer_output, sources_output],
        )

        # Pressing Enter in the question box behaves like the Ask button.
        question_input.submit(
            fn=answer_question,
            inputs=[question_input],
            outputs=[answer_output, sources_output],
        )

        # Example questions
        gr.Markdown("""
        ### 💡 Example Questions:
        - What are the main topics covered in these documents?
        - Can you summarize the key findings?
        - What data is available for [specific topic]?
        - What are the differences between [X] and [Y]?
        - What are the differences in the uninsured rate by state in 2022?
        """)

    return demo
442
+
443
if __name__ == "__main__":
    # SPACE_ID is used here as the signal that the app runs on HuggingFace
    # Spaces; anything else is treated as local development.
    on_spaces = bool(os.getenv("SPACE_ID"))

    demo = create_interface()

    if on_spaces:
        # Bind to all interfaces on port 7860, no share link.
        launch_kwargs = {
            "server_name": "0.0.0.0",
            "server_port": 7860,
            "share": False,
        }
    else:
        # Local development: default host/port with a public share link.
        launch_kwargs = {"share": True}

    demo.launch(**launch_kwargs)