msmaje committed on
Commit
6a0c640
·
verified ·
1 Parent(s): 6c0a884

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +92 -557
app.py CHANGED
@@ -11,239 +11,121 @@ logging.basicConfig(level=logging.INFO)
11
  logger = logging.getLogger(__name__)
12
 
13
  try:
14
- from langchain_community.document_loaders import PyPDFDirectoryLoader, PyPDFLoader
15
  from langchain.text_splitter import RecursiveCharacterTextSplitter
16
  from langchain_community.embeddings import HuggingFaceEmbeddings
17
  from langchain_community.vectorstores import FAISS
18
  from langchain.prompts import PromptTemplate
19
  from langchain.chains import RetrievalQA
20
- from langchain_community.llms import HuggingFaceEndpoint
21
  LANGCHAIN_AVAILABLE = True
22
  except ImportError as e:
23
  logger.error(f"LangChain import error: {e}")
24
  LANGCHAIN_AVAILABLE = False
25
 
26
- # Create PDFs folder if it doesn't exist
27
  PDF_FOLDER_PATH = "./pdfs"
28
  os.makedirs(PDF_FOLDER_PATH, exist_ok=True)
29
 
30
- # Global variables for the RAG system
31
  vectorstore = None
32
  retrieval_qa = None
33
  embedding_model = None
34
-
35
- # Check for pre-existing PDF folder
36
  PRELOADED_PDFS = os.path.exists(PDF_FOLDER_PATH) and len(os.listdir(PDF_FOLDER_PATH)) > 0
37
 
38
  def initialize_models():
39
- """Initialize the embedding model and LLM"""
40
  global embedding_model
41
-
42
  try:
43
- # Initialize embedding model
44
  embedding_model = HuggingFaceEmbeddings(
45
  model_name="sentence-transformers/all-MiniLM-L6-v2",
46
  model_kwargs={'device': 'cpu'}
47
  )
48
-
49
- # Get HuggingFace token from environment
50
  hf_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")
51
  if not hf_token:
52
- return False, "❌ HuggingFace API token not found in environment variables"
53
-
54
- return True, "✅ Models initialized successfully"
55
-
56
  except Exception as e:
57
- logger.error(f"Model initialization error: {e}")
58
- return False, f"❌ Error initializing models: {str(e)}"
59
 
60
  def create_llm():
61
- """Create and return the LLM instance"""
62
  hf_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")
63
-
64
- llm = HuggingFaceEndpoint(
65
- repo_id="google/flan-t5-base",
66
- temperature=0.7,
67
- max_new_tokens=512,
68
- huggingfacehub_api_token=hf_token
69
- )
70
-
71
- return llm
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
72
 
73
  def load_preloaded_pdfs(chunk_size=1000, chunk_overlap=200):
74
- """Load PDFs from the pre-existing folder"""
75
  global vectorstore, retrieval_qa, embedding_model
76
-
77
  if not LANGCHAIN_AVAILABLE:
78
- return "❌ LangChain is not available. Please check the installation."
79
-
80
  if not PRELOADED_PDFS:
81
- return "❌ No pre-loaded PDFs found in ./pdfs folder."
82
-
83
  try:
84
- # Initialize models if not already done
85
  if embedding_model is None:
86
- success, message = initialize_models()
87
  if not success:
88
- return message
89
-
90
- # Load documents from pre-existing folder
91
  loader = PyPDFDirectoryLoader(PDF_FOLDER_PATH)
92
  documents = loader.load()
93
-
94
  if not documents:
95
- return "❌ No documents were loaded from the PDFs folder."
96
-
97
- # Split documents into chunks
98
- text_splitter = RecursiveCharacterTextSplitter(
99
- chunk_size=int(chunk_size),
100
- chunk_overlap=int(chunk_overlap)
101
- )
102
- chunks = text_splitter.split_documents(documents)
103
-
104
- # Create vector store
105
- vectorstore = FAISS.from_documents(chunks, embedding_model)
106
- retriever = vectorstore.as_retriever(search_kwargs={"k": 5})
107
-
108
- # Setup prompt template
109
- prompt_template = """
110
- Use the following context to answer the question. If you cannot find the answer in the context, say "I don't have enough information to answer this question."
111
-
112
- Context:
113
- {context}
114
 
115
- Question: {question}
116
-
117
- Helpful Answer:
118
- """
119
- prompt = PromptTemplate(
120
- input_variables=["context", "question"],
121
- template=prompt_template
122
- )
123
-
124
- # Initialize LLM using the new function
125
- llm = create_llm()
126
-
127
- # Create RetrievalQA chain
128
- retrieval_qa = RetrievalQA.from_chain_type(
129
- llm=llm,
130
- chain_type="stuff",
131
- retriever=retriever,
132
- return_source_documents=True,
133
- chain_type_kwargs={"prompt": prompt}
134
  )
135
-
136
- pdf_files = [f for f in os.listdir(PDF_FOLDER_PATH) if f.endswith('.pdf')]
137
- return f"✅ Successfully processed {len(documents)} documents from {len(pdf_files)} PDF files into {len(chunks)} chunks. Ready for questions!"
138
-
139
- except Exception as e:
140
- logger.error(f"Pre-loaded PDF processing error: {e}")
141
- return f"❌ Error processing pre-loaded PDFs: {str(e)}"
142
-
143
- def extract_zip_to_pdfs(zip_file):
144
- """Extract uploaded ZIP file to PDFs folder"""
145
- if not zip_file:
146
- return "❌ Please upload a ZIP file."
147
-
148
- try:
149
- # Create PDFs directory if it doesn't exist
150
- os.makedirs(PDF_FOLDER_PATH, exist_ok=True)
151
-
152
- # Extract ZIP file
153
- with zipfile.ZipFile(zip_file.name, 'r') as zip_ref:
154
- # Extract only PDF files
155
- pdf_files = [f for f in zip_ref.namelist() if f.lower().endswith('.pdf')]
156
-
157
- if not pdf_files:
158
- return "❌ No PDF files found in the ZIP archive."
159
-
160
- for pdf_file in pdf_files:
161
- # Extract to PDFs folder
162
- zip_ref.extract(pdf_file, PDF_FOLDER_PATH)
163
-
164
- # If file is in a subfolder, move it to the root of PDFs folder
165
- extracted_path = os.path.join(PDF_FOLDER_PATH, pdf_file)
166
- if os.path.dirname(pdf_file): # File is in a subfolder
167
- new_path = os.path.join(PDF_FOLDER_PATH, os.path.basename(pdf_file))
168
- shutil.move(extracted_path, new_path)
169
- # Clean up empty directories
170
- try:
171
- os.rmdir(os.path.dirname(extracted_path))
172
- except:
173
- pass
174
-
175
- global PRELOADED_PDFS
176
- PRELOADED_PDFS = True
177
-
178
- return f"✅ Successfully extracted {len(pdf_files)} PDF files. Now click 'Load Pre-existing PDFs' to process them."
179
-
180
- except Exception as e:
181
- return f"❌ Error extracting ZIP file: {str(e)}"
182
-
183
- def process_pdfs(pdf_files, chunk_size, chunk_overlap):
184
- """Process uploaded PDF files and create vector store"""
185
- global vectorstore, retrieval_qa, embedding_model
186
-
187
- if not LANGCHAIN_AVAILABLE:
188
- return "❌ LangChain is not available. Please check the installation."
189
-
190
- if not pdf_files:
191
- return "❌ Please upload at least one PDF file or use pre-loaded PDFs."
192
-
193
- try:
194
- # Initialize models if not already done
195
- if embedding_model is None:
196
- success, message = initialize_models()
197
- if not success:
198
- return message
199
-
200
- # Create temporary directory for PDFs
201
- temp_dir = tempfile.mkdtemp()
202
-
203
- # Save uploaded files to temp directory
204
- for pdf_file in pdf_files:
205
- if pdf_file is not None:
206
- temp_path = os.path.join(temp_dir, os.path.basename(pdf_file.name))
207
- shutil.copy2(pdf_file.name, temp_path)
208
-
209
- # Load documents
210
- loader = PyPDFDirectoryLoader(temp_dir)
211
- documents = loader.load()
212
-
213
- if not documents:
214
- return "❌ No documents were loaded. Please check your PDF files."
215
-
216
- # Split documents into chunks
217
- text_splitter = RecursiveCharacterTextSplitter(
218
- chunk_size=int(chunk_size),
219
- chunk_overlap=int(chunk_overlap)
220
- )
221
- chunks = text_splitter.split_documents(documents)
222
-
223
- # Create vector store
224
  vectorstore = FAISS.from_documents(chunks, embedding_model)
225
  retriever = vectorstore.as_retriever(search_kwargs={"k": 5})
226
-
227
- # Setup prompt template
228
  prompt_template = """
229
- Use the following context to answer the question. If you cannot find the answer in the context, say "I don't have enough information to answer this question."
230
 
231
  Context:
232
  {context}
233
 
234
  Question: {question}
235
 
236
- Helpful Answer:
237
  """
238
  prompt = PromptTemplate(
239
- input_variables=["context", "question"],
240
  template=prompt_template
241
  )
242
-
243
- # Initialize LLM using the new function
244
  llm = create_llm()
245
-
246
- # Create RetrievalQA chain
247
  retrieval_qa = RetrievalQA.from_chain_type(
248
  llm=llm,
249
  chain_type="stuff",
@@ -251,412 +133,65 @@ Helpful Answer:
251
  return_source_documents=True,
252
  chain_type_kwargs={"prompt": prompt}
253
  )
254
-
255
- # Clean up temp directory
256
- shutil.rmtree(temp_dir)
257
-
258
- return f"✅ Successfully processed {len(documents)} documents into {len(chunks)} chunks. Ready for questions!"
259
-
260
  except Exception as e:
261
- logger.error(f"PDF processing error: {e}")
262
- return f"❌ Error processing PDFs: {str(e)}"
263
 
264
  def answer_question(question):
265
- """Answer a question using the RAG system"""
266
  global retrieval_qa
267
-
268
  if not question.strip():
269
- return "❌ Please enter a question.", ""
270
-
271
  if retrieval_qa is None:
272
- return "❌ Please upload and process PDF files first.", ""
273
-
274
  try:
275
- # Get answer from RAG system
276
  result = retrieval_qa({"query": question})
277
-
278
- answer = result["result"]
279
-
280
- # Format source documents
281
  sources = []
282
  for i, doc in enumerate(result.get("source_documents", []), 1):
283
  source = doc.metadata.get("source", "Unknown")
284
  page = doc.metadata.get("page", "Unknown")
285
- content_preview = doc.page_content[:200] + "..." if len(doc.page_content) > 200 else doc.page_content
286
-
287
- sources.append(f"**Source {i}:**\n- File: {Path(source).name}\n- Page: {page}\n- Preview: {content_preview}\n")
288
-
289
- sources_text = "\n".join(sources) if sources else "No sources found."
290
-
291
- return answer, sources_text
292
-
293
  except Exception as e:
294
- logger.error(f"Question answering error: {e}")
295
- return f"❌ Error answering question: {str(e)}", ""
296
-
297
- def get_device_info():
298
- """Simple function to detect if mobile (basic detection)"""
299
- return """
300
- <script>
301
- function isMobile() {
302
- return window.innerWidth <= 768;
303
- }
304
-
305
- function adjustLayout() {
306
- const isMob = isMobile();
307
- const root = document.documentElement;
308
- if (isMob) {
309
- root.style.setProperty('--mobile-mode', '1');
310
- } else {
311
- root.style.setProperty('--mobile-mode', '0');
312
- }
313
- }
314
-
315
- window.addEventListener('resize', adjustLayout);
316
- adjustLayout();
317
- </script>
318
- """
319
 
320
  def create_interface():
321
- """Create the fully responsive Gradio interface"""
322
-
323
- # Custom CSS for better responsiveness
324
- custom_css = """
325
- /* Base responsive styles */
326
- .gradio-container {
327
- max-width: 100% !important;
328
- margin: 0 auto;
329
- padding: 10px;
330
- }
331
-
332
- /* Mobile-first responsive design */
333
- @media (max-width: 768px) {
334
- .gradio-container {
335
- padding: 5px;
336
- }
337
-
338
- /* Stack elements vertically on mobile */
339
- .gr-row {
340
- flex-direction: column !important;
341
- gap: 10px !important;
342
- }
343
-
344
- /* Full width on mobile */
345
- .gr-column {
346
- width: 100% !important;
347
- min-width: 100% !important;
348
- }
349
-
350
- /* Adjust component spacing */
351
- .gr-form > * {
352
- margin-bottom: 8px !important;
353
- }
354
-
355
- /* Better button sizing */
356
- .gr-button {
357
- width: 100% !important;
358
- min-height: 44px !important;
359
- font-size: 14px !important;
360
- }
361
-
362
- /* Text input improvements */
363
- .gr-textbox textarea {
364
- min-height: 60px !important;
365
- font-size: 16px !important; /* Prevents zoom on iOS */
366
- }
367
-
368
- /* File upload improvements */
369
- .gr-file {
370
- min-height: 100px !important;
371
- }
372
-
373
- /* Slider improvements */
374
- .gr-slider {
375
- margin: 10px 0 !important;
376
- }
377
-
378
- /* Tab improvements */
379
- .gr-tab-nav {
380
- flex-wrap: wrap !important;
381
- }
382
-
383
- .gr-tab-nav > button {
384
- flex: 1 1 auto !important;
385
- min-width: 80px !important;
386
- font-size: 12px !important;
387
- }
388
- }
389
-
390
- /* Tablet styles */
391
- @media (min-width: 769px) and (max-width: 1024px) {
392
- .gradio-container {
393
- padding: 15px;
394
- }
395
-
396
- .gr-button {
397
- min-height: 40px !important;
398
- }
399
- }
400
-
401
- /* Desktop styles */
402
- @media (min-width: 1025px) {
403
- .gradio-container {
404
- max-width: 1400px;
405
- padding: 20px;
406
- }
407
- }
408
-
409
- /* Improve readability */
410
- .gr-markdown h1 {
411
- font-size: clamp(1.5rem, 4vw, 2.5rem) !important;
412
- line-height: 1.2 !important;
413
- margin-bottom: 1rem !important;
414
- }
415
-
416
- .gr-markdown h3 {
417
- font-size: clamp(1.1rem, 3vw, 1.4rem) !important;
418
- margin: 1rem 0 0.5rem 0 !important;
419
- }
420
-
421
- .gr-markdown p, .gr-markdown li {
422
- font-size: clamp(0.9rem, 2.5vw, 1rem) !important;
423
- line-height: 1.5 !important;
424
- }
425
-
426
- /* Status text improvements */
427
- .gr-textbox[data-testid="textbox"] {
428
- font-family: monospace !important;
429
- font-size: clamp(0.8rem, 2vw, 0.9rem) !important;
430
- }
431
-
432
- /* Accessibility improvements */
433
- .gr-button:focus,
434
- .gr-textbox:focus,
435
- .gr-file:focus {
436
- outline: 2px solid #2563eb !important;
437
- outline-offset: 2px !important;
438
- }
439
-
440
- /* Dark mode considerations */
441
- @media (prefers-color-scheme: dark) {
442
- .gr-button {
443
- border: 1px solid #374151 !important;
444
- }
445
- }
446
- """
447
-
448
- with gr.Blocks(
449
- title="PDF RAG System",
450
- theme=gr.themes.Soft(),
451
- css=custom_css
452
- ) as demo:
453
-
454
- # Add device detection script
455
- gr.HTML(get_device_info())
456
-
457
- gr.Markdown("""
458
- # 📚 PDF Question Answering System
459
-
460
- Upload your PDF documents and ask questions about their content!
461
-
462
- **Quick Start:**
463
- 1. Upload PDFs or use pre-loaded ones
464
- 2. Click Process to prepare your documents
465
- 3. Ask questions about the content
466
- """)
467
-
468
- # Check for pre-loaded PDFs
469
- if PRELOADED_PDFS:
470
- gr.Markdown("""
471
- <div style="background: linear-gradient(90deg, #10b981, #059669);
472
- color: white; padding: 12px; border-radius: 8px; margin: 10px 0;">
473
- 🎉 <strong>Pre-loaded PDFs detected!</strong> Use the 'Load Pre-existing PDFs' button to get started quickly.
474
- </div>
475
- """)
476
-
477
- # Main layout - responsive columns
478
  with gr.Row():
479
- # Left column - Upload & Settings (collapses to full width on mobile)
480
- with gr.Column(scale=1, min_width=300):
481
- gr.Markdown("### 📄 Document Management")
482
-
483
- with gr.Tabs():
484
- with gr.TabItem("📁 Upload PDFs"):
485
- pdf_files = gr.File(
486
- label="Select PDF Files",
487
- file_count="multiple",
488
- file_types=[".pdf"],
489
- height=120
490
- )
491
- process_btn = gr.Button(
492
- "🔄 Process PDFs",
493
- variant="primary",
494
- size="lg"
495
- )
496
-
497
- with gr.TabItem("🗂️ ZIP Upload"):
498
- zip_file = gr.File(
499
- label="Upload ZIP (with PDFs)",
500
- file_count="single",
501
- file_types=[".zip"],
502
- height=80
503
- )
504
- extract_btn = gr.Button(
505
- "📦 Extract ZIP",
506
- variant="secondary",
507
- size="lg"
508
- )
509
- extract_output = gr.Textbox(
510
- label="Extraction Status",
511
- lines=2,
512
- max_lines=3
513
- )
514
-
515
- with gr.TabItem("💾 Pre-loaded"):
516
- if PRELOADED_PDFS:
517
- pdf_list = [f for f in os.listdir(PDF_FOLDER_PATH) if f.endswith('.pdf')]
518
- gr.Markdown(f"**Found {len(pdf_list)} PDF files**")
519
-
520
- # Show files in a more mobile-friendly way
521
- if len(pdf_list) <= 5:
522
- for pdf in pdf_list:
523
- gr.Markdown(f"📄 {pdf}")
524
- else:
525
- for pdf in pdf_list[:3]:
526
- gr.Markdown(f"📄 {pdf}")
527
- gr.Markdown(f"*... and {len(pdf_list) - 3} more files*")
528
- else:
529
- gr.Markdown("No pre-loaded PDFs found.")
530
-
531
- preload_btn = gr.Button(
532
- "📚 Load Pre-existing PDFs",
533
- variant="primary",
534
- size="lg",
535
- interactive=PRELOADED_PDFS
536
- )
537
-
538
- # Settings section - collapsible on mobile
539
- with gr.Accordion("⚙️ Advanced Settings", open=False):
540
- chunk_size = gr.Slider(
541
- minimum=200,
542
- maximum=2000,
543
- value=1000,
544
- step=100,
545
- label="Chunk Size",
546
- info="Larger chunks = more context, smaller = more precise"
547
- )
548
-
549
- chunk_overlap = gr.Slider(
550
- minimum=0,
551
- maximum=500,
552
- value=200,
553
- step=50,
554
- label="Chunk Overlap",
555
- info="Overlap between text chunks"
556
- )
557
-
558
- # Status display
559
- process_output = gr.Textbox(
560
- label="📊 Processing Status",
561
- lines=3,
562
- max_lines=5,
563
- placeholder="Status updates will appear here..."
564
- )
565
-
566
- # Right column - Q&A Section (collapses to full width on mobile)
567
- with gr.Column(scale=2, min_width=400):
568
- gr.Markdown("### ❓ Ask Questions")
569
-
570
- question_input = gr.Textbox(
571
- label="Your Question",
572
- placeholder="What would you like to know about your documents?",
573
- lines=2,
574
- max_lines=4
575
- )
576
-
577
- ask_btn = gr.Button(
578
- "🤔 Ask Question",
579
- variant="secondary",
580
- size="lg"
581
- )
582
-
583
- # Results section - stack vertically on mobile
584
- with gr.Row():
585
- answer_output = gr.Textbox(
586
- label="💡 Answer",
587
- lines=6,
588
- max_lines=12,
589
- placeholder="Your answer will appear here..."
590
- )
591
-
592
- sources_output = gr.Textbox(
593
- label="📚 Sources",
594
- lines=6,
595
- max_lines=12,
596
- placeholder="Source references will appear here..."
597
- )
598
-
599
- # Event handlers (unchanged)
600
  process_btn.click(
601
- fn=process_pdfs,
602
- inputs=[pdf_files, chunk_size, chunk_overlap],
603
- outputs=[process_output]
604
- )
605
-
606
- preload_btn.click(
607
  fn=load_preloaded_pdfs,
608
  inputs=[chunk_size, chunk_overlap],
609
  outputs=[process_output]
610
  )
611
-
612
- extract_btn.click(
613
- fn=extract_zip_to_pdfs,
614
- inputs=[zip_file],
615
- outputs=[extract_output]
616
- )
617
-
618
  ask_btn.click(
619
  fn=answer_question,
620
- inputs=[question_input],
621
- outputs=[answer_output, sources_output]
622
- )
623
-
624
- question_input.submit(
625
- fn=answer_question,
626
- inputs=[question_input],
627
- outputs=[answer_output, sources_output]
628
  )
629
-
630
- # Example questions - more mobile-friendly
631
- with gr.Accordion("💡 Example Questions", open=False):
632
- gr.Markdown("""
633
- **Try asking:**
634
- - What are the main topics in these documents?
635
- - Can you summarize the key findings?
636
- - What data is available for [specific topic]?
637
- - What are the differences between X and Y?
638
- """)
639
-
640
- # Footer with helpful info
641
- gr.Markdown("""
642
- ---
643
- <div style="text-align: center; color: #666; font-size: 0.9em;">
644
- 💡 <strong>Tip:</strong> For best results, ask specific questions about your documents
645
- </div>
646
- """)
647
-
648
  return demo
649
 
650
  if __name__ == "__main__":
651
- # Check if running on HuggingFace Spaces
652
- if os.getenv("SPACE_ID"):
653
- demo = create_interface()
654
- demo.launch(
655
- server_name="0.0.0.0",
656
- server_port=7860,
657
- share=False
658
- )
659
- else:
660
- # Local development
661
- demo = create_interface()
662
- demo.launch(share=True)
 
11
logger = logging.getLogger(__name__)

# LangChain is optional at import time: the UI still loads (and reports a
# clear error) when the dependency stack is missing or broken.
try:
    from langchain_community.document_loaders import PyPDFDirectoryLoader
    from langchain.text_splitter import RecursiveCharacterTextSplitter
    from langchain_community.embeddings import HuggingFaceEmbeddings
    from langchain_community.vectorstores import FAISS
    from langchain.prompts import PromptTemplate
    from langchain.chains import RetrievalQA
    from langchain.llms import HuggingFaceHub
    LANGCHAIN_AVAILABLE = True
except ImportError as e:
    logger.error(f"LangChain import error: {e}")
    LANGCHAIN_AVAILABLE = False

# Folder scanned for bundled PDF documents; created eagerly so later
# listdir/loader calls never hit a missing directory.
PDF_FOLDER_PATH = "./pdfs"
os.makedirs(PDF_FOLDER_PATH, exist_ok=True)

# Lazily-initialized RAG state shared across Gradio callbacks.
vectorstore = None
retrieval_qa = None
embedding_model = None

# True when the pdfs folder already ships with at least one PDF.
# Count only *.pdf files so stray artifacts (e.g. .gitkeep) don't make
# the loader claim documents exist.
PRELOADED_PDFS = any(
    f.lower().endswith(".pdf") for f in os.listdir(PDF_FOLDER_PATH)
)
def initialize_models():
    """Initialize the shared sentence-transformer embedding model.

    Returns:
        tuple[bool, str]: (success flag, human-readable status message
        suitable for direct display in the UI).
    """
    global embedding_model

    # Fail fast on a missing API token *before* downloading the embedding
    # model: the hosted LLM created later cannot work without the token,
    # so there is no point paying the model-download cost first.
    hf_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")
    if not hf_token:
        return False, "❌ HuggingFace API token not found"

    try:
        embedding_model = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2",
            model_kwargs={'device': 'cpu'}
        )
        return True, "✅ Models initialized"
    except Exception as e:
        logger.error(f"Init error: {e}")
        # Keep the ❌ prefix consistent with every other status message.
        return False, f"❌ {e}"
  def create_llm():
 
52
  hf_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")
53
+ if not hf_token:
54
+ return create_fallback_llm()
55
+
56
+ models_to_try = [
57
+ "mistralai/Mistral-7B-Instruct-v0.2",
58
+ "google/flan-t5-base"
59
+ ]
60
+
61
+ for model_id in models_to_try:
62
+ try:
63
+ llm = HuggingFaceHub(
64
+ repo_id=model_id,
65
+ huggingfacehub_api_token=hf_token,
66
+ model_kwargs={
67
+ "temperature": 0.7,
68
+ "max_length": 512,
69
+ "top_p": 0.9,
70
+ "top_k": 50
71
+ }
72
+ )
73
+ return llm
74
+ except Exception as e:
75
+ logger.warning(f"Model {model_id} failed: {e}")
76
+ return create_fallback_llm()
77
+
78
def create_fallback_llm():
    """Return a stub LLM used when no hosted model can be reached.

    The stub mimics the minimal LangChain LLM calling conventions
    (``__call__``, ``invoke`` and ``predict``) and always answers with a
    fixed unavailability notice, so the rest of the pipeline degrades
    gracefully instead of crashing.

    Returns:
        FallbackLLM: an object answering every prompt with a fixed message.
    """
    class FallbackLLM:
        # Single canonical reply; defined once so all entry points agree.
        _MESSAGE = "Model is unavailable. Try again later."

        def __call__(self, prompt, **kwargs):
            # The prompt is ignored: there is no model to forward it to.
            return self._MESSAGE

        def invoke(self, prompt, **kwargs):
            return self.__call__(prompt)

        def predict(self, prompt, **kwargs):
            return self.__call__(prompt)

    return FallbackLLM()
86
def load_preloaded_pdfs(chunk_size=1000, chunk_overlap=200):
    """Build the RAG pipeline from PDFs already present in ./pdfs.

    Args:
        chunk_size: target characters per text chunk (Gradio sliders may
            pass floats; values are cast to int).
        chunk_overlap: characters of overlap between consecutive chunks.

    Returns:
        str: human-readable status message for the UI.
    """
    global vectorstore, retrieval_qa, embedding_model

    if not LANGCHAIN_AVAILABLE:
        return "❌ LangChain not available"
    if not PRELOADED_PDFS:
        return "❌ No PDFs found"

    try:
        # Lazily initialize the embedding model on first use.
        if embedding_model is None:
            success, msg = initialize_models()
            if not success:
                return msg

        loader = PyPDFDirectoryLoader(PDF_FOLDER_PATH)
        documents = loader.load()
        if not documents:
            return "❌ No documents loaded"

        # Sliders deliver floats; the splitter expects integer sizes.
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=int(chunk_size), chunk_overlap=int(chunk_overlap)
        )
        chunks = splitter.split_documents(documents)

        vectorstore = FAISS.from_documents(chunks, embedding_model)
        retriever = vectorstore.as_retriever(search_kwargs={"k": 5})

        prompt_template = """
        Use the following context to answer the question. If you cannot find the answer, say so.

        Context:
        {context}

        Question: {question}

        Answer:
        """
        prompt = PromptTemplate(
            input_variables=["context", "question"],
            template=prompt_template
        )

        llm = create_llm()
        retrieval_qa = RetrievalQA.from_chain_type(
            llm=llm,
            chain_type="stuff",
            retriever=retriever,
            return_source_documents=True,
            chain_type_kwargs={"prompt": prompt}
        )

        return f"✅ {len(documents)} docs loaded, {len(chunks)} chunks"
    except Exception as e:
        # Log for the operator and keep the ❌ prefix the UI uses elsewhere.
        logger.error(f"Pre-loaded PDF processing error: {e}")
        return f"❌ Error: {str(e)}"
141
def answer_question(question):
    """Answer a question against the indexed PDFs via the RetrievalQA chain.

    Args:
        question: free-text user question.

    Returns:
        tuple[str, str]: (answer text, formatted source citations).
    """
    global retrieval_qa

    # Guard clauses: empty input and not-yet-built pipeline.
    if not question.strip():
        return "❌ Enter a question", ""
    if retrieval_qa is None:
        return "❌ Process documents first", ""

    try:
        result = retrieval_qa({"query": question})
        answer = result.get("result", "No answer")

        # Cite each retrieved chunk: file name, page, and a short preview.
        sources = []
        for i, doc in enumerate(result.get("source_documents", []), 1):
            source = doc.metadata.get("source", "Unknown")
            page = doc.metadata.get("page", "Unknown")
            preview = doc.page_content[:200] + "..." if len(doc.page_content) > 200 else doc.page_content
            sources.append(f"**Source {i}:** {Path(source).name} (Page {page})\n{preview}")

        return answer, "\n\n".join(sources)
    except Exception as e:
        # Log for the operator; ❌ prefix keeps error styling consistent
        # with the other callbacks.
        logger.error(f"Question answering error: {e}")
        return f"❌ Error: {str(e)}", ""
163
def create_interface():
    """Build the Gradio Blocks UI for the PDF question-answering app.

    Returns:
        gr.Blocks: the assembled (unlaunched) Gradio app.
    """

    def _process(files, chunk_size, chunk_overlap):
        # The processing pipeline only reads from PDF_FOLDER_PATH, so copy
        # any uploaded PDFs there first — previously the upload widget was
        # silently ignored and only pre-bundled PDFs were indexed.
        import shutil
        global PRELOADED_PDFS
        if files:
            for f in files:
                dest = os.path.join(PDF_FOLDER_PATH, os.path.basename(f.name))
                shutil.copy2(f.name, dest)
            PRELOADED_PDFS = True
        return load_preloaded_pdfs(chunk_size, chunk_overlap)

    with gr.Blocks(title="RAG PDF QA") as demo:
        gr.Markdown("## PDF QA with LangChain + HuggingFaceHub")

        with gr.Row():
            # Left column: document ingestion controls.
            with gr.Column():
                pdf_files = gr.File(file_types=[".pdf"], file_count="multiple", label="Upload PDFs")
                chunk_size = gr.Slider(200, 2000, value=1000, label="Chunk Size")
                chunk_overlap = gr.Slider(0, 500, value=200, label="Chunk Overlap")
                process_btn = gr.Button("🔄 Process PDFs")
                process_output = gr.Textbox(label="Processing Result")

            # Right column: question answering.
            with gr.Column():
                question = gr.Textbox(label="Ask a Question")
                ask_btn = gr.Button("🤔 Ask")
                answer = gr.Textbox(label="Answer")
                sources = gr.Textbox(label="Sources")

        process_btn.click(
            fn=_process,
            inputs=[pdf_files, chunk_size, chunk_overlap],
            outputs=[process_output]
        )

        ask_btn.click(
            fn=answer_question,
            inputs=[question],
            outputs=[answer, sources]
        )

    return demo
195
if __name__ == "__main__":
    # Entry point: build the UI and expose it with a public share link.
    application = create_interface()
    application.launch(share=True)