SimranShaikh committed on
Commit
52bcdc8
·
verified ·
1 Parent(s): 6b92160
Files changed (1) hide show
  1. app.py +412 -48
app.py CHANGED
@@ -1,64 +1,428 @@
 
1
  import gradio as gr
2
- from huggingface_hub import InferenceClient
 
 
 
 
 
 
 
 
 
 
3
 
4
- """
5
- For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
6
- """
7
- client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
8
 
 
 
 
9
 
10
- def respond(
11
- message,
12
- history: list[tuple[str, str]],
13
- system_message,
14
- max_tokens,
15
- temperature,
16
- top_p,
17
- ):
18
- messages = [{"role": "system", "content": system_message}]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
 
20
- for val in history:
21
- if val[0]:
22
- messages.append({"role": "user", "content": val[0]})
23
- if val[1]:
24
- messages.append({"role": "assistant", "content": val[1]})
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
 
26
- messages.append({"role": "user", "content": message})
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
 
28
- response = ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
 
30
- for message in client.chat_completion(
31
- messages,
32
- max_tokens=max_tokens,
33
- stream=True,
34
- temperature=temperature,
35
- top_p=top_p,
36
- ):
37
- token = message.choices[0].delta.content
 
 
 
 
 
 
 
38
 
39
- response += token
40
- yield response
 
 
 
 
 
 
 
 
 
 
 
41
 
 
 
 
 
 
 
 
 
 
 
 
 
42
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43
  """
44
- For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
  """
46
- demo = gr.ChatInterface(
47
- respond,
48
- additional_inputs=[
49
- gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
50
- gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
51
- gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
52
- gr.Slider(
53
- minimum=0.1,
54
- maximum=1.0,
55
- value=0.95,
56
- step=0.05,
57
- label="Top-p (nucleus sampling)",
58
- ),
59
- ],
60
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
62
 
63
  if __name__ == "__main__":
64
- demo.launch()
 
1
+ # app.py - Main Hugging Face Spaces Application
2
  import gradio as gr
3
+ import PyPDF2
4
+ import pdfplumber
5
+ import fitz # PyMuPDF
6
+ import pandas as pd
7
+ import re
8
+ import logging
9
+ import os
10
+ import tempfile
11
+ from typing import Dict, List, Tuple, Optional
12
+ from pathlib import Path
13
+ import json
14
 
15
+ # Set up logging
16
+ logging.basicConfig(level=logging.INFO)
17
+ logger = logging.getLogger(__name__)
 
18
 
19
class PDFProcessorError(Exception):
    """Error raised by the extraction helpers when a PDF backend fails."""
22
 
23
def enhanced_pdf_processor(file_path: str) -> Dict:
    """
    Extract text from the PDF at *file_path* using the first backend that works.

    Backends are attempted in the order PyMuPDF, pdfplumber, PyPDF2. A backend
    "works" when it returns text that is longer than 10 characters after
    stripping; a backend that raises is logged and skipped.

    Returns a dict with the keys ``text``, ``tables``, ``metadata``,
    ``extraction_method``, ``success``, ``error``, ``file_info`` and
    ``summary``. The function does not raise: any failure is reported through
    ``success`` / ``error``.
    """
    report = {
        'text': '',
        'tables': [],
        'metadata': {},
        'extraction_method': 'unknown',
        'success': False,
        'error': None,
        'file_info': {},
        'summary': ''
    }

    try:
        # Nothing to do for a missing file
        if not os.path.exists(file_path):
            report['error'] = f"File does not exist: {file_path}"
            return report

        report['file_info'] = get_file_info(file_path)

        # (backend name, extractor, report key that receives the extractor's
        # second return value -- None when the extractor returns text only)
        backends = (
            ('PyMuPDF', extract_with_pymupdf, 'metadata'),
            ('pdfplumber', extract_with_pdfplumber, 'tables'),
            ('PyPDF2', extract_with_pypdf2, None),
        )

        for backend, extractor, extra_key in backends:
            try:
                logger.info(f"Trying extraction method: {backend}")

                produced = extractor(file_path)
                if extra_key is None:
                    content, extra = produced, None
                else:
                    content, extra = produced

                # Too little text: fall through to the next backend
                if not content or len(content.strip()) <= 10:
                    continue

                report['text'] = content
                if extra_key is not None:
                    report[extra_key] = extra
                report['extraction_method'] = backend
                report['success'] = True
                break

            except Exception as exc:
                logger.warning(f"{backend} failed: {str(exc)}")
                continue

        # Summarise on success, otherwise record the overall failure
        if report['success']:
            report['summary'] = generate_document_summary(report['text'])
        else:
            report['error'] = "All extraction methods failed"

    except Exception as exc:
        report['error'] = f"Processing error: {str(exc)}"
        logger.error(f"PDF processing error: {exc}")

    return report
99
 
100
def extract_with_pypdf2(file_path: str) -> str:
    """
    Extract text using PyPDF2.

    Each page that yields text is prefixed with a ``--- Page N ---`` marker and
    the combined text is passed through ``clean_text``. Pages that fail to
    extract are logged and skipped.

    Raises:
        PDFProcessorError: if the PDF is encrypted and cannot be opened with an
            empty password, or if reading the file fails.
    """
    page_chunks = []
    try:
        with open(file_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)

            if reader.is_encrypted:
                try:
                    unlocked = reader.decrypt("")
                except Exception as e:
                    raise PDFProcessorError("PDF is encrypted") from e
                # decrypt() reports a wrong password through a falsy return
                # value rather than an exception, so check it explicitly.
                if not unlocked:
                    raise PDFProcessorError("PDF is encrypted")

            for page_num, page in enumerate(reader.pages):
                try:
                    page_text = page.extract_text()
                    if page_text:
                        page_chunks.append(f"\n--- Page {page_num + 1} ---\n{page_text}\n")
                except Exception as e:
                    logger.warning(f"Failed to extract page {page_num + 1}: {e}")

            return clean_text("".join(page_chunks))

    except PDFProcessorError:
        # Already a domain error with a precise message; do not re-wrap it
        raise
    except Exception as e:
        raise PDFProcessorError(f"PyPDF2 extraction failed: {e}") from e
125
 
126
def extract_with_pdfplumber(file_path: str) -> Tuple[str, List[Dict]]:
    """
    Extract page text and tables using pdfplumber.

    Returns a ``(text, tables)`` pair. ``text`` is the cleaned concatenation of
    every page's text, each prefixed by a ``--- Page N ---`` marker. ``tables``
    holds one dict per table with more than one row, with the keys ``page``,
    ``table_number``, ``data`` and ``text_representation``.

    Raises:
        PDFProcessorError: if the document cannot be opened or iterated.
    """
    chunks = []
    found_tables = []

    try:
        with pdfplumber.open(file_path) as pdf:
            for page_no, page in enumerate(pdf.pages, start=1):
                try:
                    # Page text
                    content = page.extract_text()
                    if content:
                        chunks.append(f"\n--- Page {page_no} ---\n{content}\n")

                    # Page tables (header-only / single-row tables are ignored)
                    for table_no, grid in enumerate(page.extract_tables(), start=1):
                        if grid and len(grid) > 1:
                            found_tables.append({
                                'page': page_no,
                                'table_number': table_no,
                                'data': grid,
                                'text_representation': table_to_text(grid)
                            })

                except Exception as e:
                    logger.warning(f"Failed to process page {page_no}: {e}")

            return clean_text("".join(chunks)), found_tables

    except Exception as e:
        raise PDFProcessorError(f"pdfplumber extraction failed: {e}")
158
 
159
def extract_with_pymupdf(file_path: str) -> Tuple[str, Dict]:
    """
    Extract text and document metadata using PyMuPDF.

    Returns a ``(text, metadata)`` pair. ``text`` is the cleaned concatenation
    of every page's text, each prefixed by a ``--- Page N ---`` marker.
    ``metadata`` contains ``page_count`` plus ``title``, ``author``,
    ``subject``, ``creator`` and ``creation_date`` when they can be read.

    Raises:
        PDFProcessorError: if the document cannot be opened or processed.
    """
    page_chunks = []

    try:
        doc = fitz.open(file_path)
        try:
            # Extract metadata; fall back to the page count alone on failure
            try:
                doc_metadata = doc.metadata or {}
                metadata = {
                    'page_count': doc.page_count,
                    'title': doc_metadata.get('title', ''),
                    'author': doc_metadata.get('author', ''),
                    'subject': doc_metadata.get('subject', ''),
                    'creator': doc_metadata.get('creator', ''),
                    'creation_date': doc_metadata.get('creationDate', '')
                }
            except Exception as e:
                logger.warning(f"Failed to read PDF metadata: {e}")
                metadata = {'page_count': doc.page_count}

            # Extract text page by page so one bad page does not lose the rest
            for page_num in range(doc.page_count):
                try:
                    page_text = doc[page_num].get_text()
                    if page_text:
                        page_chunks.append(f"\n--- Page {page_num + 1} ---\n{page_text}\n")
                except Exception as e:
                    logger.warning(f"Failed to extract page {page_num + 1}: {e}")
        finally:
            # Always release the document handle, even when extraction raises
            doc.close()

        return clean_text("".join(page_chunks)), metadata

    except Exception as e:
        raise PDFProcessorError(f"PyMuPDF extraction failed: {e}") from e
196
 
197
def clean_text(text: str) -> str:
    """
    Normalise whitespace in extracted text and drop junk characters.

    Blank-line runs collapse to a single empty line, runs of spaces collapse to
    one space, then U+FFFD, NUL and U+200B are removed and the result is
    stripped. Falsy input yields an empty string.
    """
    if not text:
        return ""

    # Whitespace is collapsed before the junk characters are removed
    collapsed = re.sub(r' +', ' ', re.sub(r'\n\s*\n', '\n\n', text))

    junk = {ord(ch): None for ch in ('\ufffd', '\x00', '\u200b')}
    return collapsed.translate(junk).strip()
212
 
213
def table_to_text(table: List[List]) -> str:
    """
    Render a table (list of rows) as pipe-separated text, one row per line.

    ``None`` cells become empty strings; every other cell is converted with
    ``str`` and stripped, so falsy values such as ``0`` are kept. Empty rows
    and rows whose cells are all blank are omitted. A falsy *table* yields an
    empty string.
    """
    if not table:
        return ""

    rendered = []
    for row in table:
        if not row:
            continue
        # Only None means "no value"; a truthiness test would also drop 0/False
        cells = ["" if cell is None else str(cell).strip() for cell in row]
        if any(cells):
            rendered.append(" | ".join(cells))

    return "\n".join(rendered)
226
 
227
def get_file_info(file_path: str) -> Dict:
    """
    Describe the file at *file_path*.

    Returns a dict with ``name`` (base name), ``size`` (bytes) and ``size_mb``
    (size in MiB rounded to two decimals), or an empty dict if the file cannot
    be inspected.
    """
    try:
        target = Path(file_path)
        byte_count = target.stat().st_size
        return dict(
            name=target.name,
            size=byte_count,
            size_mb=round(byte_count / (1024 * 1024), 2),
        )
    except Exception:
        return {}
239
 
240
def generate_document_summary(text: str) -> str:
    """
    Build a short statistics-plus-preview summary of *text*.

    The summary lists character, word and line counts followed by a preview
    made of the first three sentences (split on ``.``, ``!`` and ``?``),
    truncated to 300 characters with a trailing ``...``. Falsy *text* yields
    ``"No text extracted"``.
    """
    if not text:
        return "No text extracted"

    # Basic statistics
    words = len(text.split())
    lines = len(text.split('\n'))
    chars = len(text)

    # Splitting leaves leading whitespace on each fragment and an empty
    # fragment after the final terminator; strip and drop those so the joined
    # preview has no double spaces or stray ". " pieces.
    fragments = (piece.strip() for piece in re.split(r'[.!?]+', text))
    sentences = [piece for piece in fragments if piece]
    preview = '. '.join(sentences[:3])
    if len(preview) > 300:
        preview = preview[:300] + "..."

    return "\n".join([
        "",
        "Document Statistics:",
        f"- Characters: {chars:,}",
        f"- Words: {words:,}",
        f"- Lines: {lines:,}",
        "",
        "Preview:",
        preview,
        "",
    ])
265
+
266
def _resolve_upload_path(file) -> Optional[str]:
    """Return the on-disk path behind an upload value, or None if it has none."""
    if isinstance(file, (str, os.PathLike)):
        return os.fspath(file)
    # File-wrapper objects expose the path of their backing file as ``.name``
    candidate = getattr(file, 'name', None)
    if isinstance(candidate, str) and os.path.isfile(candidate):
        return candidate
    return None


def process_pdf_file(file) -> Tuple[str, str, str, str]:
    """
    Process an uploaded PDF for the Gradio interface.

    *file* may be a path (``str`` / ``os.PathLike``), an object whose ``name``
    attribute is the path of an existing file, raw ``bytes``, or a readable
    binary stream. Paths are processed in place; bytes and streams are first
    written to a temporary ``.pdf`` file that is always removed afterwards.

    Returns a ``(status, info, summary, display_text)`` tuple of strings. On
    failure only ``status`` is populated. ``display_text`` is limited to the
    first 5000 characters of the extracted text, followed by up to three table
    previews of at most 500 characters each.
    """
    if file is None:
        return "No file uploaded", "", "", ""

    tmp_file_path = None
    try:
        source_path = _resolve_upload_path(file)
        if source_path is None:
            # No path available: persist the payload so the extractors can open it
            payload = file if isinstance(file, (bytes, bytearray)) else file.read()
            with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp_file:
                tmp_file.write(payload)
                tmp_file_path = tmp_file.name
            source_path = tmp_file_path

        # Process the PDF
        result = enhanced_pdf_processor(source_path)

        if not result['success']:
            error_msg = result.get('error', 'Unknown error')
            return f"❌ Processing failed: {error_msg}", "", "", ""

        # Format results for display
        status = f"✅ Successfully processed using {result['extraction_method']}"

        # File info
        file_info = result.get('file_info', {})
        info = "\n".join([
            "",
            f"File: {file_info.get('name', 'Unknown')}",
            f"Size: {file_info.get('size_mb', 0)} MB",
            f"Pages: {result.get('metadata', {}).get('page_count', 'Unknown')}",
            "",
        ])

        # Summary
        summary = result.get('summary', 'No summary available')

        # Full text (truncated for display)
        full_text = result['text']
        if len(full_text) > 5000:
            display_text = full_text[:5000] + f"\n\n... (Text truncated. Total length: {len(full_text)} characters)"
        else:
            display_text = full_text

        # Tables info
        if result['tables']:
            tables_info = f"\n\nTables found: {len(result['tables'])}"
            for i, table in enumerate(result['tables'][:3]):  # Show first 3 tables
                tables_info += f"\n\nTable {i+1} (Page {table['page']}):\n"
                tables_info += table['text_representation'][:500]
                if len(table['text_representation']) > 500:
                    tables_info += "..."
            display_text += tables_info

        return status, info, summary, display_text

    except Exception as e:
        return f"❌ Error: {str(e)}", "", "", ""

    finally:
        # Clean up the temporary copy on every exit path
        if tmp_file_path is not None:
            try:
                os.unlink(tmp_file_path)
            except OSError as cleanup_error:
                logger.warning(f"Could not remove temporary file {tmp_file_path}: {cleanup_error}")
325
+
326
def answer_question(text: str, question: str) -> str:
    """
    Simple keyword-based question answering.

    Keywords are the distinct words of *question* longer than three characters,
    lower-cased and with punctuation removed. Each sentence of *text* (split on
    ``.``, ``!`` and ``?``) is scored by how many keywords it contains, and the
    three best-scoring sentences are returned in a short message. Sentences
    with equal scores keep their document order.

    Returns a prompt for input when either argument is empty, and a "couldn't
    find" message when no sentence contains any keyword.
    """
    if not text or not question:
        return "Please provide both text and a question."

    # Tokenise on word characters so trailing punctuation ("document?") does
    # not become part of a keyword; a set avoids double-counting repeats.
    keywords = {word for word in re.findall(r'\w+', question.lower()) if len(word) > 3}

    # Score every sentence by the number of keywords it mentions
    relevant_sentences = []
    for sentence in re.split(r'[.!?]+', text):
        sentence_lower = sentence.lower()
        score = sum(1 for keyword in keywords if keyword in sentence_lower)
        if score > 0:
            relevant_sentences.append((sentence.strip(), score))

    # Sort by relevance and take top 3
    relevant_sentences.sort(key=lambda scored: scored[1], reverse=True)
    top_sentences = [sentence for sentence, _ in relevant_sentences[:3]]

    if top_sentences:
        return "Based on the document, here are the most relevant sections:\n\n" + "\n\n".join(top_sentences)
    return "I couldn't find information related to your question in the document."
358
+
359
# Module-level holder for the text of the most recently processed document
extracted_text = ""


def update_extracted_text(status, info, summary, full_text):
    """Store *full_text* in the module-level ``extracted_text`` and return all four arguments unchanged."""
    global extracted_text
    passthrough = (status, info, summary, full_text)
    extracted_text = passthrough[-1]
    return passthrough
367
+
368
def qa_interface(question):
    """Answer *question* against the text currently held in the module-level ``extracted_text``."""
    document = extracted_text  # read-only access, so no ``global`` declaration is needed
    return answer_question(document, question)
372
 
373
+ # Create Gradio interface
374
with gr.Blocks(title="PDF Processor & Q&A System") as app:
    gr.Markdown("# 📄 PDF Processor & Question Answering System")
    gr.Markdown("Upload a PDF file to extract text and ask questions about its content.")

    # Per-session copy of the processed text. A module-level global is shared by
    # every visitor of the app, so one user's upload could end up answering
    # another user's question; gr.State keeps the value private to the session.
    document_state = gr.State("")

    with gr.Tab("PDF Processing"):
        with gr.Row():
            with gr.Column():
                file_input = gr.File(label="Upload PDF", file_types=[".pdf"])
                process_btn = gr.Button("Process PDF", variant="primary")

            with gr.Column():
                status_output = gr.Textbox(label="Status", lines=2)
                info_output = gr.Textbox(label="File Information", lines=4)

        summary_output = gr.Textbox(label="Document Summary", lines=8)
        text_output = gr.Textbox(label="Extracted Text", lines=15, max_lines=20)

    with gr.Tab("Question & Answer"):
        gr.Markdown("Ask questions about the processed PDF content.")
        with gr.Row():
            question_input = gr.Textbox(label="Your Question", placeholder="What is this document about?")
            ask_btn = gr.Button("Ask Question", variant="primary")

        answer_output = gr.Textbox(label="Answer", lines=8)

    # Event handlers
    process_btn.click(
        fn=process_pdf_file,
        inputs=[file_input],
        outputs=[status_output, info_output, summary_output, text_output]
    ).then(
        # Remember the text shown in the "Extracted Text" box for this session.
        # NOTE: this is the display text produced by process_pdf_file, which is
        # cut off after 5000 characters, so Q&A only sees that portion.
        fn=lambda shown_text: shown_text,
        inputs=[text_output],
        outputs=[document_state]
    )

    ask_btn.click(
        fn=answer_question,
        inputs=[document_state, question_input],
        outputs=[answer_output]
    )

    # Example
    gr.Examples(
        examples=[
            ["What is the main topic of this document?"],
            ["What are the key findings?"],
            ["Who are the authors?"],
            ["What is the conclusion?"]
        ],
        inputs=[question_input]
    )
426
 
427
  if __name__ == "__main__":
428
+ app.launch()