ShayanRl committed on
Commit
8818767
·
verified ·
1 Parent(s): 177b3aa

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +494 -0
app.py ADDED
@@ -0,0 +1,494 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import html
import os
import re
import tempfile
from dataclasses import dataclass
from typing import Any, Dict, List, Optional

import gradio as gr
import pdfplumber
import requests

try:
    import fitz  # PyMuPDF
    PYMUPDF_AVAILABLE = True
except ImportError:
    PYMUPDF_AVAILABLE = False
    print("PyMuPDF not available, using pdfplumber only")
18
+
19
+
20
@dataclass
class PDFElement:
    """Represents an element extracted from PDF.

    Only ``type``, ``content`` and ``page`` are required; the remaining
    fields default to ``None`` and are therefore annotated as optional.
    """
    # Element category; the extraction code in this file produces
    # "heading", "paragraph", "list" and "table".
    type: str
    # Text for textual elements; for tables, the rows returned by pdfplumber.
    content: Any
    # Page number the element was found on (pages are enumerated from 1).
    page: int
    # Not populated by the extraction code visible in this file.
    bbox: Optional[tuple] = None
    # Not populated by the extraction code visible in this file.
    style: Optional[Dict] = None
    # Heading level (1 or 2) for headings; None for every other element type.
    level: Optional[int] = None
29
+
30
+
31
class PDFProcessor:
    """Download a PDF, extract its elements and render them as HTML.

    Attributes:
        elements: PDFElement objects produced by the most recent extraction.
        html_content: HTML rendered from ``elements`` by the most recent run.
        element_counter: Running counter used to build element IDs. It is
            never reset, so IDs stay unique for the lifetime of the instance.
    """

    # A line counts as a list item when it opens with digits, dashes,
    # bullets (U+2022) or asterisks, an optional ".", then whitespace.
    # The bullet is written as an escape so it survives re-encoding.
    _LIST_ITEM_RE = re.compile(r'^[\d\-\u2022*]+\.?\s+')

    # Browser-like User-Agent sent with the download request.
    _DOWNLOAD_HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }

    # Stylesheet plus the opening wrapper <div> that precede the elements.
    _HTML_PREAMBLE = '''
<style>
    .pdf-content {
        font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Oxygen, Ubuntu, Cantarell, sans-serif;
        line-height: 1.8;
        color: #333;
        max-width: 100%;
        padding: 20px;
    }
    .pdf-content h1,
    .pdf-content h2,
    .pdf-content h3 {
        color: #2c3e50;
        margin: 25px 0 15px 0;
        font-weight: 600;
    }
    .pdf-content h1 { font-size: 2em; border-bottom: 3px solid #667eea; padding-bottom: 10px; }
    .pdf-content h2 { font-size: 1.6em; border-bottom: 2px solid #e0e0e0; padding-bottom: 8px; }
    .pdf-content h3 { font-size: 1.3em; }
    .pdf-content table {
        border-collapse: collapse;
        width: 100%;
        margin: 20px 0;
        box-shadow: 0 2px 8px rgba(0,0,0,0.1);
        border-radius: 8px;
        overflow: hidden;
    }
    .pdf-content th,
    .pdf-content td {
        border: 1px solid #e0e0e0;
        padding: 12px 15px;
        text-align: left;
    }
    .pdf-content th {
        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
        color: white;
        font-weight: 600;
        text-transform: uppercase;
        font-size: 0.9em;
        letter-spacing: 0.5px;
    }
    .pdf-content tr:nth-child(even) {
        background-color: #f8f9fa;
    }
    .pdf-content tr:hover {
        background-color: #e3f2fd;
        transition: background-color 0.2s;
    }
    .pdf-content p {
        margin: 12px 0;
        text-align: justify;
    }
    .pdf-content li {
        margin: 8px 0;
        margin-left: 25px;
    }
    .pdf-content .page-marker {
        color: #666;
        font-size: 0.95em;
        font-weight: 600;
        margin: 40px 0 20px 0;
        padding: 12px 20px;
        background: linear-gradient(135deg, #f5f7fa 0%, #c3cfe2 100%);
        border-left: 5px solid #667eea;
        border-radius: 4px;
        box-shadow: 0 2px 4px rgba(0,0,0,0.1);
    }
    .pdf-content ul, .pdf-content ol {
        margin: 15px 0;
        padding-left: 30px;
    }
</style>
<div class="pdf-content">
'''

    def __init__(self):
        self.elements = []
        self.html_content = ""
        self.element_counter = 0

    def process_pdf(self, pdf_url: str) -> Dict:
        """Download, extract and render the PDF at ``pdf_url``.

        Stores the results on ``self.elements`` and ``self.html_content``.

        Returns:
            A dict with the keys ``total_elements``, ``pages`` (highest page
            number that produced an element, 0 if none), ``headings``,
            ``tables`` and ``paragraphs``.

        Raises:
            Whatever the download or the pdfplumber extraction raises; the
            temporary file is removed either way.
        """
        temp_file = None

        try:
            temp_file = self._download_pdf(pdf_url)

            self.elements = self._extract_content(temp_file)
            self.html_content = self._convert_to_html()

            def count(kind):
                return sum(1 for element in self.elements if element.type == kind)

            return {
                'total_elements': len(self.elements),
                'pages': max((element.page for element in self.elements), default=0),
                'headings': count('heading'),
                'tables': count('table'),
                'paragraphs': count('paragraph'),
            }

        finally:
            if temp_file:
                try:
                    os.unlink(temp_file)
                except OSError:
                    # Best-effort cleanup: a missing or locked temp file must
                    # not mask the real result or the original exception.
                    pass

    def _download_pdf(self, url: str) -> str:
        """Download ``url`` into a temporary ``.pdf`` file and return its path.

        The caller owns the returned file and must delete it.

        Raises:
            requests.HTTPError: if the server answers with an error status.
        """
        response = requests.get(url, headers=self._DOWNLOAD_HEADERS, timeout=30)
        response.raise_for_status()

        # delete=False keeps the file on disk after the handle is closed; the
        # context manager closes the handle even if the write raises.
        with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as pdf_file:
            pdf_file.write(response.content)

        return pdf_file.name

    def _get_element_id(self, element_type: str) -> str:
        """Return the next ID of the form ``<element_type>-<counter>``."""
        self.element_counter += 1
        return f"{element_type}-{self.element_counter}"

    @classmethod
    def _is_list_item(cls, text: str) -> bool:
        """Tell whether ``text`` opens with a list marker such as "1." or "-"."""
        return cls._LIST_ITEM_RE.match(text) is not None

    @staticmethod
    def _page_tables(page, page_num: int) -> List["PDFElement"]:
        """Return one table element per non-empty table on a pdfplumber page."""
        return [
            PDFElement(type="table", content=table, page=page_num)
            for table in page.extract_tables()
            if table
        ]

    def _extract_content(self, pdf_path: str) -> List["PDFElement"]:
        """Extract structured content from the PDF at ``pdf_path``.

        PyMuPDF is tried first when it is installed; on any failure the
        pdfplumber-only path is used instead. Each path builds its own list,
        so a half-finished PyMuPDF pass never leaks elements into the
        fallback result.
        """
        if PYMUPDF_AVAILABLE:
            try:
                return self._extract_with_pymupdf(pdf_path)
            except Exception as e:
                print(f"PyMuPDF failed: {e}, falling back to pdfplumber")

        return self._extract_with_pdfplumber(pdf_path)

    def _extract_with_pymupdf(self, pdf_path: str) -> List["PDFElement"]:
        """Classify text spans by font size (PyMuPDF), then add pdfplumber tables."""
        elements = []

        doc = fitz.open(pdf_path)
        try:
            for page_num, page in enumerate(doc, 1):
                for block in page.get_text("dict")["blocks"]:
                    if block["type"] != 0:  # only type 0 (text) blocks are read
                        continue
                    for line in block["lines"]:
                        for span in line["spans"]:
                            text = span["text"].strip()
                            if not text:
                                continue

                            font_size = span["size"]

                            # Simple classification by font size, then marker
                            if font_size > 14:
                                kind, level = "heading", (1 if font_size > 18 else 2)
                            elif self._is_list_item(text):
                                kind, level = "list", None
                            else:
                                kind, level = "paragraph", None

                            elements.append(PDFElement(
                                type=kind,
                                content=text,
                                page=page_num,
                                level=level,
                            ))
        finally:
            # Release the document even when extraction fails part-way.
            doc.close()

        with pdfplumber.open(pdf_path) as pdf:
            for page_num, page in enumerate(pdf.pages, 1):
                elements.extend(self._page_tables(page, page_num))

        # Tables were collected after all text; a stable sort by page puts
        # each page's tables right after that page's text, so the renderer
        # emits every page marker (and its "page-N" id) only once.
        elements.sort(key=lambda element: element.page)
        return elements

    def _extract_with_pdfplumber(self, pdf_path: str) -> List["PDFElement"]:
        """Extract text lines and tables page by page using pdfplumber only."""
        elements = []

        with pdfplumber.open(pdf_path) as pdf:
            for page_num, page in enumerate(pdf.pages, 1):
                for raw_line in (page.extract_text() or "").split('\n'):
                    line = raw_line.strip()
                    if not line:
                        continue

                    # Without font sizes, short all-caps lines stand in
                    # for headings.
                    if line.isupper() and len(line) < 100:
                        kind, level = "heading", 1
                    elif self._is_list_item(line):
                        kind, level = "list", None
                    else:
                        kind, level = "paragraph", None

                    elements.append(PDFElement(
                        type=kind,
                        content=line,
                        page=page_num,
                        level=level,
                    ))

                elements.extend(self._page_tables(page, page_num))

        return elements

    def _convert_to_html(self) -> str:
        """Render ``self.elements`` as styled HTML with an ID on every element.

        A page marker is emitted whenever the page number changes, runs of
        consecutive list elements are wrapped in a single ``<ul>``, and all
        text content is HTML-escaped. The first row of a table is rendered
        with ``<th>`` cells, the remaining rows with ``<td>``.
        """
        html_parts = [self._HTML_PREAMBLE]

        current_page = 0
        in_list = False

        def close_open_list():
            nonlocal in_list
            if in_list:
                html_parts.append('</ul>')
                in_list = False

        for elem in self.elements:
            # Add page marker
            if elem.page != current_page:
                close_open_list()
                current_page = elem.page
                html_parts.append(
                    f'<div class="page-marker" id="page-{current_page}">📄 Page {current_page}</div>'
                )

            if elem.type == "heading":
                close_open_list()
                level = elem.level or 2
                elem_id = self._get_element_id('heading')
                content = html.escape(elem.content)
                html_parts.append(
                    f'<h{level} id="{elem_id}" data-page="{elem.page}">{content}</h{level}>'
                )

            elif elem.type == "paragraph":
                close_open_list()
                elem_id = self._get_element_id('paragraph')
                content = html.escape(elem.content)
                html_parts.append(f'<p id="{elem_id}" data-page="{elem.page}">{content}</p>')

            elif elem.type == "list":
                if not in_list:
                    html_parts.append('<ul>')
                    in_list = True
                elem_id = self._get_element_id('list-item')
                content = html.escape(elem.content)
                html_parts.append(f'<li id="{elem_id}" data-page="{elem.page}">{content}</li>')

            elif elem.type == "table":
                close_open_list()
                elem_id = self._get_element_id('table')
                html_parts.append(f'<table id="{elem_id}" data-page="{elem.page}">')
                for row_index, row in enumerate(elem.content):
                    row_id = self._get_element_id('table-row')
                    html_parts.append(f'<tr id="{row_id}">')
                    tag = 'th' if row_index == 0 else 'td'
                    for cell in row:
                        cell_id = self._get_element_id('table-cell')
                        # Falsy cells (None, "") render as empty cells
                        cell_content = html.escape(str(cell)) if cell else ""
                        html_parts.append(f'<{tag} id="{cell_id}">{cell_content}</{tag}>')
                    html_parts.append('</tr>')
                html_parts.append('</table>')

        close_open_list()

        html_parts.append('</div>')
        return '\n'.join(html_parts)


# Global processor; process_pdf_url() rebinds this name to a fresh instance
# for every request.
processor = PDFProcessor()
327
+
328
+
329
def process_pdf_url(pdf_url):
    """Process the PDF at ``pdf_url`` and return the values for the UI.

    Args:
        pdf_url: URL entered by the user; surrounding whitespace is ignored.

    Returns:
        A ``(summary_markdown, html, html)`` tuple. The HTML appears twice
        because it feeds both the rendered preview and the source view. On
        empty input or on any processing error the first item is an error
        message and both HTML items are empty strings.
    """
    global processor

    if not pdf_url or not pdf_url.strip():
        return "❌ Please enter a PDF URL", "", ""

    try:
        # Work on a local instance and read the result from it: the module
        # level name can be rebound by another request while this one is
        # still running.
        current = PDFProcessor()
        processor = current
        summary = current.process_pdf(pdf_url.strip())

        # Built line by line so the Markdown never depends on how this
        # source file happens to be indented.
        summary_text = "\n".join([
            "### ✅ PDF Processed Successfully!",
            "",
            "**📊 Summary:**",
            f"- **Total Elements:** {summary['total_elements']}",
            f"- **Pages:** {summary['pages']}",
            f"- **Headings:** {summary['headings']}",
            f"- **Tables:** {summary['tables']}",
            f"- **Paragraphs:** {summary['paragraphs']}",
            "",
        ])

        return summary_text, current.html_content, current.html_content

    except Exception as e:
        # UI boundary: report the failure to the user instead of crashing.
        error_msg = f"❌ Error processing PDF: {str(e)}"
        return error_msg, "", ""
355
+
356
+
357
# Create Gradio interface
with gr.Blocks(
    title="PDF to HTML Converter",
    theme=gr.themes.Soft(
        primary_hue="indigo",
        secondary_hue="purple",
    ),
    css="""
    .gradio-container {
        max-width: 1200px !important;
    }
    #html_preview {
        min-height: 600px;
        max-height: 800px;
        overflow-y: auto;
        border: 1px solid #e0e0e0;
        border-radius: 8px;
        padding: 20px;
        background: white;
    }
    #html_source {
        font-family: 'Courier New', monospace;
        font-size: 13px;
        line-height: 1.5;
    }
    """
) as demo:

    # Markdown is assembled line by line so rendering never depends on the
    # indentation of this source file.
    gr.Markdown("\n".join([
        "# 📄 PDF to HTML Converter",
        "",
        "Extract PDF content and view as beautifully structured HTML with unique IDs for each element.",
        "",
        "Simply paste a PDF URL and click **Process PDF** to get started!",
    ]))

    with gr.Row():
        with gr.Column(scale=4):
            pdf_url_input = gr.Textbox(
                label="PDF URL",
                placeholder="https://example.com/document.pdf",
                lines=1,
                max_lines=1
            )
        with gr.Column(scale=1):
            process_btn = gr.Button(
                "🚀 Process PDF",
                variant="primary",
                size="lg"
            )

    summary_output = gr.Markdown(label="Summary")

    gr.Markdown("---")

    with gr.Tabs():
        with gr.Tab("📋 HTML Preview"):
            html_preview = gr.HTML(
                label="Rendered HTML",
                elem_id="html_preview"
            )

        # NOTE(review): the download controls are assumed to live inside this
        # tab; confirm against the intended layout.
        with gr.Tab("💻 HTML Source"):
            html_source = gr.Code(
                label="HTML Source Code",
                language="html",
                lines=25,
                elem_id="html_source"
            )
            download_btn = gr.Button("📥 Download HTML", variant="secondary")
            download_file = gr.File(label="Download", visible=False)

    # Event handlers: the button and the Enter key share one wiring
    process_event = dict(
        fn=process_pdf_url,
        inputs=[pdf_url_input],
        outputs=[summary_output, html_preview, html_source],
    )
    process_btn.click(**process_event)

    # Allow Enter key to process
    pdf_url_input.submit(**process_event)

    def create_download_file(html_content):
        """Wrap ``html_content`` in a full HTML document and offer it for download.

        Returns a Gradio update for the file component: hidden and empty when
        there is nothing to download, otherwise visible and pointing at a
        temporary ``.html`` file. The component is created hidden, so it has
        to be revealed here or the user would never see the file.
        """
        if not html_content:
            return gr.update(value=None, visible=False)

        # Create full HTML document
        full_html = "\n".join([
            "<!DOCTYPE html>",
            '<html lang="en">',
            "<head>",
            '<meta charset="UTF-8">',
            '<meta name="viewport" content="width=device-width, initial-scale=1.0">',
            "<title>Extracted PDF Content</title>",
            "</head>",
            "<body>",
            html_content,
            "</body>",
            "</html>",
        ])

        # delete=False: the file must outlive this handler so it can be served
        with tempfile.NamedTemporaryFile(
            mode='w', delete=False, suffix='.html', encoding='utf-8'
        ) as html_file:
            html_file.write(full_html)

        return gr.update(value=html_file.name, visible=True)

    download_btn.click(
        fn=create_download_file,
        inputs=[html_source],
        outputs=[download_file]
    )

    gr.Markdown("\n".join([
        "---",
        "### 📌 Features:",
        "- ✨ Extracts text, tables, headings from PDFs",
        "- 🎯 Each HTML element has a unique ID",
        "- 📊 Beautiful table styling",
        "- 🔖 Page markers for easy navigation",
        "- 💾 Download extracted HTML",
        "",
        "### 💡 Example PDFs to try:",
        "- Research papers from arXiv",
        "- Product documentation",
        "- Financial reports",
        "- Any publicly accessible PDF!",
    ]))


# Launch
if __name__ == "__main__":
    demo.launch()