# pdf_F / app.py
# ShayanRl's picture
# Update app.py
# 662bd69 verified
import html
import os
import re
import tempfile
from typing import Any, Dict, List, Optional

import gradio as gr
import pdfplumber
import requests
# PyMuPDF is optional: record whether it could be imported so the extractor
# can choose between the PyMuPDF path and the pdfplumber-only path.
try:
    import fitz  # PyMuPDF
except ImportError:
    PYMUPDF_AVAILABLE = False
    print("PyMuPDF not available, using pdfplumber only")
else:
    PYMUPDF_AVAILABLE = True
from dataclasses import dataclass
@dataclass
class PDFElement:
    """A single piece of content extracted from a PDF page."""

    # Element kind. The extraction code in this file produces "heading",
    # "paragraph", "list" and "table".
    type: str
    # Text for textual elements; the extracted rows for a table.
    content: Any
    # Page number, counted from 1.
    page: int
    # Not populated by the extraction code in this file.
    bbox: Optional[tuple] = None
    # Not populated by the extraction code in this file.
    style: Optional[Dict] = None
    # Heading level (1 or 2) for headings; None for every other kind.
    level: Optional[int] = None
class PDFProcessor:
    """Download a PDF, extract its structure, and render it as HTML.

    Attributes:
        elements: Elements from the most recent extraction.
        html_content: HTML produced by the most recent ``process_pdf`` call.
        element_counter: Running counter behind the generated element IDs.
    """

    # A line is a list item when it starts with digits, dashes, bullets or
    # asterisks (optionally followed by a dot) and then whitespace.
    _LIST_MARKER = re.compile(r'^[\d\-•*]+\.?\s+')

    # Element kinds that terminate a currently open <ul>.
    _LIST_BREAKERS = ("heading", "paragraph", "table")

    # Stylesheet and opening wrapper emitted before the extracted content.
    _HTML_HEADER = '''
    <style>
    .pdf-content {
        font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Oxygen, Ubuntu, Cantarell, sans-serif;
        line-height: 1.8;
        color: #333;
        max-width: 100%;
        padding: 20px;
    }
    .pdf-content h1,
    .pdf-content h2,
    .pdf-content h3 {
        color: #2c3e50;
        margin: 25px 0 15px 0;
        font-weight: 600;
    }
    .pdf-content h1 { font-size: 2em; border-bottom: 3px solid #667eea; padding-bottom: 10px; }
    .pdf-content h2 { font-size: 1.6em; border-bottom: 2px solid #e0e0e0; padding-bottom: 8px; }
    .pdf-content h3 { font-size: 1.3em; }
    .pdf-content table {
        border-collapse: collapse;
        width: 100%;
        margin: 20px 0;
        box-shadow: 0 2px 8px rgba(0,0,0,0.1);
        border-radius: 8px;
        overflow: hidden;
    }
    .pdf-content th,
    .pdf-content td {
        border: 1px solid #e0e0e0;
        padding: 12px 15px;
        text-align: left;
    }
    .pdf-content th {
        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
        color: white;
        font-weight: 600;
        text-transform: uppercase;
        font-size: 0.9em;
        letter-spacing: 0.5px;
    }
    .pdf-content tr:nth-child(even) {
        background-color: #f8f9fa;
    }
    .pdf-content tr:hover {
        background-color: #e3f2fd;
        transition: background-color 0.2s;
    }
    .pdf-content p {
        margin: 12px 0;
        text-align: justify;
    }
    .pdf-content li {
        margin: 8px 0;
        margin-left: 25px;
    }
    .pdf-content .page-marker {
        color: #666;
        font-size: 0.95em;
        font-weight: 600;
        margin: 40px 0 20px 0;
        padding: 12px 20px;
        background: linear-gradient(135deg, #f5f7fa 0%, #c3cfe2 100%);
        border-left: 5px solid #667eea;
        border-radius: 4px;
        box-shadow: 0 2px 4px rgba(0,0,0,0.1);
    }
    .pdf-content ul, .pdf-content ol {
        margin: 15px 0;
        padding-left: 30px;
    }
    </style>
    <div class="pdf-content">
    '''

    def __init__(self):
        self.elements = []
        self.html_content = ""
        self.element_counter = 0

    def process_pdf(self, pdf_url: str) -> Dict:
        """Download the PDF at ``pdf_url``, extract it and build the HTML.

        The extracted elements and the HTML are stored on ``self.elements``
        and ``self.html_content``. The downloaded temporary file is removed
        before returning, whether or not processing succeeded.

        Returns:
            A dict with the keys ``total_elements``, ``pages``, ``headings``,
            ``tables`` and ``paragraphs``.

        Raises:
            Whatever the download or the extraction raises.
        """
        temp_file = None
        try:
            temp_file = self._download_pdf(pdf_url)
            self.elements = self._extract_content(temp_file)
            self.html_content = self._convert_to_html()
            return {
                'total_elements': len(self.elements),
                'pages': max((e.page for e in self.elements), default=0),
                'headings': self._count('heading'),
                'tables': self._count('table'),
                'paragraphs': self._count('paragraph'),
            }
        finally:
            if temp_file and os.path.exists(temp_file):
                try:
                    os.unlink(temp_file)
                except OSError:
                    # Cleanup is best effort; a leftover temp file must not
                    # mask the real result or error.
                    pass

    def _count(self, element_type: str) -> int:
        """Return how many extracted elements have the given type."""
        return sum(1 for e in self.elements if e.type == element_type)

    def _download_pdf(self, url: str) -> str:
        """Download ``url`` into a temporary ``.pdf`` file and return its path.

        The caller is responsible for deleting the returned file.

        Raises:
            requests.HTTPError: If the server answers with an error status.
        """
        # NOTE(review): the URL comes straight from the user and is fetched
        # server-side without any host or scheme restriction.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }
        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()

        tmp = tempfile.NamedTemporaryFile(delete=False, suffix='.pdf')
        try:
            with tmp:
                tmp.write(response.content)
        except Exception:
            # The caller never learns this path when we raise, so remove the
            # partial file here instead of leaking it.
            try:
                os.unlink(tmp.name)
            except OSError:
                pass
            raise
        return tmp.name

    def _get_element_id(self, element_type: str) -> str:
        """Return the next unique ID, formatted as ``<element_type>-<n>``."""
        self.element_counter += 1
        return f"{element_type}-{self.element_counter}"

    def _extract_content(self, pdf_path: str) -> "List[PDFElement]":
        """Extract headings, paragraphs, list items and tables from a PDF.

        When PyMuPDF is available it supplies the text (classified by font
        size) and pdfplumber supplies the tables. If that path raises, or
        PyMuPDF is not installed, pdfplumber alone is used for both.
        """
        if PYMUPDF_AVAILABLE:
            try:
                text_elements = self._extract_text_pymupdf(pdf_path)
                table_elements = []
                with pdfplumber.open(pdf_path) as pdf:
                    for page_num, page in enumerate(pdf.pages, 1):
                        table_elements.extend(self._extract_tables(page, page_num))
                return self._merge_by_page(text_elements, table_elements)
            except Exception as e:
                # Deliberate catch-all: any failure here just triggers the
                # fallback, which starts again from an empty list.
                print(f"PyMuPDF failed: {e}, falling back to pdfplumber")

        elements = []
        with pdfplumber.open(pdf_path) as pdf:
            for page_num, page in enumerate(pdf.pages, 1):
                elements.extend(self._extract_text_pdfplumber(page, page_num))
                elements.extend(self._extract_tables(page, page_num))
        return elements

    @staticmethod
    def _merge_by_page(text_elements: list, table_elements: list) -> list:
        """Combine text and tables so each page's elements are contiguous.

        The sort is stable, so within a page the text keeps its order and the
        tables follow it. Keeping pages contiguous makes the HTML renderer
        emit exactly one marker (and one ``page-N`` id) per page.
        """
        return sorted(text_elements + table_elements, key=lambda e: e.page)

    def _extract_text_pymupdf(self, pdf_path: str) -> list:
        """Return one element per non-empty text span, read with PyMuPDF."""
        elements = []
        doc = fitz.open(pdf_path)
        try:
            for page_num, page in enumerate(doc, 1):
                for block in page.get_text("dict")["blocks"]:
                    if block["type"] != 0:  # 0 marks a text block
                        continue
                    for line in block["lines"]:
                        for span in line["spans"]:
                            text = span["text"].strip()
                            if not text:
                                continue
                            element_type, level = self._classify_span(text, span["size"])
                            elements.append(PDFElement(
                                type=element_type,
                                content=text,
                                page=page_num,
                                level=level
                            ))
        finally:
            # Close the document even when a page fails to parse.
            doc.close()
        return elements

    @classmethod
    def _classify_span(cls, text: str, font_size: float) -> tuple:
        """Classify a PyMuPDF span as ``(element_type, level)`` by font size."""
        if font_size > 14:
            return "heading", (1 if font_size > 18 else 2)
        if cls._LIST_MARKER.match(text):
            return "list", None
        return "paragraph", None

    @classmethod
    def _classify_line(cls, line: str) -> tuple:
        """Classify a plain text line as ``(element_type, level)``.

        Without font information, a short all-uppercase line is treated as a
        level-1 heading.
        """
        if line.isupper() and len(line) < 100:
            return "heading", 1
        if cls._LIST_MARKER.match(line):
            return "list", None
        return "paragraph", None

    def _extract_text_pdfplumber(self, page, page_num: int) -> list:
        """Return one element per non-empty text line of a pdfplumber page."""
        elements = []
        text = page.extract_text() or ""
        for raw_line in text.split('\n'):
            line = raw_line.strip()
            if not line:
                continue
            element_type, level = self._classify_line(line)
            elements.append(PDFElement(
                type=element_type,
                content=line,
                page=page_num,
                level=level
            ))
        return elements

    @staticmethod
    def _extract_tables(page, page_num: int) -> list:
        """Return a table element for every non-empty table on the page."""
        return [
            PDFElement(type="table", content=table, page=page_num)
            for table in page.extract_tables()
            if table
        ]

    def _table_to_html(self, elem) -> list:
        """Render a table element; the first row is emitted as header cells."""
        parts = [f'<table id="{self._get_element_id("table")}" data-page="{elem.page}">']
        for i, row in enumerate(elem.content):
            parts.append(f'<tr id="{self._get_element_id("table-row")}">')
            tag = 'th' if i == 0 else 'td'
            for cell in row:
                cell_id = self._get_element_id('table-cell')
                cell_content = html.escape(str(cell)) if cell else ""
                parts.append(f'<{tag} id="{cell_id}">{cell_content}</{tag}>')
            parts.append('</tr>')
        parts.append('</table>')
        return parts

    def _convert_to_html(self) -> str:
        """Render ``self.elements`` as styled HTML.

        Each heading, paragraph, list item, table, table row and table cell
        receives a generated ``id``. A page marker is inserted whenever the
        page number changes, and consecutive list items share one ``<ul>``.
        All extracted text is HTML-escaped.
        """
        html_parts = [self._HTML_HEADER]
        current_page = 0
        in_list = False

        for elem in self.elements:
            new_page = elem.page != current_page

            # A page change or any non-list element ends the open list.
            if in_list and (new_page or elem.type in self._LIST_BREAKERS):
                html_parts.append('</ul>')
                in_list = False

            if new_page:
                current_page = elem.page
                html_parts.append(f'<div class="page-marker" id="page-{current_page}">📄 Page {current_page}</div>')

            if elem.type == "heading":
                level = elem.level or 2
                elem_id = self._get_element_id('heading')
                content = html.escape(elem.content)
                html_parts.append(f'<h{level} id="{elem_id}" data-page="{elem.page}">{content}</h{level}>')
            elif elem.type == "paragraph":
                elem_id = self._get_element_id('paragraph')
                content = html.escape(elem.content)
                html_parts.append(f'<p id="{elem_id}" data-page="{elem.page}">{content}</p>')
            elif elem.type == "list":
                if not in_list:
                    html_parts.append('<ul>')
                    in_list = True
                elem_id = self._get_element_id('list-item')
                content = html.escape(elem.content)
                html_parts.append(f'<li id="{elem_id}" data-page="{elem.page}">{content}</li>')
            elif elem.type == "table":
                html_parts.extend(self._table_to_html(elem))

        if in_list:
            html_parts.append('</ul>')
        html_parts.append('</div>')
        return '\n'.join(html_parts)
# Module-level processor instance. process_pdf_url() rebinds this name to a
# fresh PDFProcessor each time it handles a non-empty URL.
processor = PDFProcessor()
def process_pdf_url(pdf_url):
    """Process the PDF at ``pdf_url`` for the Gradio UI.

    Args:
        pdf_url: URL typed by the user; may be empty or ``None``.

    Returns:
        A 3-tuple ``(summary_markdown, html, html)``. The HTML is returned
        twice because it feeds two outputs. On empty input or on any
        processing error, the first item is an error message and both HTML
        items are empty strings.
    """
    global processor
    if not pdf_url or not pdf_url.strip():
        return "❌ Please enter a PDF URL", "", ""
    try:
        # Keep a request-local reference: the module-level `processor` can be
        # rebound by another request while this one is still running, so
        # reading the result back through the global could return that other
        # request's HTML.
        current = PDFProcessor()
        processor = current
        summary = current.process_pdf(pdf_url.strip())
        summary_text = f"""### ✅ PDF Processed Successfully!
**📊 Summary:**
- **Total Elements:** {summary['total_elements']}
- **Pages:** {summary['pages']}
- **Headings:** {summary['headings']}
- **Tables:** {summary['tables']}
- **Paragraphs:** {summary['paragraphs']}
"""
        return summary_text, current.html_content, current.html_content
    except Exception as e:
        # UI boundary: report the failure to the user instead of raising.
        error_msg = f"❌ Error processing PDF: {str(e)}"
        return error_msg, "", ""
def create_download_file(html_content):
    """Wrap ``html_content`` in a full HTML document and save it to disk.

    Args:
        html_content: HTML fragment to place inside ``<body>``.

    Returns:
        Path of a newly created temporary ``.html`` file (UTF-8), or ``None``
        when ``html_content`` is empty. The file is not deleted by this
        function.
    """
    if not html_content:
        return None
    # Create full HTML document
    full_html = f"""<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Extracted PDF Content</title>
</head>
<body>
{html_content}
</body>
</html>"""
    # The context manager closes the handle even if the write fails;
    # delete=False keeps the file on disk so it can be served afterwards.
    with tempfile.NamedTemporaryFile(
        mode='w', delete=False, suffix='.html', encoding='utf-8'
    ) as temp_file:
        temp_file.write(full_html)
    return temp_file.name
def _offer_download(html_content):
    """Write the HTML to a file and reveal the download widget.

    The file component starts hidden, so returning only a path would leave
    the generated file invisible. The widget is shown only when a file was
    actually created.
    """
    path = create_download_file(html_content)
    return gr.update(value=path, visible=path is not None)


# Create Gradio interface
with gr.Blocks(title="PDF to HTML Converter") as demo:
    gr.Markdown(
        """
# 📄 PDF to HTML Converter
Extract PDF content and view as beautifully structured HTML with unique IDs for each element.
Simply paste a PDF URL and click **Process PDF** to get started!
"""
    )
    with gr.Row():
        with gr.Column(scale=4):
            pdf_url_input = gr.Textbox(
                label="PDF URL",
                placeholder="https://example.com/document.pdf"
            )
        with gr.Column(scale=1):
            process_btn = gr.Button("🚀 Process PDF", variant="primary")
    summary_output = gr.Markdown(label="Summary")
    gr.Markdown("---")
    with gr.Tabs():
        with gr.Tab("📋 HTML Preview"):
            html_preview = gr.HTML(label="Rendered HTML")
        with gr.Tab("💻 HTML Source"):
            html_source = gr.Code(
                label="HTML Source Code",
                language="html"
            )
            download_btn = gr.Button("📥 Download HTML")
            download_file = gr.File(label="Download", visible=False)

    # Event handlers: the button click and pressing Enter in the URL box
    # both run the same processing function with the same wiring.
    for trigger in (process_btn.click, pdf_url_input.submit):
        trigger(
            fn=process_pdf_url,
            inputs=[pdf_url_input],
            outputs=[summary_output, html_preview, html_source]
        )

    download_btn.click(
        fn=_offer_download,
        inputs=[html_source],
        outputs=[download_file]
    )
    gr.Markdown(
        """
---
### 📌 Features:
- ✨ Extracts text, tables, headings from PDFs
- 🎯 Each HTML element has a unique ID
- 📊 Beautiful table styling
- 🔖 Page markers for easy navigation
- 💾 Download extracted HTML
### 💡 Example PDFs to try:
- Research papers from arXiv
- Product documentation
- Financial reports
- Any publicly accessible PDF!
"""
    )

# Launch
if __name__ == "__main__":
    demo.launch()