import html
import os
import re
import tempfile
from dataclasses import dataclass
from typing import Any, Dict, List, Optional

import gradio as gr
import pdfplumber
import requests

try:
    import fitz  # PyMuPDF
    PYMUPDF_AVAILABLE = True
except ImportError:
    PYMUPDF_AVAILABLE = False
    print("PyMuPDF not available, using pdfplumber only")
@dataclass
class PDFElement:
    """One structural element extracted from a PDF.

    ``content`` is plain text for 'heading'/'paragraph'/'list' elements and a
    list of rows (each a list of cells) for 'table' elements.
    """

    type: str                      # 'heading' | 'paragraph' | 'list' | 'table'
    content: Any                   # text, or list-of-rows for tables
    page: int                      # 1-based page number the element came from
    bbox: Optional[tuple] = None   # bounding box when the extractor provides one
    style: Optional[Dict] = None   # font/style metadata when available
    level: Optional[int] = None    # heading level (1 or 2); None for non-headings
class PDFProcessor:
    """Simplified PDF processor.

    Downloads a PDF from a URL, extracts structured elements (headings,
    paragraphs, list items, tables) and renders them as HTML in which every
    element carries a unique, sequentially numbered id.
    """

    # Matches list-item prefixes such as "1. ", "- ", "• ", "* ".
    _LIST_MARKER = re.compile(r'^[\d\-\•\*]+\.?\s+')

    def __init__(self):
        self.elements = []        # extracted PDFElement instances, document order
        self.html_content = ""    # HTML produced by the last process_pdf() call
        self.element_counter = 0  # monotonically increasing suffix for element ids

    def process_pdf(self, pdf_url: str) -> Dict:
        """Download and process the PDF at *pdf_url*.

        Populates ``self.elements`` and ``self.html_content`` and returns a
        summary dict with element counts. The temporary download is always
        removed, even when extraction fails.
        """
        temp_file = None
        try:
            temp_file = self._download_pdf(pdf_url)
            self.elements = self._extract_content(temp_file)
            self.html_content = self._convert_to_html()
            return {
                'total_elements': len(self.elements),
                'pages': max(e.page for e in self.elements) if self.elements else 0,
                'headings': sum(1 for e in self.elements if e.type == 'heading'),
                'tables': sum(1 for e in self.elements if e.type == 'table'),
                'paragraphs': sum(1 for e in self.elements if e.type == 'paragraph'),
            }
        finally:
            if temp_file and os.path.exists(temp_file):
                try:
                    os.unlink(temp_file)
                except OSError:
                    # Best-effort cleanup; never mask the original error.
                    pass

    def _download_pdf(self, url: str) -> str:
        """Download *url* to a temporary ``.pdf`` file and return its path.

        Raises ``requests.HTTPError`` on a non-2xx response. The caller owns
        (and must delete) the returned file.
        """
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }
        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()
        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.pdf')
        temp_file.write(response.content)
        temp_file.close()
        return temp_file.name

    def _get_element_id(self, element_type: str) -> str:
        """Return a unique id such as ``heading-3`` (counter is document-global)."""
        self.element_counter += 1
        return f"{element_type}-{self.element_counter}"

    def _extract_content(self, pdf_path: str) -> "List[PDFElement]":
        """Extract structured content from the PDF at *pdf_path*.

        Prefers PyMuPDF (font sizes allow better heading detection) and falls
        back to a pdfplumber-only pass when PyMuPDF is unavailable or fails.
        Tables are always extracted with pdfplumber.
        """
        elements = []
        if PYMUPDF_AVAILABLE:
            try:
                doc = fitz.open(pdf_path)
                for page_num, page in enumerate(doc, 1):
                    blocks = page.get_text("dict")
                    for block in blocks["blocks"]:
                        if block["type"] != 0:  # 0 == text block in PyMuPDF's dict output
                            continue
                        for line in block["lines"]:
                            for span in line["spans"]:
                                text = span["text"].strip()
                                if not text:
                                    continue
                                font_size = span["size"]
                                # Heuristic classification: large font => heading,
                                # list-marker prefix => list item, otherwise paragraph.
                                if font_size > 14:
                                    element_type = "heading"
                                    level = 1 if font_size > 18 else 2
                                elif self._LIST_MARKER.match(text):
                                    element_type = "list"
                                    level = None
                                else:
                                    element_type = "paragraph"
                                    level = None
                                elements.append(PDFElement(
                                    type=element_type,
                                    content=text,
                                    page=page_num,
                                    level=level,
                                ))
                doc.close()
                self._append_tables(pdf_path, elements)
                return elements
            except Exception as e:
                print(f"PyMuPDF failed: {e}, falling back to pdfplumber")
                # Discard any partial PyMuPDF results so the fallback pass
                # below does not duplicate them.
                elements = []
        # Fallback: pdfplumber-only extraction. No font information here, so
        # headings are guessed from short ALL-CAPS lines.
        with pdfplumber.open(pdf_path) as pdf:
            for page_num, page in enumerate(pdf.pages, 1):
                text = page.extract_text() or ""
                for line in text.split('\n'):
                    line = line.strip()
                    if not line:
                        continue
                    if line.isupper() and len(line) < 100:
                        element_type = "heading"
                        level = 1
                    elif self._LIST_MARKER.match(line):
                        element_type = "list"
                        level = None
                    else:
                        element_type = "paragraph"
                        level = None
                    elements.append(PDFElement(
                        type=element_type,
                        content=line,
                        page=page_num,
                        level=level,
                    ))
                for table in page.extract_tables():
                    if table:
                        elements.append(PDFElement(
                            type="table",
                            content=table,
                            page=page_num,
                        ))
        return elements

    def _append_tables(self, pdf_path: str, elements: list) -> None:
        """Append one 'table' element per table pdfplumber finds in the PDF."""
        with pdfplumber.open(pdf_path) as pdf:
            for page_num, page in enumerate(pdf.pages, 1):
                for table in page.extract_tables():
                    if table:
                        elements.append(PDFElement(
                            type="table",
                            content=table,
                            page=page_num,
                        ))

    def _convert_to_html(self) -> str:
        """Convert ``self.elements`` to HTML with a unique id per element.

        Consecutive 'list' elements are grouped into a single <ul>; a page
        marker <div> is emitted whenever the page number changes. All text is
        HTML-escaped.
        """
        html_parts = ['<div class="pdf-content">']
        current_page = 0
        in_list = False
        for elem in self.elements:
            # Emit a page marker whenever the page changes.
            if elem.page != current_page:
                if in_list:
                    html_parts.append('</ul>')
                    in_list = False
                current_page = elem.page
                html_parts.append(
                    f'<div class="page-marker" id="page-{current_page}">'
                    f'📄 Page {current_page}</div>'
                )
            if elem.type == "heading":
                if in_list:
                    html_parts.append('</ul>')
                    in_list = False
                level = elem.level or 2
                elem_id = self._get_element_id('heading')
                content = html.escape(elem.content)
                html_parts.append(f'<h{level} id="{elem_id}">{content}</h{level}>')
            elif elem.type == "paragraph":
                if in_list:
                    html_parts.append('</ul>')
                    in_list = False
                elem_id = self._get_element_id('paragraph')
                content = html.escape(elem.content)
                html_parts.append(f'<p id="{elem_id}">{content}</p>')
            elif elem.type == "list":
                if not in_list:
                    html_parts.append('<ul>')
                    in_list = True
                elem_id = self._get_element_id('list-item')
                content = html.escape(elem.content)
                html_parts.append(f'<li id="{elem_id}">{content}</li>')
            elif elem.type == "table":
                if in_list:
                    html_parts.append('</ul>')
                    in_list = False
                elem_id = self._get_element_id('table')
                html_parts.append(f'<table id="{elem_id}">')
                for i, row in enumerate(elem.content):
                    row_id = self._get_element_id('table-row')
                    html_parts.append(f'<tr id="{row_id}">')
                    tag = 'th' if i == 0 else 'td'  # first row is the header
                    for cell in row:
                        cell_id = self._get_element_id('table-cell')
                        cell_content = html.escape(str(cell)) if cell else ""
                        html_parts.append(f'<{tag} id="{cell_id}">{cell_content}</{tag}>')
                    html_parts.append('</tr>')
                html_parts.append('</table>')
        if in_list:
            html_parts.append('</ul>')
        html_parts.append('</div>')
        return '\n'.join(html_parts)
# Global processor holding the most recent result; process_pdf_url replaces it
# with a fresh instance on every request so prior state never leaks through.
processor = PDFProcessor()
def process_pdf_url(pdf_url):
    """Gradio handler: fetch and convert the PDF at *pdf_url*.

    Returns a 3-tuple (summary markdown, HTML for the preview pane, HTML for
    the source pane). On any failure the summary carries the error message
    and both HTML outputs are empty strings.
    """
    global processor
    url = (pdf_url or "").strip()
    if not url:
        return "❌ Please enter a PDF URL", "", ""
    try:
        # Fresh processor per request so earlier results never leak through.
        processor = PDFProcessor()
        stats = processor.process_pdf(url)
        summary_text = f"""### ✅ PDF Processed Successfully!
**📊 Summary:**
- **Total Elements:** {stats['total_elements']}
- **Pages:** {stats['pages']}
- **Headings:** {stats['headings']}
- **Tables:** {stats['tables']}
- **Paragraphs:** {stats['paragraphs']}
"""
        return summary_text, processor.html_content, processor.html_content
    except Exception as e:
        return f"❌ Error processing PDF: {str(e)}", "", ""
def create_download_file(html_content):
    """Wrap *html_content* in a complete standalone HTML document and write
    it to a temporary ``.html`` file.

    Returns the temp-file path (consumed by Gradio's File component), or
    ``None`` when there is nothing to download. The file is not deleted here;
    Gradio serves it to the user.
    """
    if not html_content:
        return None
    # Create full HTML document so the download opens correctly in a browser.
    full_html = f"""<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title>Extracted PDF Content</title>
</head>
<body>
{html_content}
</body>
</html>
"""
    temp_file = tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.html', encoding='utf-8')
    temp_file.write(full_html)
    temp_file.close()
    return temp_file.name
# Create Gradio interface: URL input + process button on top, tabbed
# preview/source panes below, and a download button for the generated HTML.
with gr.Blocks(title="PDF to HTML Converter") as demo:
    gr.Markdown(
        """
# 📄 PDF to HTML Converter
Extract PDF content and view as beautifully structured HTML with unique IDs for each element.
Simply paste a PDF URL and click **Process PDF** to get started!
"""
    )
    with gr.Row():
        with gr.Column(scale=4):
            pdf_url_input = gr.Textbox(
                label="PDF URL",
                placeholder="https://example.com/document.pdf"
            )
        with gr.Column(scale=1):
            process_btn = gr.Button("🚀 Process PDF", variant="primary")
    summary_output = gr.Markdown(label="Summary")
    gr.Markdown("---")
    with gr.Tabs():
        with gr.Tab("📋 HTML Preview"):
            html_preview = gr.HTML(label="Rendered HTML")
        with gr.Tab("💻 HTML Source"):
            html_source = gr.Code(
                label="HTML Source Code",
                language="html"
            )
    download_btn = gr.Button("📥 Download HTML")
    # NOTE(review): this File component is created hidden and nothing in the
    # visible code ever sets visible=True, yet download_btn writes into it —
    # confirm the intended download UX.
    download_file = gr.File(label="Download", visible=False)
    # Event handlers: button click and textbox Enter run the same conversion
    # and feed the same three outputs.
    process_btn.click(
        fn=process_pdf_url,
        inputs=[pdf_url_input],
        outputs=[summary_output, html_preview, html_source]
    )
    # Allow Enter key to process
    pdf_url_input.submit(
        fn=process_pdf_url,
        inputs=[pdf_url_input],
        outputs=[summary_output, html_preview, html_source]
    )
    # Builds a standalone .html file from the contents of the source pane.
    download_btn.click(
        fn=create_download_file,
        inputs=[html_source],
        outputs=[download_file]
    )
    gr.Markdown(
        """
---
### 📌 Features:
- ✨ Extracts text, tables, headings from PDFs
- 🎯 Each HTML element has a unique ID
- 📊 Beautiful table styling
- 🔖 Page markers for easy navigation
- 💾 Download extracted HTML
### 💡 Example PDFs to try:
- Research papers from arXiv
- Product documentation
- Financial reports
- Any publicly accessible PDF!
"""
    )

# Launch the app only when run as a script (not when imported).
if __name__ == "__main__":
    demo.launch()