|
|
# Standard library
import html
import os
import re
import tempfile
from typing import Any, Dict, List, Optional

# Third-party
import gradio as gr
import pdfplumber
import requests
|
|
|
|
|
try: |
|
|
import fitz |
|
|
PYMUPDF_AVAILABLE = True |
|
|
except ImportError: |
|
|
PYMUPDF_AVAILABLE = False |
|
|
print("PyMuPDF not available, using pdfplumber only") |
|
|
|
|
|
from dataclasses import dataclass |
|
|
|
|
|
|
|
|
@dataclass
class PDFElement:
    """A single structural element extracted from a PDF page.

    Produced by PDFProcessor._extract_content and consumed by
    PDFProcessor._convert_to_html.
    """
    # Element kind: "heading", "paragraph", "list" or "table".
    type: str
    # Text for textual elements; list of rows (lists of cells) for tables.
    content: Any
    # 1-based page number the element was found on.
    page: int
    # Bounding box (x0, y0, x1, y1) if known; not populated by the
    # current extractors. (Was annotated `tuple = None`, which is an
    # invalid implicit-Optional annotation — None is not a tuple.)
    bbox: Optional[tuple] = None
    # Font/style metadata if known; not populated by the current extractors.
    style: Optional[Dict] = None
    # Heading level (1 or 2) for headings; None for other element types.
    level: Optional[int] = None
|
|
|
|
|
|
|
|
class PDFProcessor: |
|
|
"""Simplified PDF processor""" |
|
|
|
|
|
def __init__(self): |
|
|
self.elements = [] |
|
|
self.html_content = "" |
|
|
self.element_counter = 0 |
|
|
|
|
|
def process_pdf(self, pdf_url: str) -> Dict: |
|
|
"""Process PDF from URL""" |
|
|
temp_file = None |
|
|
|
|
|
try: |
|
|
temp_file = self._download_pdf(pdf_url) |
|
|
|
|
|
|
|
|
self.elements = self._extract_content(temp_file) |
|
|
self.html_content = self._convert_to_html() |
|
|
|
|
|
|
|
|
summary = { |
|
|
'total_elements': len(self.elements), |
|
|
'pages': max([e.page for e in self.elements]) if self.elements else 0, |
|
|
'headings': len([e for e in self.elements if e.type == 'heading']), |
|
|
'tables': len([e for e in self.elements if e.type == 'table']), |
|
|
'paragraphs': len([e for e in self.elements if e.type == 'paragraph']) |
|
|
} |
|
|
|
|
|
return summary |
|
|
|
|
|
finally: |
|
|
if temp_file and os.path.exists(temp_file): |
|
|
try: |
|
|
os.unlink(temp_file) |
|
|
except: |
|
|
pass |
|
|
|
|
|
def _download_pdf(self, url: str) -> str: |
|
|
"""Download PDF from URL""" |
|
|
headers = { |
|
|
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36' |
|
|
} |
|
|
|
|
|
response = requests.get(url, headers=headers, timeout=30) |
|
|
response.raise_for_status() |
|
|
|
|
|
temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') |
|
|
temp_file.write(response.content) |
|
|
temp_file.close() |
|
|
|
|
|
return temp_file.name |
|
|
|
|
|
def _get_element_id(self, element_type: str) -> str: |
|
|
"""Generate unique ID for element""" |
|
|
self.element_counter += 1 |
|
|
return f"{element_type}-{self.element_counter}" |
|
|
|
|
|
    def _extract_content(self, pdf_path: str) -> List[PDFElement]:
        """Extract structured content from PDF.

        Prefers PyMuPDF (span-level font sizes enable heading detection);
        if PyMuPDF is missing or raises, falls back to pdfplumber line
        heuristics. Tables are always extracted with pdfplumber.
        """
        elements = []

        if PYMUPDF_AVAILABLE:
            try:
                doc = fitz.open(pdf_path)

                # Pages are numbered from 1 throughout this module.
                for page_num, page in enumerate(doc, 1):
                    blocks = page.get_text("dict")

                    for block in blocks["blocks"]:
                        if block["type"] == 0:  # type 0 = text block (1 = image)
                            for line in block["lines"]:
                                for span in line["spans"]:
                                    text = span["text"].strip()
                                    if not text:
                                        continue

                                    font_size = span["size"]

                                    # Heuristic: large fonts are headings.
                                    # 14pt/18pt thresholds are empirical
                                    # guesses for typical documents.
                                    if font_size > 14:
                                        element_type = "heading"
                                        level = 1 if font_size > 18 else 2
                                    # NOTE(review): the character class below
                                    # contains what looks like a mis-encoded
                                    # bullet character — confirm it matches
                                    # the intended bullet glyph.
                                    elif re.match(r'^[\d\-\β’\*]+\.?\s+', text):
                                        element_type = "list"
                                        level = None
                                    else:
                                        element_type = "paragraph"
                                        level = None

                                    elements.append(PDFElement(
                                        type=element_type,
                                        content=text,
                                        page=page_num,
                                        level=level
                                    ))

                doc.close()

                # Tables: pdfplumber's table detection is used even on the
                # PyMuPDF path, since fitz text blocks don't expose tables.
                with pdfplumber.open(pdf_path) as pdf:
                    for page_num, page in enumerate(pdf.pages, 1):
                        tables = page.extract_tables()
                        for table in tables:
                            if table:
                                elements.append(PDFElement(
                                    type="table",
                                    content=table,
                                    page=page_num
                                ))

                return elements

            except Exception as e:
                # Any PyMuPDF failure falls through to the
                # pdfplumber-only path below.
                print(f"PyMuPDF failed: {e}, falling back to pdfplumber")

        # pdfplumber-only path: classify each extracted text line with
        # simple string heuristics (no font information available).
        with pdfplumber.open(pdf_path) as pdf:
            for page_num, page in enumerate(pdf.pages, 1):
                text = page.extract_text() or ""
                lines = text.split('\n')

                for line in lines:
                    line = line.strip()
                    if not line:
                        continue

                    # Short ALL-CAPS lines are treated as headings.
                    if line.isupper() and len(line) < 100:
                        element_type = "heading"
                        level = 1
                    # Same list-marker pattern as the PyMuPDF path above.
                    elif re.match(r'^[\d\-\β’\*]+\.?\s+', line):
                        element_type = "list"
                        level = None
                    else:
                        element_type = "paragraph"
                        level = None

                    elements.append(PDFElement(
                        type=element_type,
                        content=line,
                        page=page_num,
                        level=level
                    ))

                tables = page.extract_tables()
                for table in tables:
                    if table:
                        elements.append(PDFElement(
                            type="table",
                            content=table,
                            page=page_num
                        ))

        return elements
|
|
|
|
|
    def _convert_to_html(self) -> str:
        """Convert elements to HTML with IDs and styling.

        Walks self.elements in document order, emitting a page marker when
        the page number changes and wrapping consecutive "list" elements
        in a single <ul>. Every emitted tag gets a unique id from
        _get_element_id so callers can deep-link into the output.
        """
        # Leading chunk: embedded stylesheet plus the opening wrapper div.
        html_parts = ['''
        <style>
        .pdf-content {
            font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Oxygen, Ubuntu, Cantarell, sans-serif;
            line-height: 1.8;
            color: #333;
            max-width: 100%;
            padding: 20px;
        }
        .pdf-content h1,
        .pdf-content h2,
        .pdf-content h3 {
            color: #2c3e50;
            margin: 25px 0 15px 0;
            font-weight: 600;
        }
        .pdf-content h1 { font-size: 2em; border-bottom: 3px solid #667eea; padding-bottom: 10px; }
        .pdf-content h2 { font-size: 1.6em; border-bottom: 2px solid #e0e0e0; padding-bottom: 8px; }
        .pdf-content h3 { font-size: 1.3em; }
        .pdf-content table {
            border-collapse: collapse;
            width: 100%;
            margin: 20px 0;
            box-shadow: 0 2px 8px rgba(0,0,0,0.1);
            border-radius: 8px;
            overflow: hidden;
        }
        .pdf-content th,
        .pdf-content td {
            border: 1px solid #e0e0e0;
            padding: 12px 15px;
            text-align: left;
        }
        .pdf-content th {
            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
            color: white;
            font-weight: 600;
            text-transform: uppercase;
            font-size: 0.9em;
            letter-spacing: 0.5px;
        }
        .pdf-content tr:nth-child(even) {
            background-color: #f8f9fa;
        }
        .pdf-content tr:hover {
            background-color: #e3f2fd;
            transition: background-color 0.2s;
        }
        .pdf-content p {
            margin: 12px 0;
            text-align: justify;
        }
        .pdf-content li {
            margin: 8px 0;
            margin-left: 25px;
        }
        .pdf-content .page-marker {
            color: #666;
            font-size: 0.95em;
            font-weight: 600;
            margin: 40px 0 20px 0;
            padding: 12px 20px;
            background: linear-gradient(135deg, #f5f7fa 0%, #c3cfe2 100%);
            border-left: 5px solid #667eea;
            border-radius: 4px;
            box-shadow: 0 2px 4px rgba(0,0,0,0.1);
        }
        .pdf-content ul, .pdf-content ol {
            margin: 15px 0;
            padding-left: 30px;
        }
        </style>
        <div class="pdf-content">
        ''']

        # Track page transitions and whether a <ul> is currently open.
        current_page = 0
        in_list = False

        for elem in self.elements:

            # New page: close any open list, then emit a page marker.
            if elem.page != current_page:
                if in_list:
                    html_parts.append('</ul>')
                    in_list = False
                current_page = elem.page
                html_parts.append(f'<div class="page-marker" id="page-{current_page}">π Page {current_page}</div>')

            if elem.type == "heading":
                if in_list:
                    html_parts.append('</ul>')
                    in_list = False
                # Default to <h2> when the extractor didn't assign a level.
                level = elem.level or 2
                elem_id = self._get_element_id('heading')
                content = html.escape(elem.content)
                html_parts.append(f'<h{level} id="{elem_id}" data-page="{elem.page}">{content}</h{level}>')

            elif elem.type == "paragraph":
                if in_list:
                    html_parts.append('</ul>')
                    in_list = False
                elem_id = self._get_element_id('paragraph')
                content = html.escape(elem.content)
                html_parts.append(f'<p id="{elem_id}" data-page="{elem.page}">{content}</p>')

            elif elem.type == "list":
                # Open a <ul> lazily; consecutive list items share it.
                if not in_list:
                    html_parts.append('<ul>')
                    in_list = True
                elem_id = self._get_element_id('list-item')
                content = html.escape(elem.content)
                html_parts.append(f'<li id="{elem_id}" data-page="{elem.page}">{content}</li>')

            elif elem.type == "table":
                if in_list:
                    html_parts.append('</ul>')
                    in_list = False
                elem_id = self._get_element_id('table')
                html_parts.append(f'<table id="{elem_id}" data-page="{elem.page}">')
                for i, row in enumerate(elem.content):
                    row_id = self._get_element_id('table-row')
                    html_parts.append(f'<tr id="{row_id}">')
                    # First row of every table is rendered as a header row.
                    tag = 'th' if i == 0 else 'td'
                    for j, cell in enumerate(row):
                        cell_id = self._get_element_id('table-cell')
                        # Cells may be None; escape everything else.
                        cell_content = html.escape(str(cell)) if cell else ""
                        html_parts.append(f'<{tag} id="{cell_id}">{cell_content}</{tag}>')
                    html_parts.append('</tr>')
                html_parts.append('</table>')

        # Close any list still open after the final element.
        if in_list:
            html_parts.append('</ul>')

        html_parts.append('</div>')
        return '\n'.join(html_parts)
|
|
|
|
|
|
|
|
|
|
|
# Module-level processor instance; process_pdf_url rebinds it (via its
# `global` statement) with a fresh instance on every processing request.
processor = PDFProcessor()
|
|
|
|
|
|
|
|
def process_pdf_url(pdf_url):
    """Gradio callback: process a PDF URL.

    Returns a (summary_markdown, preview_html, source_html) triple; on
    any failure the two HTML slots are empty strings.
    """
    global processor

    # Normalize the textbox value; None and whitespace both count as empty.
    url = (pdf_url or "").strip()
    if not url:
        return "β Please enter a PDF URL", "", ""

    try:
        # Fresh processor per request so no state leaks between runs.
        processor = PDFProcessor()
        summary = processor.process_pdf(url)
    except Exception as e:
        return f"β Error processing PDF: {str(e)}", "", ""

    summary_text = f"""### β
PDF Processed Successfully!

**π Summary:**
- **Total Elements:** {summary['total_elements']}
- **Pages:** {summary['pages']}
- **Headings:** {summary['headings']}
- **Tables:** {summary['tables']}
- **Paragraphs:** {summary['paragraphs']}
"""

    # Same HTML feeds both the rendered preview and the source viewer.
    return summary_text, processor.html_content, processor.html_content
|
|
|
|
|
|
|
|
def create_download_file(html_content):
    """Wrap *html_content* in a standalone HTML document and save it.

    Returns the path of a temporary .html file suitable for download, or
    None when there is nothing to export. The caller owns (and should
    eventually delete) the returned file.
    """
    if not html_content:
        return None

    full_html = f"""<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Extracted PDF Content</title>
</head>
<body>
{html_content}
</body>
</html>"""

    # `with` guarantees the handle is closed even if the write fails; the
    # previous version leaked an open file descriptor in that case.
    # delete=False: the file must outlive this function so Gradio can
    # serve it for download.
    with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.html',
                                     encoding='utf-8') as temp_file:
        temp_file.write(full_html)

    return temp_file.name
|
|
|
|
|
|
|
|
|
|
|
# --- Gradio UI -------------------------------------------------------------
with gr.Blocks(title="PDF to HTML Converter") as demo:

    gr.Markdown(
        """
        # π PDF to HTML Converter

        Extract PDF content and view as beautifully structured HTML with unique IDs for each element.

        Simply paste a PDF URL and click **Process PDF** to get started!
        """
    )

    # URL input and the primary action button, side by side.
    with gr.Row():
        with gr.Column(scale=4):
            pdf_url_input = gr.Textbox(
                label="PDF URL",
                placeholder="https://example.com/document.pdf"
            )
        with gr.Column(scale=1):
            process_btn = gr.Button("π Process PDF", variant="primary")

    summary_output = gr.Markdown(label="Summary")

    gr.Markdown("---")

    with gr.Tabs():
        with gr.Tab("π HTML Preview"):
            html_preview = gr.HTML(label="Rendered HTML")

        with gr.Tab("π» HTML Source"):
            html_source = gr.Code(
                label="HTML Source Code",
                language="html"
            )
            download_btn = gr.Button("π₯ Download HTML")
            # BUG FIX: was gr.File(label="Download", visible=False). The
            # component was never toggled visible, so the file produced by
            # create_download_file could never be seen or clicked. Default
            # visibility makes the download flow actually usable.
            download_file = gr.File(label="Download")

    # Button click and Enter-in-textbox both trigger the same processing.
    process_btn.click(
        fn=process_pdf_url,
        inputs=[pdf_url_input],
        outputs=[summary_output, html_preview, html_source]
    )

    pdf_url_input.submit(
        fn=process_pdf_url,
        inputs=[pdf_url_input],
        outputs=[summary_output, html_preview, html_source]
    )

    # Wraps the current HTML source in a full document and offers it as a file.
    download_btn.click(
        fn=create_download_file,
        inputs=[html_source],
        outputs=[download_file]
    )

    gr.Markdown(
        """
        ---
        ### π Features:
        - β¨ Extracts text, tables, headings from PDFs
        - π― Each HTML element has a unique ID
        - π Beautiful table styling
        - π Page markers for easy navigation
        - πΎ Download extracted HTML

        ### π‘ Example PDFs to try:
        - Research papers from arXiv
        - Product documentation
        - Financial reports
        - Any publicly accessible PDF!
        """
    )
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Launch the Gradio server only when run as a script (not on import).
    demo.launch()