"""Gradio app: download a PDF from a URL, extract its structure, render as HTML.

Each extracted element (heading, paragraph, list item, table/row/cell) gets a
unique, stable HTML id so downstream tooling can reference individual pieces.
"""

import html
import os
import re
import tempfile
from dataclasses import dataclass
from typing import Any, Dict, List

import gradio as gr
import pdfplumber
import requests

try:
    import fitz  # PyMuPDF — preferred extractor (font sizes -> heading levels)
    PYMUPDF_AVAILABLE = True
except ImportError:
    PYMUPDF_AVAILABLE = False
    print("PyMuPDF not available, using pdfplumber only")


@dataclass
class PDFElement:
    """Represents a single element extracted from a PDF."""
    type: str            # 'heading' | 'paragraph' | 'list' | 'table'
    content: Any         # str for text elements; list of rows for tables
    page: int            # 1-based page number the element came from
    bbox: tuple = None   # reserved: bounding box, not populated yet
    style: Dict = None   # reserved: style info, not populated yet
    level: int = None    # heading level (1 or 2); None for non-headings


class PDFProcessor:
    """Extracts structured content from a PDF and converts it to HTML."""

    # Bullet / numbered-list markers at the start of a line, compiled once
    # instead of re-matching a literal pattern inside the extraction loops.
    _LIST_RE = re.compile(r'^[\d\-\•\*]+\.?\s+')

    def __init__(self):
        self.elements: List[PDFElement] = []
        self.html_content: str = ""
        self.element_counter: int = 0

    def process_pdf(self, pdf_url: str) -> Dict:
        """Download the PDF at *pdf_url*, extract content, build HTML.

        Returns a summary dict (element/page/heading/table/paragraph counts).
        The downloaded temp file is always removed, even on failure.
        Raises requests.HTTPError (bad status) or extraction errors upward.
        """
        temp_file = None
        try:
            temp_file = self._download_pdf(pdf_url)
            self.elements = self._extract_content(temp_file)
            self.html_content = self._convert_to_html()
            return {
                'total_elements': len(self.elements),
                # default=0 handles a PDF that yielded no elements at all
                'pages': max((e.page for e in self.elements), default=0),
                'headings': sum(1 for e in self.elements if e.type == 'heading'),
                'tables': sum(1 for e in self.elements if e.type == 'table'),
                'paragraphs': sum(1 for e in self.elements if e.type == 'paragraph'),
            }
        finally:
            if temp_file and os.path.exists(temp_file):
                try:
                    os.unlink(temp_file)
                except OSError:
                    pass  # best-effort cleanup; never mask the real error

    def _download_pdf(self, url: str) -> str:
        """Download *url* to a temp file and return its path (caller deletes)."""
        headers = {
            # Some servers refuse requests without a browser-like UA.
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }
        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()
        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.pdf')
        temp_file.write(response.content)
        temp_file.close()
        return temp_file.name

    def _get_element_id(self, element_type: str) -> str:
        """Return a unique id like 'heading-3' (counter shared across types)."""
        self.element_counter += 1
        return f"{element_type}-{self.element_counter}"

    def _extract_content(self, pdf_path: str) -> List[PDFElement]:
        """Extract structured elements, preferring PyMuPDF when available."""
        if PYMUPDF_AVAILABLE:
            try:
                return self._extract_with_pymupdf(pdf_path)
            except Exception as e:
                print(f"PyMuPDF failed: {e}, falling back to pdfplumber")
        return self._extract_with_pdfplumber(pdf_path)

    def _extract_with_pymupdf(self, pdf_path: str) -> List[PDFElement]:
        """Extract text via PyMuPDF spans (font size -> heading level), then tables."""
        elements: List[PDFElement] = []
        doc = fitz.open(pdf_path)
        try:
            for page_num, page in enumerate(doc, 1):
                blocks = page.get_text("dict")
                for block in blocks["blocks"]:
                    if block["type"] != 0:  # 0 = text block; skip images etc.
                        continue
                    for line in block["lines"]:
                        for span in line["spans"]:
                            text = span["text"].strip()
                            if not text:
                                continue
                            font_size = span["size"]
                            # Simple size-based classification.
                            if font_size > 14:
                                element_type = "heading"
                                level = 1 if font_size > 18 else 2
                            elif self._LIST_RE.match(text):
                                element_type = "list"
                                level = None
                            else:
                                element_type = "paragraph"
                                level = None
                            elements.append(PDFElement(
                                type=element_type,
                                content=text,
                                page=page_num,
                                level=level,
                            ))
        finally:
            doc.close()

        # PyMuPDF's text walk doesn't give tables; reuse pdfplumber for those.
        with pdfplumber.open(pdf_path) as pdf:
            for page_num, page in enumerate(pdf.pages, 1):
                self._append_tables(elements, page, page_num)
        return elements

    def _extract_with_pdfplumber(self, pdf_path: str) -> List[PDFElement]:
        """Fallback extractor: line-based heuristics plus table extraction."""
        elements: List[PDFElement] = []
        with pdfplumber.open(pdf_path) as pdf:
            for page_num, page in enumerate(pdf.pages, 1):
                text = page.extract_text() or ""
                for line in text.split('\n'):
                    line = line.strip()
                    if not line:
                        continue
                    # No font info here: ALL-CAPS short lines pass as headings.
                    if line.isupper() and len(line) < 100:
                        element_type, level = "heading", 1
                    elif self._LIST_RE.match(line):
                        element_type, level = "list", None
                    else:
                        element_type, level = "paragraph", None
                    elements.append(PDFElement(
                        type=element_type,
                        content=line,
                        page=page_num,
                        level=level,
                    ))
                self._append_tables(elements, page, page_num)
        return elements

    @staticmethod
    def _append_tables(elements: List[PDFElement], page, page_num: int) -> None:
        """Append every non-empty table on *page* to *elements*."""
        for table in page.extract_tables():
            if table:
                elements.append(PDFElement(
                    type="table",
                    content=table,
                    page=page_num,
                ))

    def _convert_to_html(self) -> str:
        """Render extracted elements as an HTML fragment with unique ids.

        Consecutive 'list' elements are grouped into one <ul>; a page-marker
        div is emitted whenever the page number changes.
        """
        parts = [
            '<div class="pdf-content" '
            'style="font-family: sans-serif; max-width: 900px; margin: 0 auto;">'
        ]
        current_page = 0
        in_list = False

        def close_list():
            # Close an open <ul> exactly once; safe to call unconditionally.
            nonlocal in_list
            if in_list:
                parts.append('</ul>')
                in_list = False

        for elem in self.elements:
            if elem.page != current_page:
                close_list()
                current_page = elem.page
                parts.append(
                    f'<div class="page-marker" style="background: #eef; '
                    f'padding: 6px 10px; margin: 14px 0; font-weight: bold;">'
                    f'📄 Page {current_page}</div>'
                )

            if elem.type == "heading":
                close_list()
                level = elem.level or 2
                elem_id = self._get_element_id('heading')
                content = html.escape(elem.content)
                parts.append(f'<h{level} id="{elem_id}">{content}</h{level}>')

            elif elem.type == "paragraph":
                close_list()
                elem_id = self._get_element_id('paragraph')
                content = html.escape(elem.content)
                parts.append(f'<p id="{elem_id}">{content}</p>')

            elif elem.type == "list":
                if not in_list:
                    parts.append('<ul>')
                    in_list = True  # FIX: was reset to False, breaking grouping
                elem_id = self._get_element_id('list-item')
                content = html.escape(elem.content)
                parts.append(f'<li id="{elem_id}">{content}</li>')

            elif elem.type == "table":
                close_list()
                elem_id = self._get_element_id('table')
                parts.append(
                    f'<table id="{elem_id}" border="1" '
                    f'style="border-collapse: collapse; margin: 10px 0;">'
                )
                for i, row in enumerate(elem.content):
                    row_id = self._get_element_id('table-row')
                    parts.append(f'<tr id="{row_id}">')
                    tag = 'th' if i == 0 else 'td'  # first row as header
                    for cell in row:
                        cell_id = self._get_element_id('table-cell')
                        cell_content = html.escape(str(cell)) if cell else ""
                        parts.append(f'<{tag} id="{cell_id}">{cell_content}</{tag}>')
                    parts.append('</tr>')
                parts.append('</table>')

        close_list()
        parts.append('</div>')
        return '\n'.join(parts)


# Global processor (rebuilt on every request; keeps last result for download).
processor = PDFProcessor()


def process_pdf_url(pdf_url):
    """Gradio handler: process *pdf_url*, return (summary_md, html, html_source)."""
    global processor
    if not pdf_url or not pdf_url.strip():
        return "❌ Please enter a PDF URL", "", ""
    try:
        processor = PDFProcessor()
        summary = processor.process_pdf(pdf_url.strip())
        summary_text = f"""### ✅ PDF Processed Successfully!

**📊 Summary:**
- **Total Elements:** {summary['total_elements']}
- **Pages:** {summary['pages']}
- **Headings:** {summary['headings']}
- **Tables:** {summary['tables']}
- **Paragraphs:** {summary['paragraphs']}
"""
        return summary_text, processor.html_content, processor.html_content
    except Exception as e:
        # Top-level UI boundary: surface the error instead of crashing Gradio.
        error_msg = f"❌ Error processing PDF: {str(e)}"
        return error_msg, "", ""


def create_download_file(html_content):
    """Wrap the HTML fragment in a full document and return a temp-file path."""
    if not html_content:
        return None
    full_html = f"""<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title>Extracted PDF Content</title>
</head>
<body>
{html_content}
</body>
</html>"""
    temp_file = tempfile.NamedTemporaryFile(
        mode='w', delete=False, suffix='.html', encoding='utf-8'
    )
    temp_file.write(full_html)
    temp_file.close()
    return temp_file.name


# ---------------------------------------------------------------------------
# Gradio interface
# ---------------------------------------------------------------------------
with gr.Blocks(title="PDF to HTML Converter") as demo:
    gr.Markdown(
        """
# 📄 PDF to HTML Converter

Extract PDF content and view as beautifully structured HTML with unique IDs
for each element. Simply paste a PDF URL and click **Process PDF** to get
started!
"""
    )

    with gr.Row():
        with gr.Column(scale=4):
            pdf_url_input = gr.Textbox(
                label="PDF URL",
                placeholder="https://example.com/document.pdf",
            )
        with gr.Column(scale=1):
            process_btn = gr.Button("🚀 Process PDF", variant="primary")

    summary_output = gr.Markdown(label="Summary")
    gr.Markdown("---")

    with gr.Tabs():
        with gr.Tab("📋 HTML Preview"):
            html_preview = gr.HTML(label="Rendered HTML")
        with gr.Tab("💻 HTML Source"):
            html_source = gr.Code(
                label="HTML Source Code",
                language="html",
            )

    download_btn = gr.Button("📥 Download HTML")
    # FIX: component must be visible, otherwise the returned file path is
    # assigned to a hidden widget and the user can never download it.
    download_file = gr.File(label="Download")

    # Event handlers
    process_btn.click(
        fn=process_pdf_url,
        inputs=[pdf_url_input],
        outputs=[summary_output, html_preview, html_source],
    )
    # Allow Enter key to process
    pdf_url_input.submit(
        fn=process_pdf_url,
        inputs=[pdf_url_input],
        outputs=[summary_output, html_preview, html_source],
    )
    download_btn.click(
        fn=create_download_file,
        inputs=[html_source],
        outputs=[download_file],
    )

    gr.Markdown(
        """
---
### 📌 Features:
- ✨ Extracts text, tables, headings from PDFs
- 🎯 Each HTML element has a unique ID
- 📊 Beautiful table styling
- 🔖 Page markers for easy navigation
- 💾 Download extracted HTML

### 💡 Example PDFs to try:
- Research papers from arXiv
- Product documentation
- Financial reports
- Any publicly accessible PDF!
"""
    )

# Launch
if __name__ == "__main__":
    demo.launch()