# Spaces: ShayanRl / pdf_F (status: Sleeping)
# File size: 15,221 Bytes

import gradio as gr
import pdfplumber
import re
import requests
import tempfile
import os
from typing import List, Dict, Any
import html

try:
    import fitz  # PyMuPDF
    PYMUPDF_AVAILABLE = True
except ImportError:
    PYMUPDF_AVAILABLE = False
    print("PyMuPDF not available, using pdfplumber only")

from dataclasses import dataclass


@dataclass
class PDFElement:
    """One structural unit extracted from a PDF page.

    Instances are created by ``PDFProcessor._extract_content`` and consumed by
    ``PDFProcessor._convert_to_html``.
    """
    # Kind of element; this file produces "heading", "list", "paragraph"
    # and "table".
    type: str
    # Stripped text for text elements; for tables, the rows returned by
    # pdfplumber's ``extract_tables()``.
    content: Any
    # 1-based page number (pages are enumerated starting at 1).
    page: int
    # Bounding box; never populated by the code in this file.
    bbox: tuple = None
    # Style information; never populated by the code in this file.
    style: Dict = None
    # Heading level (1 or 2) for headings, ``None`` for everything else.
    level: int = None


class PDFProcessor:
    """Download a PDF, extract its structure and render it as HTML.

    Attributes:
        elements: ``PDFElement`` objects extracted by the last
            ``process_pdf`` call.
        html_content: HTML rendering of ``elements``.
        element_counter: Running counter behind the generated element ids.
    """

    # Text that starts with digits, dashes, bullets or asterisks (optionally
    # followed by a dot) and then whitespace is classified as a list item.
    _LIST_ITEM_RE = re.compile(r'^[\d\-\•\*]+\.?\s+')

    # Stylesheet and opening wrapper emitted before the rendered elements.
    _HTML_PREAMBLE = '''
<style>
    .pdf-content {
        font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Oxygen, Ubuntu, Cantarell, sans-serif;
        line-height: 1.8;
        color: #333;
        max-width: 100%;
        padding: 20px;
    }
    .pdf-content h1, 
    .pdf-content h2, 
    .pdf-content h3 {
        color: #2c3e50;
        margin: 25px 0 15px 0;
        font-weight: 600;
    }
    .pdf-content h1 { font-size: 2em; border-bottom: 3px solid #667eea; padding-bottom: 10px; }
    .pdf-content h2 { font-size: 1.6em; border-bottom: 2px solid #e0e0e0; padding-bottom: 8px; }
    .pdf-content h3 { font-size: 1.3em; }
    .pdf-content table {
        border-collapse: collapse;
        width: 100%;
        margin: 20px 0;
        box-shadow: 0 2px 8px rgba(0,0,0,0.1);
        border-radius: 8px;
        overflow: hidden;
    }
    .pdf-content th, 
    .pdf-content td {
        border: 1px solid #e0e0e0;
        padding: 12px 15px;
        text-align: left;
    }
    .pdf-content th {
        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
        color: white;
        font-weight: 600;
        text-transform: uppercase;
        font-size: 0.9em;
        letter-spacing: 0.5px;
    }
    .pdf-content tr:nth-child(even) {
        background-color: #f8f9fa;
    }
    .pdf-content tr:hover {
        background-color: #e3f2fd;
        transition: background-color 0.2s;
    }
    .pdf-content p {
        margin: 12px 0;
        text-align: justify;
    }
    .pdf-content li {
        margin: 8px 0;
        margin-left: 25px;
    }
    .pdf-content .page-marker {
        color: #666;
        font-size: 0.95em;
        font-weight: 600;
        margin: 40px 0 20px 0;
        padding: 12px 20px;
        background: linear-gradient(135deg, #f5f7fa 0%, #c3cfe2 100%);
        border-left: 5px solid #667eea;
        border-radius: 4px;
        box-shadow: 0 2px 4px rgba(0,0,0,0.1);
    }
    .pdf-content ul, .pdf-content ol {
        margin: 15px 0;
        padding-left: 30px;
    }
</style>
<div class="pdf-content">
'''

    def __init__(self):
        self.elements = []
        self.html_content = ""
        self.element_counter = 0

    def process_pdf(self, pdf_url: str) -> Dict:
        """Download, extract and render the PDF at ``pdf_url``.

        Populates ``self.elements`` and ``self.html_content`` and returns a
        summary dict with the keys ``total_elements``, ``pages`` (highest page
        number that produced an element, 0 when nothing was extracted),
        ``headings``, ``tables`` and ``paragraphs``.

        Download and extraction errors propagate to the caller; the temporary
        PDF file is removed in either case.
        """
        temp_file = None

        try:
            temp_file = self._download_pdf(pdf_url)

            self.elements = self._extract_content(temp_file)
            self.html_content = self._convert_to_html()

            def count(kind: str) -> int:
                return sum(1 for e in self.elements if e.type == kind)

            return {
                'total_elements': len(self.elements),
                'pages': max((e.page for e in self.elements), default=0),
                'headings': count('heading'),
                'tables': count('table'),
                'paragraphs': count('paragraph'),
            }

        finally:
            if temp_file:
                self._remove_quietly(temp_file)

    @staticmethod
    def _remove_quietly(path: str) -> None:
        """Delete ``path``, ignoring filesystem errors (best-effort cleanup)."""
        try:
            os.unlink(path)
        except OSError:
            # Covers a missing file as well as permission problems; cleanup
            # must never mask the real outcome of the caller.
            pass

    def _download_pdf(self, url: str) -> str:
        """Download ``url`` into a temporary ``.pdf`` file and return its path.

        The caller owns the returned file and is responsible for deleting it.
        Raises whatever ``requests`` raises, including the HTTP error from
        ``raise_for_status()`` on a non-success response.
        """
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }

        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()

        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.pdf')
        try:
            # The context manager closes the handle even if the write fails.
            with temp_file:
                temp_file.write(response.content)
        except Exception:
            # Do not leave a partially written file behind.
            self._remove_quietly(temp_file.name)
            raise

        return temp_file.name

    def _get_element_id(self, element_type: str) -> str:
        """Return the next unique id, formatted as ``<element_type>-<n>``."""
        self.element_counter += 1
        return f"{element_type}-{self.element_counter}"

    def _extract_content(self, pdf_path: str) -> List["PDFElement"]:
        """Extract structured content from the PDF at ``pdf_path``.

        Uses PyMuPDF for text (plus pdfplumber for tables) when PyMuPDF is
        available; on any failure of that path, or when PyMuPDF is missing,
        the whole document is extracted with pdfplumber alone.
        """
        if PYMUPDF_AVAILABLE:
            try:
                # The helper builds its own list, so a failure part-way
                # through cannot leak partial results into the fallback.
                return self._extract_with_pymupdf(pdf_path)
            except Exception as e:
                print(f"PyMuPDF failed: {e}, falling back to pdfplumber")

        return self._extract_with_pdfplumber(pdf_path)

    def _extract_with_pymupdf(self, pdf_path: str) -> List["PDFElement"]:
        """Extract text spans with PyMuPDF, then tables with pdfplumber."""
        elements = []

        doc = fitz.open(pdf_path)
        try:
            for page_num, page in enumerate(doc, 1):
                for block in page.get_text("dict")["blocks"]:
                    if block["type"] != 0:  # only text blocks carry lines
                        continue
                    for line in block["lines"]:
                        for span in line["spans"]:
                            text = span["text"].strip()
                            if not text:
                                continue

                            # Simple classification by font size, then by
                            # list-marker prefix.
                            font_size = span["size"]
                            if font_size > 14:
                                element_type = "heading"
                                level = 1 if font_size > 18 else 2
                            elif self._LIST_ITEM_RE.match(text):
                                element_type = "list"
                                level = None
                            else:
                                element_type = "paragraph"
                                level = None

                            elements.append(PDFElement(
                                type=element_type,
                                content=text,
                                page=page_num,
                                level=level
                            ))
        finally:
            # Release the document even when extraction raises.
            doc.close()

        with pdfplumber.open(pdf_path) as pdf:
            for page_num, page in enumerate(pdf.pages, 1):
                elements.extend(self._extract_tables(page, page_num))

        return elements

    def _extract_with_pdfplumber(self, pdf_path: str) -> List["PDFElement"]:
        """Extract text lines and tables page by page with pdfplumber only."""
        elements = []

        with pdfplumber.open(pdf_path) as pdf:
            for page_num, page in enumerate(pdf.pages, 1):
                text = page.extract_text() or ""

                for raw_line in text.split('\n'):
                    line = raw_line.strip()
                    if not line:
                        continue

                    # No font information here: short all-caps lines are
                    # treated as top-level headings.
                    if line.isupper() and len(line) < 100:
                        element_type = "heading"
                        level = 1
                    elif self._LIST_ITEM_RE.match(line):
                        element_type = "list"
                        level = None
                    else:
                        element_type = "paragraph"
                        level = None

                    elements.append(PDFElement(
                        type=element_type,
                        content=line,
                        page=page_num,
                        level=level
                    ))

                elements.extend(self._extract_tables(page, page_num))

        return elements

    @staticmethod
    def _extract_tables(page, page_num: int) -> List["PDFElement"]:
        """Return one table element per non-empty table on a pdfplumber page."""
        return [
            PDFElement(type="table", content=table, page=page_num)
            for table in page.extract_tables()
            if table
        ]

    def _convert_to_html(self) -> str:
        """Render ``self.elements`` as styled HTML with a unique id per element.

        Elements are rendered grouped by page (stable order within a page) so
        each page gets exactly one ``page-N`` marker, and the id counter is
        reset so repeated renders of the same elements give identical output.
        """
        self.element_counter = 0
        html_parts = [self._HTML_PREAMBLE]

        current_page = 0
        in_list = False

        # Stable sort: extraction may append tables after all text, which
        # would otherwise repeat page markers (and their ids).
        for elem in sorted(self.elements, key=lambda e: e.page):
            # Add page marker
            if elem.page != current_page:
                if in_list:
                    html_parts.append('</ul>')
                    in_list = False
                current_page = elem.page
                html_parts.append(f'<div class="page-marker" id="page-{current_page}">📄 Page {current_page}</div>')

            if elem.type == "heading":
                if in_list:
                    html_parts.append('</ul>')
                    in_list = False
                level = elem.level or 2
                elem_id = self._get_element_id('heading')
                content = html.escape(elem.content)
                html_parts.append(f'<h{level} id="{elem_id}" data-page="{elem.page}">{content}</h{level}>')

            elif elem.type == "paragraph":
                if in_list:
                    html_parts.append('</ul>')
                    in_list = False
                elem_id = self._get_element_id('paragraph')
                content = html.escape(elem.content)
                html_parts.append(f'<p id="{elem_id}" data-page="{elem.page}">{content}</p>')

            elif elem.type == "list":
                # Consecutive list items share a single <ul>.
                if not in_list:
                    html_parts.append('<ul>')
                    in_list = True
                elem_id = self._get_element_id('list-item')
                content = html.escape(elem.content)
                html_parts.append(f'<li id="{elem_id}" data-page="{elem.page}">{content}</li>')

            elif elem.type == "table":
                if in_list:
                    html_parts.append('</ul>')
                    in_list = False
                elem_id = self._get_element_id('table')
                html_parts.append(f'<table id="{elem_id}" data-page="{elem.page}">')
                for i, row in enumerate(elem.content):
                    row_id = self._get_element_id('table-row')
                    html_parts.append(f'<tr id="{row_id}">')
                    # The first row is rendered as the header row.
                    tag = 'th' if i == 0 else 'td'
                    for cell in row:
                        cell_id = self._get_element_id('table-cell')
                        # Only a missing cell is blank; a value such as 0
                        # must still be shown.
                        cell_content = "" if cell is None else html.escape(str(cell))
                        html_parts.append(f'<{tag} id="{cell_id}">{cell_content}</{tag}>')
                    html_parts.append('</tr>')
                html_parts.append('</table>')

        if in_list:
            html_parts.append('</ul>')

        html_parts.append('</div>')
        return '\n'.join(html_parts)


# Module-level processor. process_pdf_url() declares this name global and
# rebinds it to a fresh PDFProcessor on every request.
processor = PDFProcessor()


def process_pdf_url(pdf_url):
    """Gradio callback: convert the PDF at ``pdf_url`` and report the result.

    Returns a ``(summary_markdown, html, html)`` tuple; the HTML is returned
    twice because it feeds two separate outputs. Blank input and processing
    failures produce a message followed by two empty strings.
    """
    global processor

    cleaned_url = pdf_url.strip() if pdf_url else ""
    if not cleaned_url:
        return "❌ Please enter a PDF URL", "", ""

    summary_fields = (
        ("Total Elements", "total_elements"),
        ("Pages", "pages"),
        ("Headings", "headings"),
        ("Tables", "tables"),
        ("Paragraphs", "paragraphs"),
    )

    try:
        # A fresh processor per request keeps ids and elements from leaking
        # between documents.
        processor = PDFProcessor()
        stats = processor.process_pdf(cleaned_url)

        report_lines = ["### ✅ PDF Processed Successfully!", "", "**📊 Summary:**"]
        report_lines.extend(
            f"- **{label}:** {stats[key]}" for label, key in summary_fields
        )
        report_lines.append("")  # the report ends with a newline
        summary_text = "\n".join(report_lines)

        rendered = processor.html_content
        return summary_text, rendered, rendered

    except Exception as exc:
        return f"❌ Error processing PDF: {str(exc)}", "", ""


def create_download_file(html_content):
    """Wrap ``html_content`` in a full HTML document saved to a temp file.

    Args:
        html_content: HTML fragment to place inside ``<body>``.

    Returns:
        Path of the newly written UTF-8 ``.html`` file, or ``None`` when
        ``html_content`` is empty. The file is created with ``delete=False``,
        so it stays on disk after this function returns.
    """
    if not html_content:
        return None

    # Create full HTML document
    full_html = f"""<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Extracted PDF Content</title>
</head>
<body>
{html_content}
</body>
</html>"""

    # The context manager closes the handle even if the write raises.
    with tempfile.NamedTemporaryFile(
        mode='w', delete=False, suffix='.html', encoding='utf-8'
    ) as temp_file:
        temp_file.write(full_html)
    return temp_file.name


# Create Gradio interface
# Layout, top to bottom: intro text, URL input + button row, summary,
# preview/source tabs, feature notes. Components are declared in display order.
with gr.Blocks(title="PDF to HTML Converter") as demo:
    
    gr.Markdown(
        """
        # 📄 PDF to HTML Converter
        
        Extract PDF content and view as beautifully structured HTML with unique IDs for each element.
        
        Simply paste a PDF URL and click **Process PDF** to get started!
        """
    )
    
    # Input row: URL textbox next to the submit button.
    with gr.Row():
        with gr.Column(scale=4):
            pdf_url_input = gr.Textbox(
                label="PDF URL",
                placeholder="https://example.com/document.pdf"
            )
        with gr.Column(scale=1):
            process_btn = gr.Button("🚀 Process PDF", variant="primary")
    
    # Receives the first value returned by process_pdf_url (summary or error).
    summary_output = gr.Markdown(label="Summary")
    
    gr.Markdown("---")
    
    with gr.Tabs():
        with gr.Tab("📋 HTML Preview"):
            html_preview = gr.HTML(label="Rendered HTML")
        
        with gr.Tab("💻 HTML Source"):
            # Holds the generated HTML text; also the input of the download
            # handler below.
            html_source = gr.Code(
                label="HTML Source Code",
                language="html"
            )
            download_btn = gr.Button("📥 Download HTML")
            # NOTE(review): created hidden and nothing in this file makes it
            # visible — confirm the generated file is reachable by the user.
            download_file = gr.File(label="Download", visible=False)
    
    # Event handlers
    process_btn.click(
        fn=process_pdf_url,
        inputs=[pdf_url_input],
        outputs=[summary_output, html_preview, html_source]
    )
    
    # Allow Enter key to process
    pdf_url_input.submit(
        fn=process_pdf_url,
        inputs=[pdf_url_input],
        outputs=[summary_output, html_preview, html_source]
    )
    
    # Writes the current HTML source to a temp file and passes its path to
    # the File component.
    download_btn.click(
        fn=create_download_file,
        inputs=[html_source],
        outputs=[download_file]
    )
    
    gr.Markdown(
        """
        ---
        ### 📌 Features:
        - ✨ Extracts text, tables, headings from PDFs
        - 🎯 Each HTML element has a unique ID
        - 📊 Beautiful table styling
        - 🔖 Page markers for easy navigation
        - 💾 Download extracted HTML
        
        ### 💡 Example PDFs to try:
        - Research papers from arXiv
        - Product documentation
        - Financial reports
        - Any publicly accessible PDF!
        """
    )


# Launch
# Start the app only when this file is executed directly.
if __name__ == "__main__":
    demo.launch()