# Page header captured together with this file (not part of the program):
#   Spaces: doeqoth / export
#   Build error
#   File size: 20,561 Bytes
#   0bde93f

"""
PDF to HTML Converter - Hugging Face Space
แปลง PDF เป็น HTML พร้อมรักษา layout และ text ที่เลือกได้

ใช้ PyMuPDF (fitz) - ไม่ต้องติดตั้ง pdf2htmlEX
"""

import base64
import io
import os
import tempfile
import time
from pathlib import Path

import fitz  # PyMuPDF
import gradio as gr

# ============ Configuration ============

# Application heading: passed to gr.Blocks as the page title and rendered as
# the top-level Markdown heading of the UI.
TITLE = "📄 PDF to HTML Converter"
# Introductory Markdown (Thai) rendered directly under the heading. It lists
# the features (layout-preserving positions, selectable/copyable text, works in
# any browser, Thai support) and the usage steps
# (upload PDF -> adjust settings -> convert -> download HTML).
DESCRIPTION = """
แปลง PDF เป็น HTML ที่รักษา layout เหมือนต้นฉบับ พร้อม:
- ✅ รักษาตำแหน่งข้อความตาม PDF
- ✅ ข้อความเลือก/copy ได้
- ✅ เปิดได้ในทุก browser
- ✅ รองรับภาษาไทย

**วิธีใช้:** อัปโหลด PDF → ปรับตั้งค่า → กดแปลง → ดาวน์โหลด HTML
"""

# ============ Helper Functions ============


def get_file_size(size_bytes):
    """Format a byte count as a human-readable string with one decimal place.

    The value is divided by 1024 until it drops below 1024 or the known units
    (B, KB, MB, GB) are exhausted; anything larger is reported in TB.

    Args:
        size_bytes: Size in bytes (int or float).

    Returns:
        A string such as ``"1.5 KB"``.
    """
    units = ("B", "KB", "MB", "GB")
    value = size_bytes
    position = 0
    while position < len(units):
        if value < 1024:
            return f"{value:.1f} {units[position]}"
        value = value / 1024
        position += 1
    # Four divisions have happened by now, so the remainder is in terabytes.
    return f"{value:.1f} TB"


def extract_text_with_positions(page, scale=1.5):
    """Collect every non-empty text span of a page with its scaled geometry.

    Args:
        page: Page object whose ``get_text("dict", flags=...)`` returns a
            mapping with a "blocks" list (a PyMuPDF page in this file).
        scale: Multiplier applied to positions, dimensions and font size.

    Returns:
        A list of dicts with the keys ``text``, ``x``, ``y``, ``width``,
        ``height``, ``font_size``, ``font_name`` and ``color`` (a ``#rrggbb``
        string), one per span whose stripped text is non-empty.
    """
    collected = []
    layout = page.get_text("dict", flags=fitz.TEXT_PRESERVE_WHITESPACE)

    for block in layout.get("blocks", []):
        # Only blocks of type 0 (text blocks) are processed.
        if block.get("type") != 0:
            continue

        for line in block.get("lines", []):
            for span in line.get("spans", []):
                content = span.get("text", "").strip()
                if not content:
                    continue

                bbox = span.get("bbox", [0, 0, 0, 0])
                left, top, right, bottom = bbox[0], bbox[1], bbox[2], bbox[3]
                size = span.get("size", 12)
                font = span.get("font", "sans-serif")
                raw_color = span.get("color", 0)

                # Integer colors become "#rrggbb"; anything else falls back to black.
                hex_color = (
                    f"#{raw_color:06x}" if isinstance(raw_color, int) else "#000000"
                )

                entry = {
                    "text": content,
                    "x": left * scale,
                    "y": top * scale,
                    "width": (right - left) * scale,
                    "height": (bottom - top) * scale,
                    "font_size": size * scale,
                    "font_name": font,
                    "color": hex_color,
                }
                collected.append(entry)

    return collected


def render_page_as_image(page, scale=1.5, image_format="png"):
    """Render a PDF page to an image and return it base64-encoded.

    Args:
        page: Page object exposing ``get_pixmap`` (a PyMuPDF page in this file).
        scale: Zoom factor applied to both axes.
        image_format: ``"png"`` for PNG output; any other value yields JPEG.

    Returns:
        The encoded image bytes as a base64 text string.
    """
    zoom = fitz.Matrix(scale, scale)
    pixmap = page.get_pixmap(matrix=zoom, alpha=False)

    encoding = "png" if image_format == "png" else "jpeg"
    raw_bytes = pixmap.tobytes(encoding)

    return base64.b64encode(raw_bytes).decode("utf-8")


def generate_html(pages_data, title="PDF Document", include_background=True):
    """Build a standalone HTML document from extracted page data.

    Every page becomes a fixed-size ``div.page`` containing a ``div.text-layer``
    whose absolutely positioned ``<span>`` elements carry the page text. When a
    background is requested and available, the page image is embedded as a
    base64 PNG data URL behind the text.

    Args:
        pages_data: Iterable of dicts with the keys ``width``, ``height`` and
            ``texts``, plus an optional ``image`` (base64-encoded PNG). Each
            entry of ``texts`` needs ``text``, ``x``, ``y``, ``font_size`` and
            ``color``.
        title: Document title. It is HTML-escaped before being written into
            ``<title>`` because it normally derives from an uploaded file name.
        include_background: When true, pages that have an ``image`` use it as
            their background; otherwise the page background is plain white.

    Returns:
        The complete HTML document as a string.
    """
    # Function-scope import under an alias: the name ``html`` is commonly used
    # for the document string, so the module name is deliberately not bound.
    from html import escape as escape_html

    page_sections = []

    for page_data in pages_data:
        span_markup = []

        for block in page_data["texts"]:
            # Element content only needs &, < and > escaped (quotes are safe here).
            safe_text = escape_html(block["text"], quote=False)

            # Single-line style keeps the output compact; one attribute is
            # emitted per text span, so stray whitespace adds up quickly.
            style = (
                "position: absolute; "
                f"left: {block['x']:.1f}px; "
                f"top: {block['y']:.1f}px; "
                f"font-size: {block['font_size']:.1f}px; "
                f"color: {block['color']}; "
                "white-space: pre; "
                "pointer-events: auto; "
                "cursor: text; "
                "user-select: text;"
            )

            span_markup.append(f'<span style="{style}">{safe_text}</span>')

        # Background style
        image = page_data.get("image")
        if include_background and image:
            bg_style = (
                f"background-image: url('data:image/png;base64,{image}'); "
                "background-size: cover;"
            )
        else:
            bg_style = "background: white;"

        width = page_data["width"]
        height = page_data["height"]
        text_layer = "".join(span_markup)

        page_sections.append(
            f"""
        <div class="page" style="width: {width:.0f}px; height: {height:.0f}px; {bg_style}">
            <div class="text-layer">
                {text_layer}
            </div>
        </div>
        """
        )

    safe_title = escape_html(str(title))
    body = "".join(page_sections)

    # Complete HTML document
    document = f"""<!DOCTYPE html>
<html lang="th">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>{safe_title}</title>
    <style>
        * {{
            box-sizing: border-box;
            margin: 0;
            padding: 0;
        }}
        body {{
            font-family: 'Sarabun', 'Noto Sans Thai', sans-serif;
            background: #e2e8f0;
            padding: 20px;
            display: flex;
            flex-direction: column;
            align-items: center;
            gap: 20px;
        }}
        .page {{
            position: relative;
            background: white;
            box-shadow: 0 4px 20px rgba(0, 0, 0, 0.15);
            overflow: hidden;
        }}
        .text-layer {{
            position: absolute;
            inset: 0;
            overflow: hidden;
        }}
        .text-layer span {{
            position: absolute;
        }}
        .text-layer span::selection {{
            background: rgba(37, 99, 235, 0.3);
        }}
        @media print {{
            body {{
                background: white;
                padding: 0;
                gap: 0;
            }}
            .page {{
                box-shadow: none;
                page-break-after: always;
            }}
        }}
    </style>
    <link href="https://fonts.googleapis.com/css2?family=Sarabun:wght@400;600;700&display=swap" rel="stylesheet">
</head>
<body>
    {body}
</body>
</html>
"""
    return document


def _parse_page_selection(selected_pages, total_pages):
    """Turn a selection string such as "1,3,5-10" into sorted 0-based page indices.

    Empty or missing input selects every page. Stray commas are ignored,
    ranges are clamped to the document, and out-of-range single pages are
    dropped. If any part is not a valid number/range, or nothing valid
    remains, the whole document is selected (best-effort fallback).
    """
    every_page = list(range(total_pages))

    if not selected_pages or not selected_pages.strip():
        return every_page

    chosen = set()
    try:
        for part in selected_pages.split(","):
            part = part.strip()
            if not part:
                # Tolerate inputs like "1,2," instead of discarding the selection.
                continue
            if "-" in part:
                first, last = (int(piece) for piece in part.split("-"))
                # Clamp the lower bound so "0-3" cannot produce index -1,
                # which would silently address the last page.
                chosen.update(range(max(first - 1, 0), min(last, total_pages)))
            else:
                idx = int(part) - 1
                if 0 <= idx < total_pages:
                    chosen.add(idx)
    except ValueError:
        # Non-numeric or malformed part: fall back to all pages.
        return every_page

    return sorted(chosen) or every_page


def convert_pdf_to_html(
    pdf_file, scale, include_background, selected_pages, progress=gr.Progress()
):
    """Convert a PDF into a layout-preserving HTML file using PyMuPDF.

    Args:
        pdf_file: Path of the uploaded PDF, or an object exposing the path as
            ``.name``; ``None`` when nothing was uploaded.
        scale: Zoom factor applied to page size, text positions and images.
        include_background: Whether each page is also rendered as a
            background image.
        selected_pages: Page selection such as ``"1,3,5-10"`` (1-based);
            empty or ``None`` means all pages. An unparsable selection falls
            back to all pages.
        progress: Gradio progress callback.

    Returns:
        A 4-tuple ``(output_path, status_markdown, preview, html_content)``.
        On failure, or when no file was given, the tuple is
        ``(None, error_message, "", None)``; exceptions are reported in the
        message rather than raised.
    """
    if pdf_file is None:
        return None, "❌ กรุณาอัปโหลดไฟล์ PDF", "", None

    progress(0.1, desc="กำลังเปิดไฟล์ PDF...")

    try:
        input_path = pdf_file.name if hasattr(pdf_file, "name") else pdf_file
        pdf_name = os.path.basename(input_path)
        pdf_base = os.path.splitext(pdf_name)[0]

        doc = fitz.open(input_path)
        try:
            total_pages = len(doc)
            page_indices = _parse_page_selection(selected_pages, total_pages)

            progress(0.2, desc=f"กำลังประมวลผล {len(page_indices)} หน้า...")

            pages_data = []
            start_time = time.time()

            for i, page_idx in enumerate(page_indices):
                progress(
                    0.2 + (0.7 * (i + 1) / len(page_indices)),
                    desc=f"กำลังแปลงหน้า {page_idx + 1}/{total_pages}...",
                )

                page = doc[page_idx]
                rect = page.rect

                # Get text with positions
                texts = extract_text_with_positions(page, scale)

                # Render page as image (optional)
                image = None
                if include_background:
                    image = render_page_as_image(page, scale)

                pages_data.append(
                    {
                        "page_num": page_idx + 1,
                        "width": rect.width * scale,
                        "height": rect.height * scale,
                        "texts": texts,
                        "image": image,
                    }
                )
        finally:
            # Release the document even when a page fails to process.
            doc.close()

        progress(0.9, desc="กำลังสร้าง HTML...")

        # Generate HTML
        html_content = generate_html(
            pages_data, title=pdf_base, include_background=include_background
        )

        # Save to a fresh temp directory so the download keeps the PDF's base name.
        temp_dir = tempfile.mkdtemp()
        output_path = os.path.join(temp_dir, f"{pdf_base}.html")

        with open(output_path, "w", encoding="utf-8") as f:
            f.write(html_content)

        elapsed = time.time() - start_time
        input_size = os.path.getsize(input_path)
        output_size = os.path.getsize(output_path)

        # Count total text blocks
        total_texts = sum(len(p["texts"]) for p in pages_data)

        progress(1.0, desc="เสร็จสิ้น!")

        # Status message
        status = f"""✅ **แปลงสำเร็จ!**

📊 **สถิติ:**
| รายการ | ค่า |
|--------|-----|
| หน้าที่แปลง | {len(page_indices)} / {total_pages} หน้า |
| ข้อความที่พบ | {total_texts} รายการ |
| ไฟล์ต้นฉบับ | {get_file_size(input_size)} |
| ไฟล์ HTML | {get_file_size(output_size)} |
| เวลาที่ใช้ | {elapsed:.1f} วินาที |
| Scale | {scale}x |
| รวม Background | {"✅" if include_background else "❌"} |
"""

        # Preview (first 50KB)
        preview = html_content[:50000]
        if len(html_content) > 50000:
            preview += "\n\n... (truncated for preview)"

        return output_path, status, preview, html_content

    except Exception as e:
        # UI boundary: report the failure in the status panel instead of raising.
        import traceback

        error_detail = traceback.format_exc()
        return None, f"❌ เกิดข้อผิดพลาด: {str(e)}\n\n```\n{error_detail}\n```", "", None


def extract_text_only(pdf_file, progress=gr.Progress()):
    """Extract the plain text of every page of a PDF.

    Args:
        pdf_file: Path of the uploaded PDF, or an object exposing the path as
            ``.name``; ``None`` when nothing was uploaded.
        progress: Gradio progress callback.

    Returns:
        The text of all non-empty pages, each preceded by a page header and
        separated by blank lines. If no file was given, no text was found, or
        an error occurred, a message string starting with "❌" is returned
        instead (exceptions are not raised).
    """
    if pdf_file is None:
        return "❌ กรุณาอัปโหลดไฟล์ PDF"

    progress(0.2, desc="กำลังเปิดไฟล์...")

    try:
        input_path = pdf_file.name if hasattr(pdf_file, "name") else pdf_file
        doc = fitz.open(input_path)

        all_text = []
        try:
            total_pages = len(doc)

            for page_no, page in enumerate(doc, start=1):
                progress(
                    0.2 + (0.7 * page_no / total_pages),
                    desc=f"หน้า {page_no}/{total_pages}",
                )
                text = page.get_text("text")
                if text.strip():
                    all_text.append(f"--- หน้า {page_no} ---\n{text}")
        finally:
            # Release the document even when a page fails to extract.
            doc.close()

        progress(1.0, desc="เสร็จสิ้น!")

        if all_text:
            return "\n\n".join(all_text)
        return "❌ ไม่พบข้อความใน PDF (อาจเป็นไฟล์ที่ scan มา)"

    except Exception as e:
        # UI boundary: surface the error text in the output box.
        return f"❌ เกิดข้อผิดพลาด: {str(e)}"


def extract_as_json(pdf_file, scale, progress=gr.Progress()):
    """Export the positioned text spans of every page as a JSON string.

    Args:
        pdf_file: Path of the uploaded PDF, or an object exposing the path as
            ``.name``; ``None`` when nothing was uploaded.
        scale: Multiplier applied to page size and span geometry.
        progress: Gradio progress callback.

    Returns:
        Pretty-printed JSON (non-ASCII characters kept as-is) with the keys
        ``filename``, ``total_pages`` and ``pages``; each page entry holds
        ``page_num``, ``width``, ``height``, ``text_count`` and ``texts``.
        If no file was given or an error occurred, a message string starting
        with "❌" is returned instead (exceptions are not raised).
    """
    if pdf_file is None:
        return "❌ กรุณาอัปโหลดไฟล์ PDF"

    progress(0.2, desc="กำลังเปิดไฟล์...")

    try:
        import json

        input_path = pdf_file.name if hasattr(pdf_file, "name") else pdf_file
        doc = fitz.open(input_path)

        try:
            total_pages = len(doc)
            result = {
                "filename": os.path.basename(input_path),
                "total_pages": total_pages,
                "pages": [],
            }

            for page_no, page in enumerate(doc, start=1):
                progress(
                    0.2 + (0.7 * page_no / total_pages),
                    desc=f"หน้า {page_no}/{total_pages}",
                )

                rect = page.rect
                texts = extract_text_with_positions(page, scale)

                result["pages"].append(
                    {
                        "page_num": page_no,
                        "width": rect.width * scale,
                        "height": rect.height * scale,
                        "text_count": len(texts),
                        "texts": texts,
                    }
                )
        finally:
            # Release the document even when a page fails to process.
            doc.close()

        progress(1.0, desc="เสร็จสิ้น!")

        return json.dumps(result, ensure_ascii=False, indent=2)

    except Exception as e:
        # UI boundary: surface the error text in the output box.
        return f"❌ เกิดข้อผิดพลาด: {str(e)}"


# ============ Gradio Interface ============

# Build the whole UI inside one Blocks app bound to `demo`: a heading and
# description followed by four tabs (PDF -> HTML, text extraction, JSON export,
# about) and a footer line. Components appear in the order they are created.
with gr.Blocks(
    title=TITLE,
    theme=gr.themes.Soft(),
    # Custom CSS: styles for the preview and status element classes used below,
    # and a rule that hides the page footer.
    css="""
    .output-html { max-height: 400px; overflow: auto; font-family: monospace; font-size: 12px; }
    .status-box { font-size: 14px; }
    footer { display: none !important; }
    """,
) as demo:
    gr.Markdown(f"# {TITLE}")
    gr.Markdown(DESCRIPTION)

    with gr.Tabs():
        # ============ Tab 1: PDF to HTML ============
        with gr.TabItem("📄 PDF → HTML", id="tab-html"):
            with gr.Row():
                # Left column: upload, conversion settings and the convert button.
                with gr.Column(scale=1):
                    pdf_input = gr.File(
                        label="📁 อัปโหลด PDF", file_types=[".pdf"], type="filepath"
                    )

                    with gr.Accordion("⚙️ ตั้งค่า", open=True):
                        # Zoom factor forwarded as `scale` to convert_pdf_to_html.
                        scale_slider = gr.Slider(
                            minimum=0.5,
                            maximum=3.0,
                            value=1.5,
                            step=0.1,
                            label="Scale (ความคมชัด)",
                            info="1.5 = 150%, 2.0 = 200%",
                        )

                        # Forwarded as `include_background`.
                        include_bg = gr.Checkbox(
                            value=True,
                            label="รวม Background (ภาพหน้า PDF)",
                            info="ปิดเพื่อได้ไฟล์เล็กลง แต่จะเห็นเฉพาะข้อความ",
                        )

                        # Forwarded as `selected_pages`; empty means all pages.
                        pages_input = gr.Textbox(
                            label="เลือกหน้า (ว่าง = ทุกหน้า)",
                            placeholder="เช่น 1,3,5-10",
                            info="ระบุหน้าที่ต้องการ เช่น 1,2,3 หรือ 1-5,8,10-12",
                        )

                    convert_btn = gr.Button(
                        "🚀 แปลงเป็น HTML", variant="primary", size="lg"
                    )

                # Right column: downloadable result and the status report.
                with gr.Column(scale=2):
                    html_output = gr.File(label="📥 ดาวน์โหลด HTML")

                    status_output = gr.Markdown(
                        label="สถานะ", elem_classes=["status-box"]
                    )

            with gr.Accordion("👁️ Preview HTML Code", open=False):
                preview_output = gr.Code(
                    label="HTML Preview", language="html", elem_classes=["output-html"]
                )

            # Hidden state for full HTML
            # (receives the fourth value returned by convert_pdf_to_html).
            html_state = gr.State()

            # Inputs/outputs are matched positionally to the function's
            # parameters and its 4-tuple return value.
            convert_btn.click(
                fn=convert_pdf_to_html,
                inputs=[pdf_input, scale_slider, include_bg, pages_input],
                outputs=[html_output, status_output, preview_output, html_state],
            )

        # ============ Tab 2: Extract Text ============
        with gr.TabItem("📝 ดึงข้อความ", id="tab-text"):
            gr.Markdown("ดึงเฉพาะข้อความจาก PDF (เรียงตามหน้า)")

            with gr.Row():
                with gr.Column(scale=1):
                    pdf_text_input = gr.File(
                        label="📁 อัปโหลด PDF", file_types=[".pdf"], type="filepath"
                    )
                    extract_btn = gr.Button("📝 ดึงข้อความ", variant="primary")

                with gr.Column(scale=2):
                    text_output = gr.Textbox(
                        label="ข้อความที่ดึงได้",
                        lines=20,
                        max_lines=50,
                        show_copy_button=True,
                    )

            # The string returned by extract_text_only (text or error message)
            # is shown in the textbox.
            extract_btn.click(
                fn=extract_text_only, inputs=[pdf_text_input], outputs=[text_output]
            )

        # ============ Tab 3: Extract JSON ============
        with gr.TabItem("📊 Export JSON", id="tab-json"):
            gr.Markdown("ดึงข้อมูลเป็น JSON พร้อมพิกัด (x, y, width, height, font_size)")

            with gr.Row():
                with gr.Column(scale=1):
                    pdf_json_input = gr.File(
                        label="📁 อัปโหลด PDF", file_types=[".pdf"], type="filepath"
                    )
                    # Defaults to 1.0 here, whereas the HTML tab's slider defaults to 1.5.
                    json_scale = gr.Slider(
                        minimum=0.5, maximum=3.0, value=1.0, step=0.1, label="Scale"
                    )
                    json_btn = gr.Button("📊 Export JSON", variant="primary")

                with gr.Column(scale=2):
                    json_output = gr.Code(
                        label="JSON Output", language="json", show_label=True
                    )

            json_btn.click(
                fn=extract_as_json,
                inputs=[pdf_json_input, json_scale],
                outputs=[json_output],
            )

        # ============ Tab 4: About ============
        # Static Markdown only: technologies used, feature table, tips,
        # limitations and license.
        with gr.TabItem("ℹ️ เกี่ยวกับ", id="tab-about"):
            gr.Markdown("""
## 🔧 เทคโนโลยีที่ใช้

- **[PyMuPDF (fitz)](https://pymupdf.readthedocs.io/)** - อ่านและประมวลผล PDF
- **[Gradio](https://gradio.app)** - สร้าง Web UI
- **[Hugging Face Spaces](https://huggingface.co/spaces)** - Hosting ฟรี

## 📋 Features

| Feature | Description |
|---------|-------------|
| **PDF → HTML** | แปลง PDF เป็น HTML ที่รักษา layout |
| **ดึงข้อความ** | ดึงเฉพาะ text จาก PDF |
| **Export JSON** | ดึงข้อมูลพร้อมพิกัด (x, y, size) |
| **เลือกหน้า** | เลือกแปลงเฉพาะหน้าที่ต้องการ |
| **ปรับ Scale** | ปรับความคมชัด 0.5x - 3.0x |

## 💡 Tips

1. **ไฟล์เล็กลง**: ปิด "รวม Background" จะได้ไฟล์ HTML เล็กลงมาก
2. **เลือกหน้า**: ใส่ `1-5` หรือ `1,3,5,7-10` เพื่อแปลงเฉพาะบางหน้า
3. **JSON**: ใช้สำหรับ import ข้อมูลไปใช้ในแอปอื่น

## ⚠️ ข้อจำกัด

- PDF ที่เป็นรูปภาพ (scanned) จะไม่มีข้อความให้ดึง
- ไฟล์ขนาดใหญ่มากอาจใช้เวลานานหรือ timeout
- บาง fonts พิเศษอาจแสดงผลไม่ถูกต้อง

## 📄 License

MIT License - ใช้งานได้ฟรี
            """)

    # Footer line rendered below the tabs.
    gr.Markdown("""
---
<center>Made with ❤️ using PyMuPDF & Gradio</center>
    """)


# ============ Launch ============

if __name__ == "__main__":
    # Start the app only when run as a script: served on 0.0.0.0:7860 with
    # share disabled.
    launch_settings = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,
    }
    demo.launch(**launch_settings)