Spaces:

doeqoth
/

export

Build error

App Files Files Community

export / app.py

doeqoth

Upload 4 files

0bde93f verified about 2 months ago

raw

history blame contribute delete

20.6 kB

	"""
	PDF to HTML Converter - Hugging Face Space
	แปลง PDF เป็น HTML พร้อมรักษา layout และ text ที่เลือกได้

	ใช้ PyMuPDF (fitz) - ไม่ต้องติดตั้ง pdf2htmlEX
	"""

	import base64
	import io
	import os
	import tempfile
	import time
	from pathlib import Path

	import fitz # PyMuPDF
	import gradio as gr

	# ============ Configuration ============

	# Application heading: passed to gr.Blocks as the page title and rendered as
	# the top-level Markdown heading of the interface.
	TITLE = "📄 PDF to HTML Converter"
	# Introductory Markdown (Thai) rendered directly under the heading. It lists
	# the converter's selling points (layout preserved, selectable/copyable text,
	# works in any browser, Thai support) followed by a one-line usage guide.
	DESCRIPTION = """
	แปลง PDF เป็น HTML ที่รักษา layout เหมือนต้นฉบับ พร้อม:
	- ✅ รักษาตำแหน่งข้อความตาม PDF
	- ✅ ข้อความเลือก/copy ได้
	- ✅ เปิดได้ในทุก browser
	- ✅ รองรับภาษาไทย

	วิธีใช้: อัปโหลด PDF → ปรับตั้งค่า → กดแปลง → ดาวน์โหลด HTML
	"""

	# ============ Helper Functions ============


	def get_file_size(size_bytes):
	    """Format a byte count as a human-readable string.

	    Args:
	        size_bytes: Size in bytes (int or float).

	    Returns:
	        The size with one decimal place and a unit suffix, e.g. ``"1.5 KB"``.
	        Units step by 1024 from ``B`` up to ``TB``; anything larger is still
	        expressed in ``TB``.
	    """
	    size = float(size_bytes)
	    for unit in ("B", "KB", "MB", "GB"):
	        # Compare the value as it will be displayed: 1023.99 KB would print
	        # as "1024.0 KB", so it has to move on to the next unit instead.
	        if round(size, 1) < 1024:
	            return f"{size:.1f} {unit}"
	        size /= 1024
	    return f"{size:.1f} TB"


	def extract_text_with_positions(page, scale=1.5):
	    """Collect the non-blank text spans of a PDF page with scaled geometry.

	    Args:
	        page: Page object whose ``get_text("dict", ...)`` output is read.
	        scale: Factor applied to every coordinate, dimension and font size.

	    Returns:
	        A list of dicts, one per span, with the keys ``text``, ``x``, ``y``,
	        ``width``, ``height``, ``font_size``, ``font_name`` and ``color``
	        (a ``#rrggbb`` string). Spans whose text is empty after stripping
	        surrounding whitespace are skipped.
	    """
	    page_dict = page.get_text("dict", flags=fitz.TEXT_PRESERVE_WHITESPACE)

	    collected = []
	    for entry in page_dict.get("blocks", []):
	        # Only blocks of type 0 (text blocks) are processed.
	        if entry.get("type") != 0:
	            continue

	        for text_line in entry.get("lines", []):
	            for span in text_line.get("spans", []):
	                content = span.get("text", "").strip()
	                if not content:
	                    continue

	                box = span.get("bbox", [0, 0, 0, 0])
	                size = span.get("size", 12)
	                family = span.get("font", "sans-serif")
	                rgb = span.get("color", 0)

	                # Integer colours become a 6-digit hex string; anything
	                # else falls back to black.
	                hex_color = f"#{rgb:06x}" if isinstance(rgb, int) else "#000000"

	                record = {
	                    "text": content,
	                    "x": box[0] * scale,
	                    "y": box[1] * scale,
	                    "width": (box[2] - box[0]) * scale,
	                    "height": (box[3] - box[1]) * scale,
	                    "font_size": size * scale,
	                    "font_name": family,
	                    "color": hex_color,
	                }
	                collected.append(record)

	    return collected


	def render_page_as_image(page, scale=1.5, image_format="png"):
	    """Render a PDF page to an image and return it base64-encoded.

	    Args:
	        page: Page object providing ``get_pixmap``.
	        scale: Zoom factor applied on both axes.
	        image_format: ``"png"`` selects PNG output; any other value selects
	            JPEG.

	    Returns:
	        The encoded image bytes as a base64 text string.
	    """
	    zoom = fitz.Matrix(scale, scale)
	    pixmap = page.get_pixmap(matrix=zoom, alpha=False)

	    encoding = "png" if image_format == "png" else "jpeg"
	    raw_bytes = pixmap.tobytes(encoding)

	    return base64.b64encode(raw_bytes).decode("utf-8")


	def generate_html(pages_data, title="PDF Document", include_background=True):
	    """Build a standalone HTML document from extracted page data.

	    Each page becomes a fixed-size ``div.page`` (optionally painted with the
	    rendered page image) that holds one absolutely positioned ``<span>`` per
	    text entry.

	    Args:
	        pages_data: Iterable of dicts with the keys ``width``, ``height`` and
	            ``texts``, plus an optional ``image`` holding base64 image data
	            that is embedded as a PNG data URL. Every entry of ``texts``
	            provides ``text``, ``x``, ``y``, ``font_size`` and ``color``.
	        title: Text for the document ``<title>``; HTML-escaped before use.
	        include_background: When true and a page has an ``image``, the image
	            is used as the page's CSS background; otherwise the page is white.

	    Returns:
	        The complete HTML document as a string.
	    """
	    # Local import under an alias so the stdlib module name does not clash
	    # with HTML-related local names in this function.
	    from html import escape as escape_html

	    html_pages = []

	    for page_data in pages_data:
	        text_elements = []

	        for block in page_data["texts"]:
	            # Text comes straight from the PDF, so markup characters must be
	            # escaped or they would be interpreted as HTML.
	            text = escape_html(block["text"], quote=False)

	            style = (
	                "position: absolute; "
	                f"left: {block['x']:.1f}px; "
	                f"top: {block['y']:.1f}px; "
	                f"font-size: {block['font_size']:.1f}px; "
	                f"color: {block['color']}; "
	                "white-space: pre; "
	                "pointer-events: auto; "
	                "cursor: text; "
	                "user-select: text;"
	            )

	            text_elements.append(f'<span style="{style}">{text}</span>')

	        # Background style
	        if include_background and page_data.get("image"):
	            bg_style = f"background-image: url('data:image/png;base64,{page_data['image']}'); background-size: cover;"
	        else:
	            bg_style = "background: white;"

	        page_html = f"""
	<div class="page" style="width: {page_data["width"]:.0f}px; height: {page_data["height"]:.0f}px; {bg_style}">
	<div class="text-layer">
	{"".join(text_elements)}
	</div>
	</div>
	"""
	        html_pages.append(page_html)

	    # The title is derived from the uploaded file name, so escape it as well.
	    safe_title = escape_html(title)

	    # Complete HTML document
	    document = f"""<!DOCTYPE html>
	<html lang="th">
	<head>
	<meta charset="UTF-8">
	<meta name="viewport" content="width=device-width, initial-scale=1.0">
	<title>{safe_title}</title>
	<style>
	* {{
	box-sizing: border-box;
	margin: 0;
	padding: 0;
	}}
	body {{
	font-family: 'Sarabun', 'Noto Sans Thai', sans-serif;
	background: #e2e8f0;
	padding: 20px;
	display: flex;
	flex-direction: column;
	align-items: center;
	gap: 20px;
	}}
	.page {{
	position: relative;
	background: white;
	box-shadow: 0 4px 20px rgba(0, 0, 0, 0.15);
	overflow: hidden;
	}}
	.text-layer {{
	position: absolute;
	inset: 0;
	overflow: hidden;
	}}
	.text-layer span {{
	position: absolute;
	}}
	.text-layer span::selection {{
	background: rgba(37, 99, 235, 0.3);
	}}
	@media print {{
	body {{
	background: white;
	padding: 0;
	gap: 0;
	}}
	.page {{
	box-shadow: none;
	page-break-after: always;
	}}
	}}
	</style>
	<link href="https://fonts.googleapis.com/css2?family=Sarabun:wght@400;600;700&display=swap" rel="stylesheet">
	</head>
	<body>
	{"".join(html_pages)}
	</body>
	</html>
	"""
	    return document


	def _parse_page_selection(selected_pages, total_pages):
	    """Translate a selection such as ``"1,3,5-10"`` into page indices.

	    Args:
	        selected_pages: Comma-separated 1-based page numbers and inclusive
	            ``start-end`` ranges. ``None`` or a blank string selects all.
	        total_pages: Number of pages in the document.

	    Returns:
	        Sorted, de-duplicated 0-based indices. Every page is returned when
	        the selection is blank, cannot be parsed, or matches no page.
	    """
	    every_page = list(range(total_pages))
	    if not selected_pages or not selected_pages.strip():
	        return every_page

	    chosen = set()
	    try:
	        for part in selected_pages.split(","):
	            part = part.strip()
	            if not part:
	                continue  # tolerate stray commas such as "1,2,"
	            if "-" in part:
	                start, end = map(int, part.split("-"))
	                # Clamp both ends: a start below 1 would otherwise produce
	                # negative indices that are not the requested pages.
	                chosen.update(range(max(start - 1, 0), min(end, total_pages)))
	            else:
	                idx = int(part) - 1
	                if 0 <= idx < total_pages:
	                    chosen.add(idx)
	    except ValueError:
	        # Best effort: an unparsable selection falls back to every page.
	        return every_page

	    return sorted(chosen) or every_page


	def convert_pdf_to_html(
	    pdf_file, scale, include_background, selected_pages, progress=gr.Progress()
	):
	    """Convert a PDF to a standalone HTML file using PyMuPDF.

	    Args:
	        pdf_file: Path to the PDF, or an object exposing the path as ``.name``.
	        scale: Zoom factor applied to coordinates and the rendered images.
	        include_background: Whether each page image is embedded as background.
	        selected_pages: Page selection text such as ``"1,3,5-10"``; blank
	            means every page.
	        progress: Gradio progress callback.

	    Returns:
	        A 4-tuple ``(output_path, status, preview, html_content)``: the path
	        of the written HTML file, a Markdown status report, the first 50,000
	        characters of the HTML, and the full HTML. On failure the tuple is
	        ``(None, error_message, "", None)``.
	    """
	    if pdf_file is None:
	        return None, "❌ กรุณาอัปโหลดไฟล์ PDF", "", None

	    progress(0.1, desc="กำลังเปิดไฟล์ PDF...")

	    try:
	        input_path = pdf_file.name if hasattr(pdf_file, "name") else pdf_file
	        pdf_name = os.path.basename(input_path)
	        pdf_base = os.path.splitext(pdf_name)[0]

	        pages_data = []

	        # The context manager closes the document even when a page fails.
	        with fitz.open(input_path) as doc:
	            total_pages = len(doc)
	            page_indices = _parse_page_selection(selected_pages, total_pages)

	            progress(0.2, desc=f"กำลังประมวลผล {len(page_indices)} หน้า...")

	            start_time = time.time()

	            for i, page_idx in enumerate(page_indices, start=1):
	                progress(
	                    0.2 + (0.7 * i / len(page_indices)),
	                    desc=f"กำลังแปลงหน้า {page_idx + 1}/{total_pages}...",
	                )

	                page = doc[page_idx]
	                rect = page.rect

	                # Text spans with their scaled positions
	                texts = extract_text_with_positions(page, scale)

	                # The page image is only rendered when it will be embedded.
	                image = None
	                if include_background:
	                    image = render_page_as_image(page, scale)

	                pages_data.append(
	                    {
	                        "page_num": page_idx + 1,
	                        "width": rect.width * scale,
	                        "height": rect.height * scale,
	                        "texts": texts,
	                        "image": image,
	                    }
	                )

	        progress(0.9, desc="กำลังสร้าง HTML...")

	        html_content = generate_html(
	            pages_data, title=pdf_base, include_background=include_background
	        )

	        # The file must outlive this call so it can be offered for download,
	        # hence a temp directory that is not removed here.
	        temp_dir = tempfile.mkdtemp()
	        output_path = os.path.join(temp_dir, f"{pdf_base}.html")

	        with open(output_path, "w", encoding="utf-8") as f:
	            f.write(html_content)

	        elapsed = time.time() - start_time
	        input_size = os.path.getsize(input_path)
	        output_size = os.path.getsize(output_path)

	        # Total number of text spans across all converted pages
	        total_texts = sum(len(p["texts"]) for p in pages_data)

	        progress(1.0, desc="เสร็จสิ้น!")

	        # Status report (Markdown table)
	        status = f"""✅ แปลงสำเร็จ!

	📊 สถิติ:
	| รายการ | ค่า |
	|--------|-----|
	| หน้าที่แปลง | {len(page_indices)} / {total_pages} หน้า |
	| ข้อความที่พบ | {total_texts} รายการ |
	| ไฟล์ต้นฉบับ | {get_file_size(input_size)} |
	| ไฟล์ HTML | {get_file_size(output_size)} |
	| เวลาที่ใช้ | {elapsed:.1f} วินาที |
	| Scale | {scale}x |
	| รวม Background | {"✅" if include_background else "❌"} |
	"""

	        # Preview (first 50KB)
	        preview = html_content[:50000]
	        if len(html_content) > 50000:
	            preview += "\n\n... (truncated for preview)"

	        return output_path, status, preview, html_content

	    except Exception as e:
	        # UI boundary: report the failure in the status panel instead of
	        # letting the exception escape.
	        import traceback

	        error_detail = traceback.format_exc()
	        return None, f"❌ เกิดข้อผิดพลาด: {str(e)}\n\n```\n{error_detail}\n```", "", None


	def extract_text_only(pdf_file, progress=gr.Progress()):
	    """Extract the plain text of every page of a PDF.

	    Args:
	        pdf_file: Path to the PDF, or an object exposing the path as ``.name``.
	        progress: Gradio progress callback.

	    Returns:
	        The text of all non-empty pages, each preceded by a page header and
	        separated by blank lines, or a message string when no file was given,
	        no text was found, or an error occurred.
	    """
	    if pdf_file is None:
	        return "❌ กรุณาอัปโหลดไฟล์ PDF"

	    progress(0.2, desc="กำลังเปิดไฟล์...")

	    try:
	        input_path = pdf_file.name if hasattr(pdf_file, "name") else pdf_file

	        all_text = []

	        # The context manager closes the document even when a page fails.
	        with fitz.open(input_path) as doc:
	            total_pages = len(doc)

	            for page_no, page in enumerate(doc, start=1):
	                progress(
	                    0.2 + (0.7 * page_no / total_pages),
	                    desc=f"หน้า {page_no}/{total_pages}",
	                )
	                text = page.get_text("text")
	                if text.strip():
	                    all_text.append(f"--- หน้า {page_no} ---\n{text}")

	        progress(1.0, desc="เสร็จสิ้น!")

	        if all_text:
	            return "\n\n".join(all_text)
	        return "❌ ไม่พบข้อความใน PDF (อาจเป็นไฟล์ที่ scan มา)"

	    except Exception as e:
	        # UI boundary: show the failure in the output box.
	        return f"❌ เกิดข้อผิดพลาด: {str(e)}"


	def extract_as_json(pdf_file, scale, progress=gr.Progress()):
	    """Export the text spans of a PDF, with coordinates, as a JSON string.

	    Args:
	        pdf_file: Path to the PDF, or an object exposing the path as ``.name``.
	        scale: Factor applied to page dimensions and span geometry.
	        progress: Gradio progress callback.

	    Returns:
	        Indented JSON text with ``filename``, ``total_pages`` and a ``pages``
	        list (``page_num``, ``width``, ``height``, ``text_count``, ``texts``),
	        or a message string when no file was given or an error occurred.
	    """
	    if pdf_file is None:
	        return "❌ กรุณาอัปโหลดไฟล์ PDF"

	    progress(0.2, desc="กำลังเปิดไฟล์...")

	    try:
	        import json

	        input_path = pdf_file.name if hasattr(pdf_file, "name") else pdf_file

	        # The context manager closes the document even when a page fails.
	        with fitz.open(input_path) as doc:
	            total_pages = len(doc)

	            result = {
	                "filename": os.path.basename(input_path),
	                "total_pages": total_pages,
	                "pages": [],
	            }

	            for page_no, page in enumerate(doc, start=1):
	                progress(
	                    0.2 + (0.7 * page_no / total_pages),
	                    desc=f"หน้า {page_no}/{total_pages}",
	                )

	                rect = page.rect
	                texts = extract_text_with_positions(page, scale)

	                result["pages"].append(
	                    {
	                        "page_num": page_no,
	                        "width": rect.width * scale,
	                        "height": rect.height * scale,
	                        "text_count": len(texts),
	                        "texts": texts,
	                    }
	                )

	        progress(1.0, desc="เสร็จสิ้น!")

	        # ensure_ascii=False keeps non-ASCII text readable in the output.
	        return json.dumps(result, ensure_ascii=False, indent=2)

	    except Exception as e:
	        # UI boundary: show the failure in the output box.
	        return f"❌ เกิดข้อผิดพลาด: {str(e)}"


	# ============ Gradio Interface ============

	# Builds the whole UI: a heading, the description, and four tabs
	# (PDF -> HTML, text extraction, JSON export, About), then a footer.
	# NOTE(review): the nesting below was reconstructed from a flattened copy of
	# this file; in particular the placement of the "Preview HTML Code" accordion
	# at tab level (below the two-column row) should be confirmed.
	with gr.Blocks(
	    title=TITLE,
	    theme=gr.themes.Soft(),
	    css="""
	.output-html { max-height: 400px; overflow: auto; font-family: monospace; font-size: 12px; }
	.status-box { font-size: 14px; }
	footer { display: none !important; }
	""",
	) as demo:
	    gr.Markdown(f"# {TITLE}")
	    gr.Markdown(DESCRIPTION)

	    with gr.Tabs():
	        # ============ Tab 1: PDF to HTML ============
	        with gr.TabItem("📄 PDF → HTML", id="tab-html"):
	            with gr.Row():
	                with gr.Column(scale=1):
	                    pdf_input = gr.File(
	                        label="📁 อัปโหลด PDF", file_types=[".pdf"], type="filepath"
	                    )

	                    with gr.Accordion("⚙️ ตั้งค่า", open=True):
	                        scale_slider = gr.Slider(
	                            minimum=0.5,
	                            maximum=3.0,
	                            value=1.5,
	                            step=0.1,
	                            label="Scale (ความคมชัด)",
	                            info="1.5 = 150%, 2.0 = 200%",
	                        )

	                        include_bg = gr.Checkbox(
	                            value=True,
	                            label="รวม Background (ภาพหน้า PDF)",
	                            info="ปิดเพื่อได้ไฟล์เล็กลง แต่จะเห็นเฉพาะข้อความ",
	                        )

	                        pages_input = gr.Textbox(
	                            label="เลือกหน้า (ว่าง = ทุกหน้า)",
	                            placeholder="เช่น 1,3,5-10",
	                            info="ระบุหน้าที่ต้องการ เช่น 1,2,3 หรือ 1-5,8,10-12",
	                        )

	                    convert_btn = gr.Button(
	                        "🚀 แปลงเป็น HTML", variant="primary", size="lg"
	                    )

	                with gr.Column(scale=2):
	                    html_output = gr.File(label="📥 ดาวน์โหลด HTML")

	                    status_output = gr.Markdown(
	                        label="สถานะ", elem_classes=["status-box"]
	                    )

	            with gr.Accordion("👁️ Preview HTML Code", open=False):
	                preview_output = gr.Code(
	                    label="HTML Preview", language="html", elem_classes=["output-html"]
	                )

	            # Hidden state for full HTML
	            html_state = gr.State()

	            convert_btn.click(
	                fn=convert_pdf_to_html,
	                inputs=[pdf_input, scale_slider, include_bg, pages_input],
	                outputs=[html_output, status_output, preview_output, html_state],
	            )

	        # ============ Tab 2: Extract Text ============
	        with gr.TabItem("📝 ดึงข้อความ", id="tab-text"):
	            gr.Markdown("ดึงเฉพาะข้อความจาก PDF (เรียงตามหน้า)")

	            with gr.Row():
	                with gr.Column(scale=1):
	                    pdf_text_input = gr.File(
	                        label="📁 อัปโหลด PDF", file_types=[".pdf"], type="filepath"
	                    )
	                    extract_btn = gr.Button("📝 ดึงข้อความ", variant="primary")

	                with gr.Column(scale=2):
	                    text_output = gr.Textbox(
	                        label="ข้อความที่ดึงได้",
	                        lines=20,
	                        max_lines=50,
	                        show_copy_button=True,
	                    )

	            extract_btn.click(
	                fn=extract_text_only, inputs=[pdf_text_input], outputs=[text_output]
	            )

	        # ============ Tab 3: Extract JSON ============
	        with gr.TabItem("📊 Export JSON", id="tab-json"):
	            gr.Markdown("ดึงข้อมูลเป็น JSON พร้อมพิกัด (x, y, width, height, font_size)")

	            with gr.Row():
	                with gr.Column(scale=1):
	                    pdf_json_input = gr.File(
	                        label="📁 อัปโหลด PDF", file_types=[".pdf"], type="filepath"
	                    )
	                    json_scale = gr.Slider(
	                        minimum=0.5, maximum=3.0, value=1.0, step=0.1, label="Scale"
	                    )
	                    json_btn = gr.Button("📊 Export JSON", variant="primary")

	                with gr.Column(scale=2):
	                    json_output = gr.Code(
	                        label="JSON Output", language="json", show_label=True
	                    )

	            json_btn.click(
	                fn=extract_as_json,
	                inputs=[pdf_json_input, json_scale],
	                outputs=[json_output],
	            )

	        # ============ Tab 4: About ============
	        with gr.TabItem("ℹ️ เกี่ยวกับ", id="tab-about"):
	            # Table rows use plain "|" separators: a backslash-escaped pipe is
	            # an invalid Python escape sequence and stops the table rendering.
	            gr.Markdown("""
	## 🔧 เทคโนโลยีที่ใช้

	- [PyMuPDF (fitz)](https://pymupdf.readthedocs.io/) - อ่านและประมวลผล PDF
	- [Gradio](https://gradio.app) - สร้าง Web UI
	- [Hugging Face Spaces](https://huggingface.co/spaces) - Hosting ฟรี

	## 📋 Features

	| Feature | Description |
	|---------|-------------|
	| PDF → HTML | แปลง PDF เป็น HTML ที่รักษา layout |
	| ดึงข้อความ | ดึงเฉพาะ text จาก PDF |
	| Export JSON | ดึงข้อมูลพร้อมพิกัด (x, y, size) |
	| เลือกหน้า | เลือกแปลงเฉพาะหน้าที่ต้องการ |
	| ปรับ Scale | ปรับความคมชัด 0.5x - 3.0x |

	## 💡 Tips

	1. ไฟล์เล็กลง: ปิด "รวม Background" จะได้ไฟล์ HTML เล็กลงมาก
	2. เลือกหน้า: ใส่ `1-5` หรือ `1,3,5,7-10` เพื่อแปลงเฉพาะบางหน้า
	3. JSON: ใช้สำหรับ import ข้อมูลไปใช้ในแอปอื่น

	## ⚠️ ข้อจำกัด

	- PDF ที่เป็นรูปภาพ (scanned) จะไม่มีข้อความให้ดึง
	- ไฟล์ขนาดใหญ่มากอาจใช้เวลานานหรือ timeout
	- บาง fonts พิเศษอาจแสดงผลไม่ถูกต้อง

	## 📄 License

	MIT License - ใช้งานได้ฟรี
	""")

	    gr.Markdown("""
	---
	<center>Made with ❤️ using PyMuPDF & Gradio</center>
	""")


	# ============ Launch ============

	if __name__ == "__main__":
	    # Start the app only when run as a script: bind to all interfaces on
	    # port 7860 without creating a public share link.
	    launch_options = {
	        "server_name": "0.0.0.0",
	        "server_port": 7860,
	        "share": False,
	    }
	    demo.launch(**launch_options)