Spaces:

internationalscholarsprogram
/

handbook-ocr-engine

Sleeping

App Files Files Community

handbook-ocr-engine / app /services /export_service.py

internationalscholarsprogram

Initial deploy: ISP Handbook OCR Engine

b12284c verified about 1 month ago

raw

history blame contribute delete

6.11 kB

	"""Export service — renders edited handbook content to PDF, DOCX, HTML, or JSON."""

	from __future__ import annotations

	import io
	import json
	import logging
	from pathlib import Path

	from app.schemas.extraction import (
	BlockType,
	ContentBlock,
	ExportFormat,
	ExportRequest,
	PageResult,
	TableBlock,
	)

	# Module-level logger, named after this module via ``__name__``.
	logger = logging.getLogger(__name__)


	# ── HTML builder ──


	def _blocks_to_html(blocks: list[ContentBlock]) -> str:
	    """Render content blocks as HTML fragments joined by newlines.

	    Headings become ``<hN>`` (level 2 when the block has no heading level),
	    paragraphs become ``<p>``, lists become ``<ul>`` with one ``<li>`` per
	    item, and tables are delegated to ``_table_to_html``. Blocks of any other
	    type, and table blocks without table data, produce no output. All text
	    goes through ``_esc``.
	    """
	    fragments: list[str] = []
	    for block in blocks:
	        kind = block.block_type

	        if kind == BlockType.HEADING:
	            level = 2 if not block.heading_level else block.heading_level.value
	            fragments.append(f"<h{level}>{_esc(block.text)}</h{level}>")
	            continue

	        if kind == BlockType.PARAGRAPH:
	            fragments.append(f"<p>{_esc(block.text)}</p>")
	            continue

	        if kind == BlockType.LIST:
	            entries = [f"<li>{_esc(entry.text)}</li>" for entry in block.list_items]
	            fragments.append(f"<ul>{''.join(entries)}</ul>")
	            continue

	        if kind == BlockType.TABLE and block.table:
	            fragments.append(_table_to_html(block.table))

	    return "\n".join(fragments)


	def _table_to_html(table: TableBlock) -> str:
	    """Build an HTML ``<table>`` from the cells of a ``TableBlock``.

	    Cells are grouped by their ``row`` value; rows are emitted in ascending
	    row order and, within a row, cells in ascending ``col`` order. Header
	    cells are written as ``<th>``, all others as ``<td>``, with the text
	    escaped by ``_esc``. Only cells that exist are emitted; gaps are not
	    padded.
	    """
	    by_row: dict[int, list[tuple[int, str, bool]]] = {}
	    for cell in table.cells:
	        entry = (cell.col, cell.text, cell.is_header)
	        if cell.row in by_row:
	            by_row[cell.row].append(entry)
	        else:
	            by_row[cell.row] = [entry]

	    lines = ['<table border="1" cellpadding="4" cellspacing="0" style="border-collapse:collapse;">']
	    for row_index in sorted(by_row):
	        lines.append(" <tr>")
	        ordered = sorted(by_row[row_index], key=lambda entry: entry[0])
	        for _col, cell_text, header in ordered:
	            if header:
	                tag = "th"
	            else:
	                tag = "td"
	            lines.append(f" <{tag}>{_esc(cell_text)}</{tag}>")
	        lines.append(" </tr>")
	    lines.append("</table>")

	    return "\n".join(lines)


	def _esc(text: str) -> str:
	"""Basic HTML entity escaping."""
	return (
	text
	.replace("&", "&")
	.replace("<", "<")
	.replace(">", ">")
	.replace('"', """)
	)


	def _full_html(title: str, body_html: str) -> str:
	    """Wrap a body fragment in a complete, styled HTML document.

	    The title is escaped with ``_esc`` and used for both the ``<title>``
	    element and the top ``<h1>``. ``body_html`` is inserted as-is, without
	    escaping.
	    """
	    safe_title = _esc(title)
	    document = f"""<!DOCTYPE html>
	<html lang="en">
	<head>
	<meta charset="utf-8"/>
	<title>{safe_title}</title>
	<style>
	body {{ font-family: 'Segoe UI', Tahoma, sans-serif; max-width: 900px; margin: 2rem auto; padding: 0 1rem; color: #1a1a1a; line-height: 1.6; }}
	h1, h2, h3, h4, h5, h6 {{ color: #0b3f74; margin-top: 1.5em; }}
	table {{ width: 100%; margin: 1em 0; font-size: 0.9em; }}
	th {{ background: #0b3f74; color: #fff; text-align: left; }}
	td, th {{ padding: 6px 10px; }}
	tr:nth-child(even) {{ background: #f5f8fc; }}
	ul {{ margin: 0.5em 0; padding-left: 1.5em; }}
	</style>
	</head>
	<body>
	<h1>{safe_title}</h1>
	{body_html}
	</body>
	</html>"""
	    return document


	# ── Export functions ──


	def export_html(req: ExportRequest) -> bytes:
	    """Render every page of the request into one UTF-8 encoded HTML document.

	    Each page's blocks are converted with ``_blocks_to_html`` and followed by
	    an ``<hr/>`` separator. The document title falls back to ``"Handbook"``
	    when the request has no title.
	    """
	    body = "\n".join(
	        f"{_blocks_to_html(page.blocks)}\n<hr/>" for page in req.pages
	    )
	    document = _full_html(req.title or "Handbook", body)
	    return document.encode("utf-8")


	def export_json(req: ExportRequest) -> bytes:
	    """Serialise the request's document id, title and pages to UTF-8 JSON.

	    Each page is dumped with ``model_dump(mode="json")``. The output is
	    indented by two spaces and keeps non-ASCII characters unescaped.
	    """
	    payload = {
	        "document_id": req.document_id,
	        "title": req.title,
	    }
	    payload["pages"] = []
	    for page in req.pages:
	        payload["pages"].append(page.model_dump(mode="json"))

	    text = json.dumps(payload, indent=2, ensure_ascii=False)
	    return text.encode("utf-8")


	def _add_docx_table(doc, table: TableBlock) -> None:
	    """Append *table* to the python-docx document *doc* as a "Table Grid" table.

	    Nothing is added unless both ``table.rows`` and ``table.cols`` are
	    positive. Cells whose coordinates fall outside the declared grid are
	    ignored.
	    """
	    if not (table.rows > 0 and table.cols > 0):
	        return

	    word_table = doc.add_table(rows=table.rows, cols=table.cols, style="Table Grid")
	    for cell in table.cells:
	        # The lower bound matters: a negative coordinate would pass an
	        # upper-bound-only check and could wrap around to an unrelated cell
	        # instead of being rejected.
	        if 0 <= cell.row < table.rows and 0 <= cell.col < table.cols:
	            word_table.cell(cell.row, cell.col).text = cell.text


	def export_docx(req: ExportRequest) -> bytes:
	    """Export handbook pages as a .docx Word document.

	    python-docx is imported lazily, so it is only required when this export
	    format is actually used.

	    Args:
	        req: Export request providing ``title`` and ``pages``; each page
	            provides ``blocks``.

	    Returns:
	        The serialised .docx file contents.
	    """
	    from docx import Document
	    from docx.shared import RGBColor

	    doc = Document()

	    # Title, coloured to match the heading colour used by the HTML export.
	    title_para = doc.add_heading(req.title or "Handbook", level=0)
	    for run in title_para.runs:
	        run.font.color.rgb = RGBColor(0x0B, 0x3F, 0x74)

	    for page_index, page in enumerate(req.pages):
	        # Break only *between* pages; breaking after the last page would
	        # leave an empty trailing page in the document.
	        if page_index:
	            doc.add_page_break()

	        for block in page.blocks:
	            if block.block_type == BlockType.HEADING:
	                lvl = block.heading_level.value if block.heading_level else 2
	                # Deeper headings are flattened to level 4 (python-docx
	                # itself accepts levels 0-9).
	                lvl = min(lvl, 4)
	                doc.add_heading(block.text, level=lvl)
	            elif block.block_type == BlockType.PARAGRAPH:
	                doc.add_paragraph(block.text)
	            elif block.block_type == BlockType.LIST:
	                for item in block.list_items:
	                    doc.add_paragraph(item.text, style="List Bullet")
	            elif block.block_type == BlockType.TABLE and block.table:
	                _add_docx_table(doc, block.table)

	    buf = io.BytesIO()
	    doc.save(buf)
	    return buf.getvalue()


	def export_pdf_html(req: ExportRequest) -> bytes:
	    """Produce a PDF by rendering the HTML export through WeasyPrint.

	    The HTML document is built first with ``export_html``; WeasyPrint is
	    imported only afterwards, inside this function.
	    """
	    rendered = export_html(req)

	    from weasyprint import HTML

	    document = HTML(string=rendered.decode("utf-8"))
	    return document.write_pdf()


	def export_document(req: ExportRequest) -> tuple[bytes, str, str]:
	    """Run the exporter matching ``req.format``.

	    Returns:
	        A ``(payload, content_type, file_extension)`` tuple.

	    Raises:
	        ValueError: If the requested format matches none of the known ones.
	    """
	    requested = req.format
	    # Checked in order: (format, exporter, content type, file extension).
	    exporters = (
	        (ExportFormat.HTML, export_html, "text/html", "html"),
	        (ExportFormat.JSON, export_json, "application/json", "json"),
	        (
	            ExportFormat.DOCX,
	            export_docx,
	            "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
	            "docx",
	        ),
	        (ExportFormat.PDF, export_pdf_html, "application/pdf", "pdf"),
	    )
	    for candidate, render, content_type, extension in exporters:
	        if requested == candidate:
	            return render(req), content_type, extension
	    raise ValueError(f"Unsupported format: {requested}")