handbook-ocr-engine / app /services /export_service.py
internationalscholarsprogram's picture
Initial deploy: ISP Handbook OCR Engine
b12284c verified
"""Export service — renders edited handbook content to PDF, DOCX, HTML, or JSON."""
from __future__ import annotations
import io
import json
import logging
from pathlib import Path
from app.schemas.extraction import (
BlockType,
ContentBlock,
ExportFormat,
ExportRequest,
PageResult,
TableBlock,
)
logger = logging.getLogger(__name__)
# ── HTML builder ──
def _blocks_to_html(blocks: list[ContentBlock]) -> str:
    """Render content blocks as HTML fragments joined by newlines.

    Headings become ``<hN>`` (level 2 when the block has no heading level),
    paragraphs become ``<p>``, lists become ``<ul>`` with one ``<li>`` per
    item, and tables are delegated to ``_table_to_html``. Blocks of any other
    type, and table blocks without table data, produce no output.
    """
    fragments: list[str] = []
    for block in blocks:
        kind = block.block_type

        if kind == BlockType.HEADING:
            level = 2
            if block.heading_level:
                level = block.heading_level.value
            fragments.append(f"<h{level}>{_esc(block.text)}</h{level}>")
            continue

        if kind == BlockType.PARAGRAPH:
            fragments.append(f"<p>{_esc(block.text)}</p>")
            continue

        if kind == BlockType.LIST:
            bullets = [f"<li>{_esc(entry.text)}</li>" for entry in block.list_items]
            fragments.append("<ul>" + "".join(bullets) + "</ul>")
            continue

        if kind == BlockType.TABLE and block.table:
            fragments.append(_table_to_html(block.table))

    return "\n".join(fragments)
def _table_to_html(table: TableBlock) -> str:
    """Render a TableBlock as an HTML ``<table>``.

    Cells are grouped by ``row`` and ordered by ``col``. Column positions are
    preserved: a column index that has no cell in a given row is emitted as
    an empty ``<td>``, so sparse rows stay aligned with the rest of the table
    instead of shifting their later cells to the left. Indices are treated as
    0-based, the same way ``export_docx`` addresses cells by coordinate.
    Rows that contain no cells at all are not emitted.

    Args:
        table: Table whose ``cells`` each expose ``row``, ``col``, ``text``
            and ``is_header``.

    Returns:
        The table markup, one element per line. Header cells use ``<th>``,
        all other cells (including padding) use ``<td>``.
    """
    rows_map: dict[int, list[tuple[int, str, bool]]] = {}
    width = 0  # number of columns implied by the highest column index seen
    for c in table.cells:
        rows_map.setdefault(c.row, []).append((c.col, c.text, c.is_header))
        width = max(width, c.col + 1)

    empty_cell = " <td></td>"
    html = ['<table border="1" cellpadding="4" cellspacing="0" style="border-collapse:collapse;">']
    for ri in sorted(rows_map):
        html.append(" <tr>")
        next_col = 0  # first column index not yet emitted in this row
        for ci, text, is_hdr in sorted(rows_map[ri], key=lambda x: x[0]):
            # Fill any gap before this cell; a non-positive count (duplicate
            # or negative column index) adds nothing.
            html.extend([empty_cell] * (ci - next_col))
            tag = "th" if is_hdr else "td"
            html.append(f" <{tag}>{_esc(text)}</{tag}>")
            next_col = max(next_col, ci + 1)
        # Pad short rows out to the full table width.
        html.extend([empty_cell] * (width - next_col))
        html.append(" </tr>")
    html.append("</table>")
    return "\n".join(html)
# Single-pass translation table for the four characters escaped by _esc.
_HTML_ENTITY_TABLE = str.maketrans(
    {
        "&": "&amp;",
        "<": "&lt;",
        ">": "&gt;",
        '"': "&quot;",
    }
)


def _esc(text: str) -> str:
    """Escape ``&``, ``<``, ``>`` and ``"`` as HTML entities.

    Single quotes and every other character pass through unchanged.
    """
    return text.translate(_HTML_ENTITY_TABLE)
def _full_html(title: str, body_html: str) -> str:
    """Wrap *body_html* in a complete, styled HTML page.

    The escaped *title* is used both for the ``<title>`` element and for the
    leading ``<h1>``. *body_html* is inserted as-is, without escaping.
    """
    safe_title = _esc(title)
    document = [
        "<!DOCTYPE html>",
        '<html lang="en">',
        "<head>",
        '<meta charset="utf-8"/>',
        f"<title>{safe_title}</title>",
        "<style>",
        "body { font-family: 'Segoe UI', Tahoma, sans-serif; max-width: 900px; margin: 2rem auto; padding: 0 1rem; color: #1a1a1a; line-height: 1.6; }",
        "h1, h2, h3, h4, h5, h6 { color: #0b3f74; margin-top: 1.5em; }",
        "table { width: 100%; margin: 1em 0; font-size: 0.9em; }",
        "th { background: #0b3f74; color: #fff; text-align: left; }",
        "td, th { padding: 6px 10px; }",
        "tr:nth-child(even) { background: #f5f8fc; }",
        "ul { margin: 0.5em 0; padding-left: 1.5em; }",
        "</style>",
        "</head>",
        "<body>",
        f"<h1>{safe_title}</h1>",
        f"{body_html}",
        "</body>",
        "</html>",
    ]
    return "\n".join(document)
# ── Export functions ──
def export_html(req: ExportRequest) -> bytes:
    """Render every page of *req* into one UTF-8 encoded HTML document.

    Each page's blocks are followed by an ``<hr/>`` separator. The document
    title falls back to ``"Handbook"`` when the request has none.
    """
    sections = [
        fragment
        for page in req.pages
        for fragment in (_blocks_to_html(page.blocks), "<hr/>")
    ]
    document = _full_html(req.title or "Handbook", "\n".join(sections))
    return document.encode("utf-8")
def export_json(req: ExportRequest) -> bytes:
    """Serialize the request's document id, title and pages as UTF-8 JSON.

    Pages are dumped through ``model_dump(mode="json")``. The output is
    indented by two spaces and keeps non-ASCII characters unescaped.
    """
    payload = dict(
        document_id=req.document_id,
        title=req.title,
        pages=[page.model_dump(mode="json") for page in req.pages],
    )
    text = json.dumps(payload, indent=2, ensure_ascii=False)
    return text.encode("utf-8")
def export_docx(req: ExportRequest) -> bytes:
    """Export handbook pages as a .docx Word document.

    The document starts with a title heading (``req.title`` or
    ``"Handbook"``) coloured #0B3F74. Each page's blocks are then written in
    order: headings, paragraphs, bulleted list items and tables. A page break
    is inserted between consecutive pages, but not after the last one, so the
    document does not end with a blank page.

    Args:
        req: Export request providing ``title`` and ``pages``.

    Returns:
        The serialized .docx file contents.
    """
    # python-docx is imported lazily so the module loads without it.
    from docx import Document
    from docx.shared import RGBColor

    doc = Document()

    # Title
    title_para = doc.add_heading(req.title or "Handbook", level=0)
    for run in title_para.runs:
        run.font.color.rgb = RGBColor(0x0B, 0x3F, 0x74)

    for index, page in enumerate(req.pages):
        if index:
            # Separate pages; skipped before the first page.
            doc.add_page_break()
        for block in page.blocks:
            if block.block_type == BlockType.HEADING:
                lvl = block.heading_level.value if block.heading_level else 2
                # Levels above 4 are rendered as level 4 (python-docx itself
                # accepts 0-9).
                lvl = min(lvl, 4)
                doc.add_heading(block.text, level=lvl)
            elif block.block_type == BlockType.PARAGRAPH:
                doc.add_paragraph(block.text)
            elif block.block_type == BlockType.LIST:
                for item in block.list_items:
                    doc.add_paragraph(item.text, style="List Bullet")
            elif block.block_type == BlockType.TABLE and block.table:
                _add_docx_table(doc, block.table)

    buf = io.BytesIO()
    doc.save(buf)
    return buf.getvalue()


def _add_docx_table(doc, tbl: TableBlock) -> None:
    """Append *tbl* to *doc* as a "Table Grid" table, placing cells by (row, col).

    Nothing is added when the table declares no rows or no columns. Only
    coordinates inside the declared ``rows`` x ``cols`` grid are written;
    negative or out-of-range indices are skipped.
    """
    if tbl.rows <= 0 or tbl.cols <= 0:
        return
    word_table = doc.add_table(rows=tbl.rows, cols=tbl.cols, style="Table Grid")
    for cell in tbl.cells:
        if 0 <= cell.row < tbl.rows and 0 <= cell.col < tbl.cols:
            word_table.cell(cell.row, cell.col).text = cell.text
def export_pdf_html(req: ExportRequest) -> bytes:
    """Render the request to HTML and convert that markup to PDF with WeasyPrint."""
    markup = export_html(req).decode("utf-8")

    # WeasyPrint is imported only once the HTML has been produced.
    from weasyprint import HTML

    return HTML(string=markup).write_pdf()
def export_document(req: ExportRequest) -> tuple[bytes, str, str]:
    """Run the exporter matching ``req.format``.

    Returns:
        A ``(payload, content_type, file_extension)`` tuple.

    Raises:
        ValueError: If the requested format matches none of the exporters.
    """
    fmt = req.format
    # Ordered routing table: (format, exporter, content type, extension).
    routes = (
        (ExportFormat.HTML, export_html, "text/html", "html"),
        (ExportFormat.JSON, export_json, "application/json", "json"),
        (
            ExportFormat.DOCX,
            export_docx,
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
            "docx",
        ),
        (ExportFormat.PDF, export_pdf_html, "application/pdf", "pdf"),
    )
    for candidate, exporter, content_type, extension in routes:
        if fmt == candidate:
            return exporter(req), content_type, extension
    raise ValueError(f"Unsupported format: {fmt}")