"""Export service — renders edited handbook content to PDF, DOCX, HTML, or JSON."""
from __future__ import annotations
import io
import json
import logging
from pathlib import Path
from app.schemas.extraction import (
BlockType,
ContentBlock,
ExportFormat,
ExportRequest,
PageResult,
TableBlock,
)
# Module-level logger, named after this module via ``__name__``.
logger = logging.getLogger(__name__)
# ── HTML builder ──
def _blocks_to_html(blocks: list[ContentBlock]) -> str:
    """Convert content blocks to an HTML fragment.

    Each block is rendered according to its ``block_type``:

    * ``HEADING``   -> ``<hN>``, where N is ``heading_level.value`` (2 when unset)
    * ``PARAGRAPH`` -> ``<p>``
    * ``LIST``      -> ``<ul>`` holding one ``<li>`` per entry of ``list_items``
    * ``TABLE``     -> delegated to ``_table_to_html`` (skipped when ``table`` is falsy)

    Blocks of any other type are ignored. All text goes through ``_esc``.

    Args:
        blocks: The blocks to render, in document order.

    Returns:
        The rendered elements joined with newlines ("" when nothing is rendered).
    """
    parts: list[str] = []
    for b in blocks:
        if b.block_type == BlockType.HEADING:
            # Fall back to a second-level heading when no level is set.
            lvl = b.heading_level.value if b.heading_level else 2
            parts.append(f"<h{lvl}>{_esc(b.text)}</h{lvl}>")
        elif b.block_type == BlockType.PARAGRAPH:
            parts.append(f"<p>{_esc(b.text)}</p>")
        elif b.block_type == BlockType.LIST:
            # NOTE(review): every list is rendered as <ul>; confirm whether
            # ordered lists exist in the schema and need <ol>.
            items = "".join(f"<li>{_esc(it.text)}</li>" for it in b.list_items)
            parts.append(f"<ul>{items}</ul>")
        elif b.block_type == BlockType.TABLE and b.table:
            parts.append(_table_to_html(b.table))
    return "\n".join(parts)
def _table_to_html(table: TableBlock) -> str:
    """Render a TableBlock as an HTML ``<table>`` element.

    Cells are grouped by their ``row`` index. Rows are emitted in ascending
    row order and the cells of each row in ascending ``col`` order. A cell
    whose ``is_header`` flag is set becomes ``<th>``, any other cell ``<td>``.
    Cell text goes through ``_esc``.

    Args:
        table: The table whose ``cells`` are rendered.

    Returns:
        The table markup, one element per line.
    """
    rows_map: dict[int, list[tuple[int, str, bool]]] = {}
    for c in table.cells:
        rows_map.setdefault(c.row, []).append((c.col, c.text, c.is_header))

    lines = ["<table>"]
    for ri in sorted(rows_map):
        lines.append("  <tr>")
        # Sort on the column index only, so cell text never takes part in
        # the comparison.
        cells = sorted(rows_map[ri], key=lambda x: x[0])
        for _ci, text, is_hdr in cells:
            tag = "th" if is_hdr else "td"
            lines.append(f"    <{tag}>{_esc(text)}</{tag}>")
        lines.append("  </tr>")
    lines.append("</table>")
    return "\n".join(lines)
def _esc(text: str) -> str:
"""Basic HTML entity escaping."""
return (
text
.replace("&", "&")
.replace("<", "<")
.replace(">", ">")
.replace('"', """)
)
def _full_html(title: str, body_html: str) -> str:
    """Wrap an HTML fragment in a complete, standalone HTML5 document.

    Args:
        title: Plain-text document title. It is escaped here and used both
            for the ``<title>`` element and for a leading ``<h1>``.
        body_html: Already-rendered markup; inserted verbatim, NOT escaped.

    Returns:
        The full document as a string. The charset is declared as UTF-8.
    """
    safe_title = _esc(title)
    return f"""<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title>{safe_title}</title>
</head>
<body>
<h1>{safe_title}</h1>
{body_html}
</body>
</html>
"""
# ── Export functions ──
def export_html(req: ExportRequest) -> bytes:
    """Export handbook pages as a single HTML document.

    Every page's blocks are rendered with ``_blocks_to_html`` and followed by
    a separator element; the result is wrapped by ``_full_html``.

    Args:
        req: The export request; ``pages`` and ``title`` are read. An empty
            or missing title falls back to "Handbook".

    Returns:
        The document encoded as UTF-8 bytes.
    """
    body_parts: list[str] = []
    for page in req.pages:
        body_parts.append(_blocks_to_html(page.blocks))
        # NOTE(review): <hr> is an assumed page separator — confirm the
        # intended element (e.g. a CSS page-break for the PDF path).
        body_parts.append("<hr>")
    html = _full_html(req.title or "Handbook", "\n".join(body_parts))
    return html.encode("utf-8")
def export_json(req: ExportRequest) -> bytes:
    """Serialize the request's document id, title and pages as JSON.

    Each page is converted with ``model_dump(mode="json")``. The output is
    indented by two spaces and keeps non-ASCII characters unescaped.

    Args:
        req: The export request; ``document_id``, ``title`` and ``pages``
            are read.

    Returns:
        The JSON text encoded as UTF-8 bytes.
    """
    payload = {"document_id": req.document_id, "title": req.title}
    payload["pages"] = [page.model_dump(mode="json") for page in req.pages]
    text = json.dumps(payload, indent=2, ensure_ascii=False)
    return text.encode("utf-8")
def export_docx(req: ExportRequest) -> bytes:
    """Export handbook pages as a .docx Word document.

    The document starts with a level-0 title heading (falling back to
    "Handbook"), followed by the blocks of every page. Consecutive pages are
    separated by a page break; no break is added after the last page, so the
    document does not end with an empty page.

    Block mapping:

    * ``HEADING``   -> Word heading, level capped at 4 (2 when unset)
    * ``PARAGRAPH`` -> plain paragraph
    * ``LIST``      -> one "List Bullet" paragraph per item
    * ``TABLE``     -> "Table Grid" table sized ``rows`` x ``cols``; cells
      whose indices fall outside that grid are skipped

    Args:
        req: The export request; ``title`` and ``pages`` are read.

    Returns:
        The serialized .docx file as bytes.
    """
    # Function-scope import: python-docx is loaded only when this exporter runs.
    from docx import Document
    from docx.shared import RGBColor

    doc = Document()

    # Title
    title_para = doc.add_heading(req.title or "Handbook", level=0)
    for run in title_para.runs:
        run.font.color.rgb = RGBColor(0x0B, 0x3F, 0x74)

    for page_index, page in enumerate(req.pages):
        # Break *between* pages only; a break after the final page would
        # leave a trailing blank page.
        if page_index:
            doc.add_page_break()

        for block in page.blocks:
            if block.block_type == BlockType.HEADING:
                lvl = block.heading_level.value if block.heading_level else 2
                # python-docx accepts heading levels 0-9; this exporter caps at 4.
                lvl = min(lvl, 4)
                doc.add_heading(block.text, level=lvl)
            elif block.block_type == BlockType.PARAGRAPH:
                doc.add_paragraph(block.text)
            elif block.block_type == BlockType.LIST:
                for item in block.list_items:
                    doc.add_paragraph(item.text, style="List Bullet")
            elif block.block_type == BlockType.TABLE and block.table:
                tbl = block.table
                if tbl.rows > 0 and tbl.cols > 0:
                    word_table = doc.add_table(
                        rows=tbl.rows, cols=tbl.cols, style="Table Grid"
                    )
                    for cell in tbl.cells:
                        # Reject negative indices as well as overflow so a
                        # malformed cell cannot land in an unrelated slot.
                        if 0 <= cell.row < tbl.rows and 0 <= cell.col < tbl.cols:
                            word_table.cell(cell.row, cell.col).text = cell.text

    buf = io.BytesIO()
    doc.save(buf)
    return buf.getvalue()
def export_pdf_html(req: ExportRequest) -> bytes:
    """Render the handbook to PDF by feeding the HTML export to WeasyPrint.

    Args:
        req: The export request, passed unchanged to ``export_html``.

    Returns:
        Whatever ``write_pdf()`` returns for the generated HTML.
    """
    markup = export_html(req).decode("utf-8")

    # Function-scope import: WeasyPrint is loaded only once the HTML is built.
    from weasyprint import HTML

    return HTML(string=markup).write_pdf()
def export_document(req: ExportRequest) -> tuple[bytes, str, str]:
    """Run the exporter matching ``req.format``.

    Args:
        req: The export request; ``format`` selects the exporter.

    Returns:
        A ``(payload, content_type, extension)`` tuple.

    Raises:
        ValueError: If ``req.format`` matches none of the supported formats.
    """
    fmt = req.format
    # Ordered (format, exporter, content type, extension) rows, scanned with
    # ``==`` so matching behaves like a chain of equality tests.
    exporters = (
        (ExportFormat.HTML, export_html, "text/html", "html"),
        (ExportFormat.JSON, export_json, "application/json", "json"),
        (
            ExportFormat.DOCX,
            export_docx,
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
            "docx",
        ),
        (ExportFormat.PDF, export_pdf_html, "application/pdf", "pdf"),
    )
    for candidate, render, content_type, extension in exporters:
        if fmt == candidate:
            return render(req), content_type, extension
    raise ValueError(f"Unsupported format: {fmt}")