"""Export service — renders edited handbook content to PDF, DOCX, HTML, or JSON.""" from __future__ import annotations import io import json import logging from pathlib import Path from app.schemas.extraction import ( BlockType, ContentBlock, ExportFormat, ExportRequest, PageResult, TableBlock, ) logger = logging.getLogger(__name__) # ── HTML builder ── def _blocks_to_html(blocks: list[ContentBlock]) -> str: """Convert content blocks to HTML string.""" parts: list[str] = [] for b in blocks: if b.block_type == BlockType.HEADING: lvl = b.heading_level.value if b.heading_level else 2 parts.append(f"{_esc(b.text)}") elif b.block_type == BlockType.PARAGRAPH: parts.append(f"

{_esc(b.text)}

") elif b.block_type == BlockType.LIST: items = "".join(f"
  • {_esc(it.text)}
  • " for it in b.list_items) parts.append(f"") elif b.block_type == BlockType.TABLE and b.table: parts.append(_table_to_html(b.table)) return "\n".join(parts) def _table_to_html(table: TableBlock) -> str: """Render a TableBlock as an HTML .""" rows_map: dict[int, list[tuple[int, str, bool]]] = {} for c in table.cells: rows_map.setdefault(c.row, []).append((c.col, c.text, c.is_header)) html = ['
    '] for ri in sorted(rows_map): html.append(" ") cells = sorted(rows_map[ri], key=lambda x: x[0]) for _ci, text, is_hdr in cells: tag = "th" if is_hdr else "td" html.append(f" <{tag}>{_esc(text)}") html.append(" ") html.append("
    ") return "\n".join(html) def _esc(text: str) -> str: """Basic HTML entity escaping.""" return ( text .replace("&", "&") .replace("<", "<") .replace(">", ">") .replace('"', """) ) def _full_html(title: str, body_html: str) -> str: return f""" {_esc(title)}

    {_esc(title)}

    {body_html} """ # ── Export functions ── def export_html(req: ExportRequest) -> bytes: """Export handbook pages as a single HTML document.""" body_parts: list[str] = [] for page in req.pages: body_parts.append(_blocks_to_html(page.blocks)) body_parts.append("
    ") html = _full_html(req.title or "Handbook", "\n".join(body_parts)) return html.encode("utf-8") def export_json(req: ExportRequest) -> bytes: """Export the raw page data as JSON.""" data = { "document_id": req.document_id, "title": req.title, "pages": [p.model_dump(mode="json") for p in req.pages], } return json.dumps(data, indent=2, ensure_ascii=False).encode("utf-8") def export_docx(req: ExportRequest) -> bytes: """Export handbook pages as a .docx Word document.""" from docx import Document from docx.shared import Pt, Inches, RGBColor from docx.enum.text import WD_ALIGN_PARAGRAPH doc = Document() # Title title_para = doc.add_heading(req.title or "Handbook", level=0) for run in title_para.runs: run.font.color.rgb = RGBColor(0x0B, 0x3F, 0x74) for page in req.pages: for block in page.blocks: if block.block_type == BlockType.HEADING: lvl = block.heading_level.value if block.heading_level else 2 lvl = min(lvl, 4) # python-docx supports 0-9 doc.add_heading(block.text, level=lvl) elif block.block_type == BlockType.PARAGRAPH: doc.add_paragraph(block.text) elif block.block_type == BlockType.LIST: for item in block.list_items: doc.add_paragraph(item.text, style="List Bullet") elif block.block_type == BlockType.TABLE and block.table: tbl = block.table if tbl.rows > 0 and tbl.cols > 0: word_table = doc.add_table(rows=tbl.rows, cols=tbl.cols, style="Table Grid") for cell in tbl.cells: if cell.row < tbl.rows and cell.col < tbl.cols: word_table.cell(cell.row, cell.col).text = cell.text doc.add_page_break() buf = io.BytesIO() doc.save(buf) return buf.getvalue() def export_pdf_html(req: ExportRequest) -> bytes: """Export as PDF via WeasyPrint (HTML→PDF).""" html_bytes = export_html(req) from weasyprint import HTML pdf_bytes = HTML(string=html_bytes.decode("utf-8")).write_pdf() return pdf_bytes def export_document(req: ExportRequest) -> tuple[bytes, str, str]: """Dispatch export by format. Returns (bytes, content_type, extension).""" fmt = req.format if fmt == ExportFormat.HTML: return export_html(req), "text/html", "html" if fmt == ExportFormat.JSON: return export_json(req), "application/json", "json" if fmt == ExportFormat.DOCX: return export_docx(req), "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "docx" if fmt == ExportFormat.PDF: return export_pdf_html(req), "application/pdf", "pdf" raise ValueError(f"Unsupported format: {fmt}")