| """Export service — renders edited handbook content to PDF, DOCX, HTML, or JSON."""
|
|
|
| from __future__ import annotations
|
|
|
| import io
|
| import json
|
| import logging
|
| from pathlib import Path
|
|
|
| from app.schemas.extraction import (
|
| BlockType,
|
| ContentBlock,
|
| ExportFormat,
|
| ExportRequest,
|
| PageResult,
|
| TableBlock,
|
| )
|
|
|
| logger = logging.getLogger(__name__)
|
|
|
|
|
|
|
|
|
|
|
def _blocks_to_html(blocks: list[ContentBlock]) -> str:
    """Render content blocks as HTML fragments joined by newlines.

    Headings become ``<hN>`` (level 2 when the block carries no heading
    level), paragraphs become ``<p>``, lists become a ``<ul>`` with one
    ``<li>`` per item, and table blocks that carry a table are delegated to
    ``_table_to_html``.  Blocks matching none of these cases produce no
    output.  All text is passed through ``_esc``.
    """
    fragments: list[str] = []
    for block in blocks:
        kind = block.block_type

        if kind == BlockType.HEADING:
            level = 2
            if block.heading_level:
                level = block.heading_level.value
            fragments.append(f"<h{level}>{_esc(block.text)}</h{level}>")
            continue

        if kind == BlockType.PARAGRAPH:
            fragments.append(f"<p>{_esc(block.text)}</p>")
            continue

        if kind == BlockType.LIST:
            entries = [f"<li>{_esc(entry.text)}</li>" for entry in block.list_items]
            fragments.append("<ul>" + "".join(entries) + "</ul>")
            continue

        if kind == BlockType.TABLE and block.table:
            fragments.append(_table_to_html(block.table))

    return "\n".join(fragments)
|
|
|
|
|
def _table_to_html(table: TableBlock) -> str:
    """Render a TableBlock as an HTML ``<table>`` element.

    Cells are grouped by their ``row`` attribute; rows are emitted in
    ascending row order and, within a row, cells in ascending ``col`` order.
    Header cells are written as ``<th>``, all others as ``<td>``, and cell
    text is escaped with ``_esc``.
    """
    by_row: dict[int, list[tuple[int, str, bool]]] = {}
    for cell in table.cells:
        entry = (cell.col, cell.text, cell.is_header)
        if cell.row in by_row:
            by_row[cell.row].append(entry)
        else:
            by_row[cell.row] = [entry]

    lines = ['<table border="1" cellpadding="4" cellspacing="0" style="border-collapse:collapse;">']
    for row_index in sorted(by_row):
        lines.append(" <tr>")
        # Sort on the column only, so cells sharing a column keep input order.
        ordered = sorted(by_row[row_index], key=lambda entry: entry[0])
        for _col, cell_text, header in ordered:
            if header:
                tag = "th"
            else:
                tag = "td"
            lines.append(f" <{tag}>{_esc(cell_text)}</{tag}>")
        lines.append(" </tr>")
    lines.append("</table>")
    return "\n".join(lines)
|
|
|
|
|
def _esc(text: str) -> str:
    """Escape the characters that are significant in HTML markup.

    Replaces ``&``, ``<``, ``>`` and ``"`` with their named character
    references so the result can be embedded in element content or inside
    a double-quoted attribute value.

    Args:
        text: Raw text to embed in HTML.

    Returns:
        The text with the four characters above replaced by entities.
    """
    # The ampersand must be handled first; otherwise the "&" introduced by
    # the later replacements would itself be escaped a second time.
    return (
        text.replace("&", "&amp;")
        .replace("<", "&lt;")
        .replace(">", "&gt;")
        .replace('"', "&quot;")
    )
|
|
|
|
|
def _full_html(title: str, body_html: str) -> str:
    """Wrap pre-rendered body markup in a complete, styled HTML document.

    The title is escaped with ``_esc`` and used both for ``<title>`` and for
    the top-level ``<h1>``; ``body_html`` is inserted verbatim.
    """
    safe_title = _esc(title)
    stylesheet = (
        "body { font-family: 'Segoe UI', Tahoma, sans-serif; max-width: 900px; margin: 2rem auto; padding: 0 1rem; color: #1a1a1a; line-height: 1.6; }",
        "h1, h2, h3, h4, h5, h6 { color: #0b3f74; margin-top: 1.5em; }",
        "table { width: 100%; margin: 1em 0; font-size: 0.9em; }",
        "th { background: #0b3f74; color: #fff; text-align: left; }",
        "td, th { padding: 6px 10px; }",
        "tr:nth-child(even) { background: #f5f8fc; }",
        "ul { margin: 0.5em 0; padding-left: 1.5em; }",
    )
    document = [
        "<!DOCTYPE html>",
        '<html lang="en">',
        "<head>",
        '<meta charset="utf-8"/>',
        f"<title>{safe_title}</title>",
        "<style>",
        *stylesheet,
        "</style>",
        "</head>",
        "<body>",
        f"<h1>{safe_title}</h1>",
        f"{body_html}",
        "</body>",
        "</html>",
    ]
    return "\n".join(document)
|
|
|
|
|
|
|
|
|
|
|
def export_html(req: ExportRequest) -> bytes:
    """Render every page of the request into one UTF-8 encoded HTML document.

    Each page's blocks are converted with ``_blocks_to_html`` and followed by
    an ``<hr/>`` rule.  The document title falls back to "Handbook" when the
    request has no title.
    """
    sections = [
        piece
        for page in req.pages
        for piece in (_blocks_to_html(page.blocks), "<hr/>")
    ]
    document = _full_html(req.title or "Handbook", "\n".join(sections))
    return document.encode("utf-8")
|
|
|
|
|
def export_json(req: ExportRequest) -> bytes:
    """Serialize the request's document id, title and pages as UTF-8 JSON.

    Pages are dumped with ``model_dump(mode="json")``.  The output is
    indented by two spaces and non-ASCII characters are written unescaped.
    """
    payload = dict(document_id=req.document_id, title=req.title)
    payload["pages"] = [page.model_dump(mode="json") for page in req.pages]
    text = json.dumps(payload, indent=2, ensure_ascii=False)
    return text.encode("utf-8")
|
|
|
|
|
def export_docx(req: ExportRequest) -> bytes:
    """Export handbook pages as a .docx Word document.

    The document starts with a coloured title heading (the request title, or
    "Handbook" when the request has none), followed by the blocks of every
    page.  Consecutive pages are separated by a page break.

    Args:
        req: Export request providing ``title`` and ``pages``; each page
            provides ``blocks``.

    Returns:
        The serialized .docx file content.
    """
    # Imported inside the function, so docx is only loaded when this
    # exporter actually runs.
    from docx import Document
    from docx.shared import RGBColor

    doc = Document()

    title_para = doc.add_heading(req.title or "Handbook", level=0)
    for run in title_para.runs:
        run.font.color.rgb = RGBColor(0x0B, 0x3F, 0x74)

    for page_index, page in enumerate(req.pages):
        # Break *between* pages only: a break after the final page would
        # leave an empty trailing page in the exported document.
        if page_index:
            doc.add_page_break()

        for block in page.blocks:
            if block.block_type == BlockType.HEADING:
                lvl = block.heading_level.value if block.heading_level else 2
                # Heading levels deeper than 4 are flattened to level 4.
                lvl = min(lvl, 4)
                doc.add_heading(block.text, level=lvl)
            elif block.block_type == BlockType.PARAGRAPH:
                doc.add_paragraph(block.text)
            elif block.block_type == BlockType.LIST:
                for item in block.list_items:
                    doc.add_paragraph(item.text, style="List Bullet")
            elif block.block_type == BlockType.TABLE and block.table:
                tbl = block.table
                if tbl.rows > 0 and tbl.cols > 0:
                    word_table = doc.add_table(rows=tbl.rows, cols=tbl.cols, style="Table Grid")
                    for cell in tbl.cells:
                        # Skip cells outside the declared grid.  The lower
                        # bound is checked too, because a negative index is
                        # not a valid grid position and must not be
                        # forwarded to the table lookup.
                        if 0 <= cell.row < tbl.rows and 0 <= cell.col < tbl.cols:
                            word_table.cell(cell.row, cell.col).text = cell.text

    buf = io.BytesIO()
    doc.save(buf)
    return buf.getvalue()
|
|
|
|
|
def export_pdf_html(req: ExportRequest) -> bytes:
    """Render the request to HTML and convert that markup to PDF with WeasyPrint."""
    markup = export_html(req).decode("utf-8")

    # WeasyPrint is imported at call time, after the HTML has been produced.
    from weasyprint import HTML

    return HTML(string=markup).write_pdf()
|
|
|
|
|
def export_document(req: ExportRequest) -> tuple[bytes, str, str]:
    """Run the exporter matching ``req.format``.

    Returns a ``(payload, content_type, extension)`` tuple.  Raises
    ``ValueError`` when the requested format has no exporter.
    """
    fmt = req.format
    # Ordered table of (format, exporter, content type, file extension).
    # It is scanned with ``==`` and only the matching exporter is called.
    exporters = (
        (ExportFormat.HTML, export_html, "text/html", "html"),
        (ExportFormat.JSON, export_json, "application/json", "json"),
        (
            ExportFormat.DOCX,
            export_docx,
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
            "docx",
        ),
        (ExportFormat.PDF, export_pdf_html, "application/pdf", "pdf"),
    )
    for candidate, render, content_type, extension in exporters:
        if fmt == candidate:
            return render(req), content_type, extension
    raise ValueError(f"Unsupported format: {fmt}")
|
|
|