"""Export service — renders edited handbook content to PDF, DOCX, HTML, or JSON."""
from __future__ import annotations
import io
import json
import logging
from pathlib import Path
from app.schemas.extraction import (
BlockType,
ContentBlock,
ExportFormat,
ExportRequest,
PageResult,
TableBlock,
)
logger = logging.getLogger(__name__)
# ── HTML builder ──
def _blocks_to_html(blocks: list[ContentBlock]) -> str:
    """Render a sequence of content blocks as an HTML fragment.

    Headings become ``<hN>`` (level 2 when the block carries no heading
    level), paragraphs become ``<p>``, lists become ``<ul>`` with one
    ``<li>`` per item, and tables are delegated to ``_table_to_html``.
    Blocks of any other type, and table blocks without a table, are skipped.
    All text is passed through ``_esc``.

    Returns:
        The rendered fragments joined with newlines.
    """
    fragments: list[str] = []
    for block in blocks:
        kind = block.block_type
        if kind == BlockType.HEADING:
            level = 2
            if block.heading_level:
                level = block.heading_level.value
            fragments.append(f"<h{level}>{_esc(block.text)}</h{level}>")
        elif kind == BlockType.PARAGRAPH:
            fragments.append(f"<p>{_esc(block.text)}</p>")
        elif kind == BlockType.LIST:
            rendered_items = [f"<li>{_esc(entry.text)}</li>" for entry in block.list_items]
            fragments.append(f"<ul>{''.join(rendered_items)}</ul>")
        elif kind == BlockType.TABLE and block.table:
            fragments.append(_table_to_html(block.table))
    return "\n".join(fragments)
def _table_to_html(table: TableBlock) -> str:
    """Render a TableBlock as an HTML ``<table>`` element.

    Cells are grouped by their ``row`` index; rows are emitted in ascending
    row order and, within a row, cells in ascending ``col`` order. Header
    cells are written as ``<th>``, all others as ``<td>``. Cell text is
    passed through ``_esc``.

    Returns:
        The table markup, one tag per line.
    """
    # row index -> [(col, text, is_header), ...] in the order the cells arrive
    by_row: dict[int, list[tuple[int, str, bool]]] = {}
    for cell in table.cells:
        entry = (cell.col, cell.text, cell.is_header)
        by_row.setdefault(cell.row, []).append(entry)

    lines = ['<table border="1" cellpadding="4" cellspacing="0" style="border-collapse:collapse;">']
    for row_index in sorted(by_row):
        ordered_cells = sorted(by_row[row_index], key=lambda item: item[0])
        lines.append(" <tr>")
        for _col, cell_text, header_flag in ordered_cells:
            if header_flag:
                tag = "th"
            else:
                tag = "td"
            lines.append(f" <{tag}>{_esc(cell_text)}</{tag}>")
        lines.append(" </tr>")
    lines.append("</table>")
    return "\n".join(lines)
def _esc(text: str) -> str:
"""Basic HTML entity escaping."""
return (
text
.replace("&", "&")
.replace("<", "<")
.replace(">", ">")
.replace('"', """)
)
def _full_html(title: str, body_html: str) -> str:
    """Wrap an HTML body fragment in a complete, styled HTML document.

    The title is passed through ``_esc`` and used for both the ``<title>``
    element and the leading ``<h1>``. ``body_html`` is inserted as-is, so it
    must already be valid (escaped) markup.
    """
    safe_title = _esc(title)
    document = f"""<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8"/>
<title>{safe_title}</title>
<style>
body {{ font-family: 'Segoe UI', Tahoma, sans-serif; max-width: 900px; margin: 2rem auto; padding: 0 1rem; color: #1a1a1a; line-height: 1.6; }}
h1, h2, h3, h4, h5, h6 {{ color: #0b3f74; margin-top: 1.5em; }}
table {{ width: 100%; margin: 1em 0; font-size: 0.9em; }}
th {{ background: #0b3f74; color: #fff; text-align: left; }}
td, th {{ padding: 6px 10px; }}
tr:nth-child(even) {{ background: #f5f8fc; }}
ul {{ margin: 0.5em 0; padding-left: 1.5em; }}
</style>
</head>
<body>
<h1>{safe_title}</h1>
{body_html}
</body>
</html>"""
    return document
# ── Export functions ──
def export_html(req: ExportRequest) -> bytes:
    """Export all pages of the request as one UTF-8 encoded HTML document.

    Each page's blocks are rendered with ``_blocks_to_html`` and followed by
    an ``<hr/>``; the document title falls back to "Handbook" when the
    request has no title.
    """
    # One "<page markup>\n<hr/>" section per page, in page order.
    sections = [f"{_blocks_to_html(page.blocks)}\n<hr/>" for page in req.pages]
    document = _full_html(req.title or "Handbook", "\n".join(sections))
    return document.encode("utf-8")
def export_json(req: ExportRequest) -> bytes:
    """Serialize the request's document id, title and pages to UTF-8 JSON.

    Pages are dumped with ``model_dump(mode="json")``. The output is
    indented by two spaces and non-ASCII characters are written as-is.
    """
    payload: dict = {}
    payload["document_id"] = req.document_id
    payload["title"] = req.title
    payload["pages"] = [page.model_dump(mode="json") for page in req.pages]
    text = json.dumps(payload, indent=2, ensure_ascii=False)
    return text.encode("utf-8")
def _add_docx_table(doc, table: TableBlock) -> None:
    """Append *table* to the python-docx document *doc* as a "Table Grid" table.

    Nothing is added when the table declares no rows or no columns. Cells
    whose coordinates fall outside the declared grid are ignored. Header
    cells are written in bold so they stay distinguishable, mirroring the
    ``<th>`` cells of the HTML export.
    """
    if table.rows <= 0 or table.cols <= 0:
        return
    word_table = doc.add_table(rows=table.rows, cols=table.cols, style="Table Grid")
    for cell in table.cells:
        # Check the lower bound too: a negative index is not a valid grid
        # position and must not be handed to python-docx.
        if not (0 <= cell.row < table.rows and 0 <= cell.col < table.cols):
            continue
        target = word_table.cell(cell.row, cell.col)
        target.text = cell.text
        if cell.is_header:
            for paragraph in target.paragraphs:
                for run in paragraph.runs:
                    run.bold = True


def export_docx(req: ExportRequest) -> bytes:
    """Export handbook pages as a .docx Word document.

    The document starts with a coloured title heading ("Handbook" when the
    request has no title), followed by every page's blocks in order:
    headings, paragraphs, bulleted list items and tables. Consecutive pages
    are separated by a page break; no break is added after the last page.
    Blocks of any other type, and table blocks without a table, are skipped.

    Args:
        req: Export request; its ``title`` and ``pages`` are read.

    Returns:
        The serialized .docx file content.
    """
    # Imported inside the function, so python-docx is only needed when a
    # DOCX export is actually requested.
    from docx import Document
    from docx.shared import RGBColor

    doc = Document()

    # Title
    title_para = doc.add_heading(req.title or "Handbook", level=0)
    for run in title_para.runs:
        run.font.color.rgb = RGBColor(0x0B, 0x3F, 0x74)

    for page_index, page in enumerate(req.pages):
        # Break only *between* pages: a break after the final page would
        # leave an empty page at the end of the document.
        if page_index:
            doc.add_page_break()
        for block in page.blocks:
            if block.block_type == BlockType.HEADING:
                lvl = block.heading_level.value if block.heading_level else 2
                # python-docx accepts heading levels 0-9; deeper headings
                # are deliberately flattened to level 4 here.
                lvl = min(lvl, 4)
                doc.add_heading(block.text, level=lvl)
            elif block.block_type == BlockType.PARAGRAPH:
                doc.add_paragraph(block.text)
            elif block.block_type == BlockType.LIST:
                for item in block.list_items:
                    doc.add_paragraph(item.text, style="List Bullet")
            elif block.block_type == BlockType.TABLE and block.table:
                _add_docx_table(doc, block.table)

    buf = io.BytesIO()
    doc.save(buf)
    return buf.getvalue()
def export_pdf_html(req: ExportRequest) -> bytes:
    """Render the request to PDF by feeding its HTML export to WeasyPrint."""
    markup = export_html(req).decode("utf-8")

    # Imported at call time, after the HTML has been produced.
    from weasyprint import HTML

    return HTML(string=markup).write_pdf()
def export_document(req: ExportRequest) -> tuple[bytes, str, str]:
    """Run the exporter matching ``req.format``.

    Returns:
        A ``(content, content_type, extension)`` tuple for the chosen format.

    Raises:
        ValueError: If ``req.format`` matches none of the supported formats.
    """
    fmt = req.format
    # Checked in order with ``==``; only the matching exporter is invoked.
    routes = (
        (ExportFormat.HTML, export_html, "text/html", "html"),
        (ExportFormat.JSON, export_json, "application/json", "json"),
        (ExportFormat.DOCX, export_docx, "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "docx"),
        (ExportFormat.PDF, export_pdf_html, "application/pdf", "pdf"),
    )
    for candidate, exporter, content_type, extension in routes:
        if fmt == candidate:
            return exporter(req), content_type, extension
    raise ValueError(f"Unsupported format: {fmt}")
|