File size: 6,113 Bytes
b12284c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
"""Export service — renders edited handbook content to PDF, DOCX, HTML, or JSON."""

from __future__ import annotations

import io
import json
import logging
from pathlib import Path

from app.schemas.extraction import (
    BlockType,
    ContentBlock,
    ExportFormat,
    ExportRequest,
    PageResult,
    TableBlock,
)

logger = logging.getLogger(__name__)


# ── HTML builder ──


def _blocks_to_html(blocks: list[ContentBlock]) -> str:
    """Render content blocks as HTML fragments joined by newlines.

    Heading, paragraph, list and table blocks each produce one fragment.
    Blocks of any other type, and table blocks whose ``table`` is falsy,
    produce no output. All text is passed through ``_esc``.
    """
    fragments: list[str] = []
    for block in blocks:
        kind = block.block_type

        if kind == BlockType.HEADING:
            # A heading without an explicit level is rendered as <h2>.
            level = 2 if not block.heading_level else block.heading_level.value
            fragments.append(f"<h{level}>{_esc(block.text)}</h{level}>")
            continue

        if kind == BlockType.PARAGRAPH:
            fragments.append(f"<p>{_esc(block.text)}</p>")
            continue

        if kind == BlockType.LIST:
            entries = [f"<li>{_esc(entry.text)}</li>" for entry in block.list_items]
            fragments.append("<ul>" + "".join(entries) + "</ul>")
            continue

        if kind == BlockType.TABLE and block.table:
            fragments.append(_table_to_html(block.table))

    return "\n".join(fragments)


def _table_to_html(table: TableBlock) -> str:
    """Serialize a TableBlock into an HTML ``<table>`` element.

    Cells are grouped by their ``row`` value. Rows are emitted in ascending
    row order and, within a row, cells in ascending ``col`` order. Header
    cells become ``<th>``, all others ``<td>``. Missing cells are not padded.
    """
    grouped: dict[int, list[tuple[int, str, bool]]] = {}
    for cell in table.cells:
        entry = (cell.col, cell.text, cell.is_header)
        grouped.setdefault(cell.row, []).append(entry)

    lines = ['<table border="1" cellpadding="4" cellspacing="0" style="border-collapse:collapse;">']
    for row_index in sorted(grouped):
        ordered = sorted(grouped[row_index], key=lambda item: item[0])
        lines.append("  <tr>")
        for _col, text, is_header in ordered:
            if is_header:
                tag = "th"
            else:
                tag = "td"
            lines.append(f"    <{tag}>{_esc(text)}</{tag}>")
        lines.append("  </tr>")
    lines.append("</table>")
    return "\n".join(lines)


# Single-pass translation table for the four characters escaped by _esc.
_HTML_ESCAPE_TABLE = str.maketrans(
    {
        "&": "&amp;",
        "<": "&lt;",
        ">": "&gt;",
        '"': "&quot;",
    }
)


def _esc(text: str) -> str:
    """Escape ``&``, ``<``, ``>`` and ``"`` as HTML entities.

    Single quotes and all other characters are returned unchanged.
    """
    return text.translate(_HTML_ESCAPE_TABLE)


def _full_html(title: str, body_html: str) -> str:
    """Wrap pre-rendered body markup in a complete, styled HTML document.

    The title is escaped with ``_esc`` and used both for the ``<title>``
    element and for the top-level ``<h1>``. ``body_html`` is inserted as-is,
    without escaping.
    """
    safe_title = _esc(title)
    return f"""<!DOCTYPE html>

<html lang="en">

<head>

<meta charset="utf-8"/>

<title>{safe_title}</title>

<style>

  body {{ font-family: 'Segoe UI', Tahoma, sans-serif; max-width: 900px; margin: 2rem auto; padding: 0 1rem; color: #1a1a1a; line-height: 1.6; }}

  h1, h2, h3, h4, h5, h6 {{ color: #0b3f74; margin-top: 1.5em; }}

  table {{ width: 100%; margin: 1em 0; font-size: 0.9em; }}

  th {{ background: #0b3f74; color: #fff; text-align: left; }}

  td, th {{ padding: 6px 10px; }}

  tr:nth-child(even) {{ background: #f5f8fc; }}

  ul {{ margin: 0.5em 0; padding-left: 1.5em; }}

</style>

</head>

<body>

<h1>{safe_title}</h1>

{body_html}

</body>

</html>"""


# ── Export functions ──


def export_html(req: ExportRequest) -> bytes:
    """Render all pages of the request into one UTF-8 encoded HTML document.

    Each page's blocks are converted to HTML and followed by an ``<hr/>``
    separator. The document title is ``req.title``, or "Handbook" when the
    title is falsy.
    """
    sections = [
        fragment
        for page in req.pages
        for fragment in (_blocks_to_html(page.blocks), "<hr/>")
    ]
    document = _full_html(req.title or "Handbook", "\n".join(sections))
    return document.encode("utf-8")


def export_json(req: ExportRequest) -> bytes:
    """Serialize the request's document id, title and pages as UTF-8 JSON.

    Every page is converted with ``model_dump(mode="json")``. The output is
    indented by two spaces and non-ASCII characters are written unescaped.
    """
    payload = dict(
        document_id=req.document_id,
        title=req.title,
        pages=[page.model_dump(mode="json") for page in req.pages],
    )
    serialized = json.dumps(payload, ensure_ascii=False, indent=2)
    return serialized.encode("utf-8")


def export_docx(req: ExportRequest) -> bytes:
    """Export handbook pages as a .docx Word document.

    The document starts with a coloured title heading (``req.title``, or
    "Handbook" when the title is falsy), followed by each page's blocks:
    headings, paragraphs, bulleted list items and grid-styled tables.
    Consecutive pages are separated by a page break.

    Args:
        req: Export request; its ``title`` and ``pages`` are read.

    Returns:
        The bytes of the generated .docx file.
    """
    # Function-scope imports: python-docx is only loaded when this exporter runs.
    from docx import Document
    from docx.shared import RGBColor

    doc = Document()

    # Title, tinted dark blue (#0B3F74).
    title_para = doc.add_heading(req.title or "Handbook", level=0)
    for run in title_para.runs:
        run.font.color.rgb = RGBColor(0x0B, 0x3F, 0x74)

    for page_index, page in enumerate(req.pages):
        # Break only *between* pages: a break after the final page would
        # leave an empty trailing page in the exported document.
        if page_index > 0:
            doc.add_page_break()

        for block in page.blocks:
            if block.block_type == BlockType.HEADING:
                lvl = block.heading_level.value if block.heading_level else 2
                # Headings deeper than level 4 are flattened to level 4.
                # NOTE(review): python-docx itself accepts levels 0-9, so the
                # cap is a choice of this exporter; confirm it is intended.
                lvl = min(lvl, 4)
                doc.add_heading(block.text, level=lvl)
            elif block.block_type == BlockType.PARAGRAPH:
                doc.add_paragraph(block.text)
            elif block.block_type == BlockType.LIST:
                for item in block.list_items:
                    doc.add_paragraph(item.text, style="List Bullet")
            elif block.block_type == BlockType.TABLE and block.table:
                tbl = block.table
                if tbl.rows > 0 and tbl.cols > 0:
                    word_table = doc.add_table(rows=tbl.rows, cols=tbl.cols, style="Table Grid")
                    for cell in tbl.cells:
                        # Skip cells whose coordinates fall outside the
                        # declared grid (negative ones included) instead of
                        # raising or writing into an unrelated cell.
                        if 0 <= cell.row < tbl.rows and 0 <= cell.col < tbl.cols:
                            word_table.cell(cell.row, cell.col).text = cell.text

    buf = io.BytesIO()
    doc.save(buf)
    return buf.getvalue()


def export_pdf_html(req: ExportRequest) -> bytes:
    """Produce a PDF by rendering the HTML export through WeasyPrint.

    The request is first exported with ``export_html``; the resulting UTF-8
    markup is decoded and handed to WeasyPrint, whose PDF output is returned.
    """
    markup = export_html(req).decode("utf-8")

    # Function-scope import: WeasyPrint is only loaded when a PDF is requested.
    from weasyprint import HTML

    return HTML(string=markup).write_pdf()


def export_document(req: ExportRequest) -> tuple[bytes, str, str]:
    """Run the exporter matching ``req.format``.

    Returns:
        A ``(payload, content_type, extension)`` tuple for the chosen format.

    Raises:
        ValueError: If ``req.format`` equals none of the supported formats.
    """
    requested = req.format
    # Checked in order; the first entry whose format equals the request wins.
    exporters = (
        (ExportFormat.HTML, export_html, "text/html", "html"),
        (ExportFormat.JSON, export_json, "application/json", "json"),
        (
            ExportFormat.DOCX,
            export_docx,
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
            "docx",
        ),
        (ExportFormat.PDF, export_pdf_html, "application/pdf", "pdf"),
    )
    for candidate, exporter, content_type, extension in exporters:
        if requested == candidate:
            return exporter(req), content_type, extension
    raise ValueError(f"Unsupported format: {requested}")