# Spaces: internationalscholarsprogram/handbook-ocr-engine (Sleeping)
# File size: 6,113 Bytes
# b12284c

"""Export service — renders edited handbook content to PDF, DOCX, HTML, or JSON."""

from __future__ import annotations

import io
import json
import logging
from pathlib import Path

from app.schemas.extraction import (
    BlockType,
    ContentBlock,
    ExportFormat,
    ExportRequest,
    PageResult,
    TableBlock,
)

logger = logging.getLogger(__name__)


# ── HTML builder ──


def _blocks_to_html(blocks: list[ContentBlock]) -> str:
    """Render content blocks as HTML fragments joined by newlines.

    Headings become ``<hN>`` (level 2 when the block has no heading level),
    paragraphs become ``<p>``, lists become ``<ul>`` with one ``<li>`` per
    item, and tables are delegated to ``_table_to_html``. Blocks of any other
    type, and table blocks without table data, produce no output.
    """
    fragments: list[str] = []
    for block in blocks:
        kind = block.block_type

        if kind == BlockType.HEADING:
            if block.heading_level:
                level = block.heading_level.value
            else:
                level = 2
            fragments.append(f"<h{level}>{_esc(block.text)}</h{level}>")
            continue

        if kind == BlockType.PARAGRAPH:
            fragments.append(f"<p>{_esc(block.text)}</p>")
            continue

        if kind == BlockType.LIST:
            bullets = [f"<li>{_esc(entry.text)}</li>" for entry in block.list_items]
            fragments.append("<ul>" + "".join(bullets) + "</ul>")
            continue

        if kind == BlockType.TABLE and block.table:
            fragments.append(_table_to_html(block.table))

    return "\n".join(fragments)


def _table_to_html(table: TableBlock) -> str:
    """Build an HTML ``<table>`` from the cells of *table*.

    Cells are grouped by their ``row`` value; rows are emitted in ascending
    row order and, within a row, cells in ascending ``col`` order. Header
    cells are written as ``<th>``, all others as ``<td>``, with escaped text.
    """
    grouped: dict[int, list[tuple[int, str, bool]]] = {}
    for cell in table.cells:
        bucket = grouped.setdefault(cell.row, [])
        bucket.append((cell.col, cell.text, cell.is_header))

    lines = ['<table border="1" cellpadding="4" cellspacing="0" style="border-collapse:collapse;">']
    for row_index in sorted(grouped):
        # Order by column only, so cells sharing a column keep insertion order.
        ordered = list(grouped[row_index])
        ordered.sort(key=lambda entry: entry[0])

        lines.append("  <tr>")
        for _col, text, is_header in ordered:
            if is_header:
                tag = "th"
            else:
                tag = "td"
            lines.append(f"    <{tag}>{_esc(text)}</{tag}>")
        lines.append("  </tr>")
    lines.append("</table>")
    return "\n".join(lines)


def _esc(text: str) -> str:
    """Escape ``&``, ``<``, ``>`` and ``"`` as HTML entities."""
    # "&" must be handled first; otherwise the ampersands introduced by the
    # later entities would themselves be escaped again.
    substitutions = (
        ("&", "&amp;"),
        ("<", "&lt;"),
        (">", "&gt;"),
        ('"', "&quot;"),
    )
    escaped = text
    for raw, entity in substitutions:
        escaped = escaped.replace(raw, entity)
    return escaped


def _full_html(title: str, body_html: str) -> str:
    """Wrap *body_html* in a complete, styled HTML document.

    The escaped *title* is used for both the ``<title>`` element and the
    leading ``<h1>``. *body_html* is inserted verbatim, so it must already be
    valid (escaped) markup.
    """
    safe_title = _esc(title)
    return f"""<!DOCTYPE html>

<html lang="en">

<head>

<meta charset="utf-8"/>

<title>{safe_title}</title>

<style>

  body {{ font-family: 'Segoe UI', Tahoma, sans-serif; max-width: 900px; margin: 2rem auto; padding: 0 1rem; color: #1a1a1a; line-height: 1.6; }}

  h1, h2, h3, h4, h5, h6 {{ color: #0b3f74; margin-top: 1.5em; }}

  table {{ width: 100%; margin: 1em 0; font-size: 0.9em; }}

  th {{ background: #0b3f74; color: #fff; text-align: left; }}

  td, th {{ padding: 6px 10px; }}

  tr:nth-child(even) {{ background: #f5f8fc; }}

  ul {{ margin: 0.5em 0; padding-left: 1.5em; }}

</style>

</head>

<body>

<h1>{safe_title}</h1>

{body_html}

</body>

</html>"""


# ── Export functions ──


def export_html(req: ExportRequest) -> bytes:
    """Render every page of the request into one UTF-8 encoded HTML document.

    Each page's blocks are followed by an ``<hr/>`` rule. The document title
    falls back to "Handbook" when the request has no title.
    """
    sections = [
        fragment
        for page in req.pages
        for fragment in (_blocks_to_html(page.blocks), "<hr/>")
    ]
    document = _full_html(req.title or "Handbook", "\n".join(sections))
    return document.encode("utf-8")


def export_json(req: ExportRequest) -> bytes:
    """Serialize the request's document id, title and pages as UTF-8 JSON.

    Pages are dumped with ``model_dump(mode="json")``; the output is indented
    by two spaces and keeps non-ASCII characters unescaped.
    """
    payload = {"document_id": req.document_id, "title": req.title}
    dumped_pages = []
    for page in req.pages:
        dumped_pages.append(page.model_dump(mode="json"))
    payload["pages"] = dumped_pages

    text = json.dumps(payload, indent=2, ensure_ascii=False)
    return text.encode("utf-8")


def _add_docx_table(doc, tbl: TableBlock) -> None:
    """Append *tbl* to the python-docx document *doc* as a "Table Grid" table."""
    if not (tbl.rows > 0 and tbl.cols > 0):
        return

    word_table = doc.add_table(rows=tbl.rows, cols=tbl.cols, style="Table Grid")
    for cell in tbl.cells:
        # Ignore cells outside the declared grid. Negative indices are
        # rejected too, so they cannot wrap around and overwrite another cell.
        if not (0 <= cell.row < tbl.rows and 0 <= cell.col < tbl.cols):
            continue
        target = word_table.cell(cell.row, cell.col)
        target.text = cell.text
        if cell.is_header:
            # Keep header cells visually distinct, as the HTML export does
            # by rendering them as <th>.
            for paragraph in target.paragraphs:
                for run in paragraph.runs:
                    run.bold = True


def export_docx(req: ExportRequest) -> bytes:
    """Export handbook pages as a .docx Word document.

    The document starts with a title heading (the request title, or
    "Handbook" when it is empty). Each page's blocks are then written in
    order: headings (capped at level 4), paragraphs, bulleted list items and
    tables. Consecutive pages are separated by a page break.

    Returns:
        The serialized .docx file as bytes.
    """
    # Imported at call time so that importing this module does not require
    # python-docx to be installed.
    from docx import Document
    from docx.shared import RGBColor

    doc = Document()

    # Title, tinted with the same #0b3f74 blue the HTML export uses for headings.
    title_para = doc.add_heading(req.title or "Handbook", level=0)
    for run in title_para.runs:
        run.font.color.rgb = RGBColor(0x0B, 0x3F, 0x74)

    for index, page in enumerate(req.pages):
        if index:
            # Break between pages only: a break after the final page would
            # leave an empty trailing page in the document.
            doc.add_page_break()

        for block in page.blocks:
            if block.block_type == BlockType.HEADING:
                lvl = block.heading_level.value if block.heading_level else 2
                # python-docx accepts levels 0-9; this export renders anything
                # deeper than level 4 as a level-4 heading.
                lvl = min(lvl, 4)
                doc.add_heading(block.text, level=lvl)
            elif block.block_type == BlockType.PARAGRAPH:
                doc.add_paragraph(block.text)
            elif block.block_type == BlockType.LIST:
                for item in block.list_items:
                    doc.add_paragraph(item.text, style="List Bullet")
            elif block.block_type == BlockType.TABLE and block.table:
                _add_docx_table(doc, block.table)

    buf = io.BytesIO()
    doc.save(buf)
    return buf.getvalue()


def export_pdf_html(req: ExportRequest) -> bytes:
    """Produce a PDF by rendering the HTML export through WeasyPrint."""
    markup = export_html(req).decode("utf-8")

    # WeasyPrint is imported only once the HTML has been rendered.
    from weasyprint import HTML

    document = HTML(string=markup)
    return document.write_pdf()


def export_document(req: ExportRequest) -> tuple[bytes, str, str]:
    """Run the exporter matching ``req.format``.

    Returns a ``(payload_bytes, content_type, file_extension)`` tuple and
    raises ``ValueError`` when the format matches none of the known exporters.
    """
    requested = req.format
    # Checked in order with ``==``; only the matching exporter is invoked.
    exporters = (
        (ExportFormat.HTML, export_html, "text/html", "html"),
        (ExportFormat.JSON, export_json, "application/json", "json"),
        (
            ExportFormat.DOCX,
            export_docx,
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
            "docx",
        ),
        (ExportFormat.PDF, export_pdf_html, "application/pdf", "pdf"),
    )
    for candidate, exporter, content_type, extension in exporters:
        if requested == candidate:
            return exporter(req), content_type, extension
    raise ValueError(f"Unsupported format: {requested}")