# Hugging Face Space: MatteoScript/BilancioCompetenze (status: Sleeping; file size: 4,029 bytes)
"""DOCX -> PDF conversion (no Aspose).

Obiettivo: ottenere un PDF *fedele* al template Word (grafica, tabelle, immagini, header/footer).

Scelta converter:
1) LibreOffice headless (consigliato su Linux / HF Spaces)  ✅ alta fedeltà
2) docx2pdf (solo Windows/macOS con MS Word)
3) fallback PyMuPDF (bassa fedeltà: solo come ultima spiaggia)

Su HuggingFace Spaces puoi installare LibreOffice con `packages.txt`:
- libreoffice
- libreoffice-writer
- fonts-dejavu-core (o altri font richiesti dal template)

Questo non richiede Aspose.
"""

from __future__ import annotations

import shutil
import subprocess
from dataclasses import dataclass
from pathlib import Path
from typing import Optional

import fitz  # PyMuPDF


@dataclass
class PdfConvertResult:
    """Summary of one DOCX -> PDF conversion run."""

    # Name of the converter that produced the PDF; docx_to_pdf sets it to
    # "libreoffice", "docx2pdf" or "pymupdf_fallback".
    backend: str
    # Number of blank pages stripped from the PDF after conversion.
    removed_blank_pages: int = 0


def _loffice_available() -> Optional[str]:
    """Locate the LibreOffice launcher on PATH.

    Returns:
        The full path of the first launcher found, trying ``soffice`` before
        ``libreoffice``, or ``None`` when neither is on PATH.
    """
    for launcher in ("soffice", "libreoffice"):
        located = shutil.which(launcher)
        if located:
            return located
    return None


def _convert_with_libreoffice(docx_path: Path, pdf_path: Path) -> None:
    """Convert a DOCX file to PDF with LibreOffice in headless mode.

    LibreOffice chooses the output file name itself (input stem + ``.pdf``),
    so the conversion is run into a private scratch directory created inside
    the destination directory and the result is then moved onto ``pdf_path``.
    This keeps a stale or unrelated ``<stem>.pdf`` already sitting in the
    destination directory from being mistaken for (or clobbered by) the new
    output.

    Args:
        docx_path: Source document.
        pdf_path: Destination PDF; parent directories are created as needed.

    Raises:
        RuntimeError: LibreOffice is not on PATH, or it ran without producing
            a PDF.
        subprocess.CalledProcessError: LibreOffice exited with a non-zero
            status.
    """
    import tempfile

    docx_path = Path(docx_path)
    pdf_path = Path(pdf_path)

    executable = _loffice_available()
    if not executable:
        # Fail with a clear message instead of handing None to subprocess.
        raise RuntimeError("LibreOffice (soffice) non trovato nel PATH.")

    outdir = pdf_path.parent
    outdir.mkdir(parents=True, exist_ok=True)

    # The scratch directory lives inside outdir so the final move is a plain
    # rename on the same filesystem.
    with tempfile.TemporaryDirectory(dir=str(outdir), prefix=".lo-convert-") as scratch:
        cmd = [
            executable,
            "--headless",
            "--nologo",
            "--nofirststartwizard",
            "--convert-to",
            "pdf",
            "--outdir",
            scratch,
            str(docx_path),
        ]
        completed = subprocess.run(
            cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE
        )

        # Accept either ".pdf" or ".PDF"; the scratch directory started empty,
        # so any PDF found here was written by this run.
        produced = sorted(
            entry
            for entry in Path(scratch).iterdir()
            if entry.is_file() and entry.suffix.lower() == ".pdf"
        )
        if not produced:
            message = "LibreOffice non ha prodotto il PDF atteso."
            detail = completed.stderr.decode("utf-8", errors="replace").strip()
            if detail:
                message = f"{message} stderr: {detail}"
            raise RuntimeError(message)

        produced[0].replace(pdf_path)


def _convert_with_docx2pdf(docx_path: Path, pdf_path: Path) -> None:
    """Convert a DOCX file to PDF through the ``docx2pdf`` package.

    ``docx2pdf`` is imported lazily so the module still loads where the
    package is not installed; the resulting ImportError surfaces only when
    this backend is actually used.

    Args:
        docx_path: Source document (``str`` or ``Path``).
        pdf_path: Destination PDF (``str`` or ``Path``); parent directories
            are created as needed.
    """
    from docx2pdf import convert  # type: ignore

    # Coerce like the LibreOffice backend does, so plain strings work too.
    docx_path = Path(docx_path)
    pdf_path = Path(pdf_path)

    pdf_path.parent.mkdir(parents=True, exist_ok=True)
    convert(str(docx_path), str(pdf_path))


def _convert_with_pymupdf(docx_path: Path, pdf_path: Path) -> None:
    """Convert a document to PDF with PyMuPDF (low-fidelity last resort).

    Args:
        docx_path: Source document (``str`` or ``Path``).
        pdf_path: Destination PDF (``str`` or ``Path``); parent directories
            are created as needed, matching the other backends.
    """
    docx_path = Path(docx_path)
    pdf_path = Path(pdf_path)
    pdf_path.parent.mkdir(parents=True, exist_ok=True)

    # Close each document even when conversion or saving raises.
    doc = fitz.open(str(docx_path))
    try:
        pdf_bytes = doc.convert_to_pdf()
    finally:
        doc.close()

    pdf = fitz.open("pdf", pdf_bytes)
    try:
        pdf.save(str(pdf_path))
    finally:
        pdf.close()


def _samples_are_blank(
    samples: bytes, white_level: int = 245, max_ink_ratio: float = 0.002
) -> bool:
    """Tell whether packed 3-bytes-per-pixel data is almost entirely near-white.

    A pixel counts as "ink" when any of its three channels is below
    ``white_level``. The data is blank when the share of ink pixels stays
    strictly below ``max_ink_ratio``. Empty data is never reported as blank.
    """
    total = len(samples) // 3
    if not total:
        return False
    inked = 0
    for offset in range(0, total * 3, 3):
        if (
            samples[offset] < white_level
            or samples[offset + 1] < white_level
            or samples[offset + 2] < white_level
        ):
            inked += 1
            # Stop scanning as soon as the page is known not to be blank.
            if inked / total >= max_ink_ratio:
                return False
    return True


def _drop_blank_pages(pdf_path: Path) -> int:
    """Remove visually blank pages (rare, but can happen with complex templates).

    Each page is rendered at half scale without alpha and classified by
    ``_samples_are_blank``. When at least one page is blank, the remaining
    pages are saved to a temporary sibling file that then replaces
    ``pdf_path``.

    If every page is classified blank the file is left untouched and 0 is
    returned, since a PDF with no pages cannot be saved.

    Args:
        pdf_path: PDF to clean up in place.

    Returns:
        The number of pages removed.
    """
    pdf_path = Path(pdf_path)
    pdf = fitz.open(str(pdf_path))
    try:
        blanks = []
        for index in range(pdf.page_count):
            page = pdf.load_page(index)
            # NOTE(review): assumes the default pixmap is RGB, i.e. 3 bytes
            # per pixel with alpha disabled - confirm for the PyMuPDF version
            # in use.
            pix = page.get_pixmap(matrix=fitz.Matrix(0.5, 0.5), alpha=False)
            if _samples_are_blank(pix.samples):
                blanks.append(index)

        if not blanks or len(blanks) == pdf.page_count:
            return 0

        # Delete from the end so earlier indices stay valid.
        for index in reversed(blanks):
            pdf.delete_page(index)

        tmp = pdf_path.with_suffix(".tmp.pdf")
        pdf.save(str(tmp))
    finally:
        # Always release the handle, including on the early return above.
        pdf.close()

    tmp.replace(pdf_path)
    return len(blanks)


def docx_to_pdf(docx_path: Path, pdf_path: Path, *, drop_blank_pages: bool = True) -> PdfConvertResult:
    """Convert a DOCX file to PDF with the best backend available.

    Backend selection:
      1. LibreOffice, when a launcher is found on PATH. Its errors propagate
         to the caller; no other backend is tried.
      2. Otherwise ``docx2pdf``; if that fails for any reason, the PyMuPDF
         fallback is used and the failure is logged as a warning.

    Args:
        docx_path: Source document.
        pdf_path: Destination PDF.
        drop_blank_pages: When true, blank pages are stripped from the result.
            This step is best-effort: a failure is logged and reported as zero
            pages removed.

    Returns:
        A PdfConvertResult naming the backend used and the number of blank
        pages removed.
    """
    import logging

    log = logging.getLogger(__name__)

    docx_path = Path(docx_path)
    pdf_path = Path(pdf_path)

    if _loffice_available():
        backend = "libreoffice"
        _convert_with_libreoffice(docx_path, pdf_path)
    else:
        # docx2pdf works only with MS Word
        try:
            backend = "docx2pdf"
            _convert_with_docx2pdf(docx_path, pdf_path)
        except Exception as exc:
            # Record why the better backend was skipped before degrading.
            log.warning(
                "docx2pdf conversion failed (%s); falling back to PyMuPDF", exc
            )
            backend = "pymupdf_fallback"
            _convert_with_pymupdf(docx_path, pdf_path)

    removed = 0
    if drop_blank_pages:
        try:
            removed = _drop_blank_pages(pdf_path)
        except Exception as exc:
            # Blank-page cleanup is optional: keep the PDF as converted.
            log.warning("Blank page removal skipped for %s: %s", pdf_path, exc)
            removed = 0

    return PdfConvertResult(backend=backend, removed_blank_pages=removed)