"""DOCX -> PDF conversion (no Aspose). Obiettivo: ottenere un PDF *fedele* al template Word (grafica, tabelle, immagini, header/footer). Scelta converter: 1) LibreOffice headless (consigliato su Linux / HF Spaces) ✅ alta fedeltà 2) docx2pdf (solo Windows/macOS con MS Word) 3) fallback PyMuPDF (bassa fedeltà: solo come ultima spiaggia) Su HuggingFace Spaces puoi installare LibreOffice con `packages.txt`: - libreoffice - libreoffice-writer - fonts-dejavu-core (o altri font richiesti dal template) Questo non richiede Aspose. """ from __future__ import annotations import shutil import subprocess from dataclasses import dataclass from pathlib import Path from typing import Optional import fitz # PyMuPDF @dataclass class PdfConvertResult: backend: str removed_blank_pages: int = 0 def _loffice_available() -> Optional[str]: return shutil.which("soffice") or shutil.which("libreoffice") def _convert_with_libreoffice(docx_path: Path, pdf_path: Path) -> None: """Convert using LibreOffice headless.""" docx_path = Path(docx_path) pdf_path = Path(pdf_path) outdir = pdf_path.parent outdir.mkdir(parents=True, exist_ok=True) cmd = [ _loffice_available(), "--headless", "--nologo", "--nofirststartwizard", "--convert-to", "pdf", "--outdir", str(outdir), str(docx_path), ] # Run subprocess.run(cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) produced = outdir / (docx_path.stem + ".pdf") if not produced.exists(): # LibreOffice sometimes produces .PDF produced = outdir / (docx_path.stem + ".PDF") if not produced.exists(): raise RuntimeError("LibreOffice non ha prodotto il PDF atteso.") if produced != pdf_path: produced.replace(pdf_path) def _convert_with_docx2pdf(docx_path: Path, pdf_path: Path) -> None: from docx2pdf import convert # type: ignore pdf_path.parent.mkdir(parents=True, exist_ok=True) convert(str(docx_path), str(pdf_path)) def _convert_with_pymupdf(docx_path: Path, pdf_path: Path) -> None: doc = fitz.open(str(docx_path)) pdf_bytes = doc.convert_to_pdf() pdf = fitz.open("pdf", pdf_bytes) pdf.save(str(pdf_path)) pdf.close() doc.close() def _drop_blank_pages(pdf_path: Path) -> int: """Remove visually blank pages (rare, but can happen with complex templates).""" pdf = fitz.open(str(pdf_path)) blanks = [] for i in range(pdf.page_count): page = pdf.load_page(i) pix = page.get_pixmap(matrix=fitz.Matrix(0.5, 0.5), alpha=False) samples = pix.samples nonwhite = 0 total = len(samples) // 3 for j in range(0, len(samples), 3): if samples[j] < 245 or samples[j + 1] < 245 or samples[j + 2] < 245: nonwhite += 1 if total and (nonwhite / total) < 0.002: blanks.append(i) for i in reversed(blanks): pdf.delete_page(i) removed = len(blanks) if removed: tmp = pdf_path.with_suffix(".tmp.pdf") pdf.save(str(tmp)) pdf.close() tmp.replace(pdf_path) else: pdf.close() return removed def docx_to_pdf(docx_path: Path, pdf_path: Path, *, drop_blank_pages: bool = True) -> PdfConvertResult: docx_path = Path(docx_path) pdf_path = Path(pdf_path) backend = "" if _loffice_available(): backend = "libreoffice" _convert_with_libreoffice(docx_path, pdf_path) else: # docx2pdf works only with MS Word try: backend = "docx2pdf" _convert_with_docx2pdf(docx_path, pdf_path) except Exception: backend = "pymupdf_fallback" _convert_with_pymupdf(docx_path, pdf_path) removed = 0 if drop_blank_pages: try: removed = _drop_blank_pages(pdf_path) except Exception: removed = 0 return PdfConvertResult(backend=backend, removed_blank_pages=removed)