Spaces:
Sleeping
Sleeping
| """DOCX -> PDF conversion (no Aspose). | |
| Obiettivo: ottenere un PDF *fedele* al template Word (grafica, tabelle, immagini, header/footer). | |
| Scelta converter: | |
| 1) LibreOffice headless (consigliato su Linux / HF Spaces) ✅ alta fedeltà | |
| 2) docx2pdf (solo Windows/macOS con MS Word) | |
| 3) fallback PyMuPDF (bassa fedeltà: solo come ultima spiaggia) | |
| Su HuggingFace Spaces puoi installare LibreOffice con `packages.txt`: | |
| - libreoffice | |
| - libreoffice-writer | |
| - fonts-dejavu-core (o altri font richiesti dal template) | |
| Questo non richiede Aspose. | |
| """ | |
| from __future__ import annotations | |
| import shutil | |
| import subprocess | |
| from dataclasses import dataclass | |
| from pathlib import Path | |
| from typing import Optional | |
| import fitz # PyMuPDF | |
@dataclass
class PdfConvertResult:
    """Outcome of a DOCX -> PDF conversion."""

    # Name of the converter that produced the PDF.
    backend: str
    # Number of visually blank pages removed after conversion.
    removed_blank_pages: int = 0
def _loffice_available() -> Optional[str]:
    """Return the path of the LibreOffice executable found on PATH, or None.

    ``soffice`` is looked up first, then ``libreoffice``.
    """
    for candidate in ("soffice", "libreoffice"):
        location = shutil.which(candidate)
        if location:
            return location
    return None
def _convert_with_libreoffice(docx_path: Path, pdf_path: Path) -> None:
    """Convert a DOCX file to PDF with LibreOffice in headless mode.

    The converter is asked to write into the parent directory of ``pdf_path``;
    the file it produces is looked up as ``<docx stem>.pdf`` (or ``.PDF``) and
    moved onto ``pdf_path`` when the two paths differ.

    Args:
        docx_path: Source Word document.
        pdf_path: Destination PDF; its parent directory is created if missing.

    Raises:
        RuntimeError: If no LibreOffice executable is found on PATH, or if
            the run finished without producing the expected PDF.
        subprocess.CalledProcessError: If LibreOffice exits with a non-zero
            status.
    """
    docx_path = Path(docx_path)
    pdf_path = Path(pdf_path)

    # Fail with a clear message instead of handing ``None`` to subprocess,
    # which would surface as an unrelated-looking TypeError.
    executable = _loffice_available()
    if not executable:
        raise RuntimeError("LibreOffice (soffice/libreoffice) non trovato nel PATH.")

    outdir = pdf_path.parent
    outdir.mkdir(parents=True, exist_ok=True)

    cmd = [
        executable,
        "--headless",
        "--nologo",
        "--nofirststartwizard",
        "--convert-to",
        "pdf",
        "--outdir",
        str(outdir),
        str(docx_path),
    ]
    # Output is captured so it does not leak to the console; on failure it is
    # available on the raised CalledProcessError.
    subprocess.run(cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

    produced = outdir / (docx_path.stem + ".pdf")
    if not produced.exists():
        # LibreOffice sometimes produces .PDF
        produced = outdir / (docx_path.stem + ".PDF")
    if not produced.exists():
        raise RuntimeError("LibreOffice non ha prodotto il PDF atteso.")

    # The output name is derived from the input name, so rename it when the
    # caller asked for a different file name.
    if produced != pdf_path:
        produced.replace(pdf_path)
def _convert_with_docx2pdf(docx_path: Path, pdf_path: Path) -> None:
    """Convert a DOCX file to PDF through the ``docx2pdf`` package.

    The destination directory is created when it does not exist yet.
    """
    # Imported inside the function, so the package is only needed when this
    # backend is actually used.
    from docx2pdf import convert as run_docx2pdf  # type: ignore

    destination_dir = pdf_path.parent
    destination_dir.mkdir(parents=True, exist_ok=True)

    source_name = str(docx_path)
    target_name = str(pdf_path)
    run_docx2pdf(source_name, target_name)
def _convert_with_pymupdf(docx_path: Path, pdf_path: Path) -> None:
    """Convert a DOCX file to PDF using PyMuPDF only.

    Args:
        docx_path: Source Word document.
        pdf_path: Destination PDF; its parent directory is created if missing.
    """
    pdf_path = Path(pdf_path)
    # The other backends create the output directory themselves; do the same
    # here so this converter also works when it is the first one to write.
    pdf_path.parent.mkdir(parents=True, exist_ok=True)

    doc = fitz.open(str(docx_path))
    try:
        pdf_bytes = doc.convert_to_pdf()
    finally:
        # Release the source document even when the conversion fails.
        doc.close()

    pdf = fitz.open("pdf", pdf_bytes)
    try:
        pdf.save(str(pdf_path))
    finally:
        pdf.close()
def _page_is_blank(page) -> bool:
    """Return True when fewer than 0.2% of the rendered pixels are non-white.

    The page is rendered at half scale without an alpha channel and the
    samples are read as 3 bytes per pixel; a pixel counts as non-white when
    any of its three channels is below 245.
    """
    pix = page.get_pixmap(matrix=fitz.Matrix(0.5, 0.5), alpha=False)
    samples = pix.samples
    total = len(samples) // 3
    if not total:
        return False

    nonwhite = 0
    for j in range(0, len(samples), 3):
        if samples[j] < 245 or samples[j + 1] < 245 or samples[j + 2] < 245:
            nonwhite += 1
            # The ratio only grows, so stop scanning as soon as the page can
            # no longer qualify as blank.
            if (nonwhite / total) >= 0.002:
                return False
    return True


def _drop_blank_pages(pdf_path: Path) -> int:
    """Remove visually blank pages (rare, but can happen with complex templates).

    The PDF is rewritten in place through a temporary sibling file. When no
    page is blank, or when every page is blank, the file is left untouched.

    Args:
        pdf_path: PDF file to clean up.

    Returns:
        The number of pages removed.
    """
    pdf_path = Path(pdf_path)
    pdf = fitz.open(str(pdf_path))
    try:
        blanks = [i for i in range(pdf.page_count) if _page_is_blank(pdf.load_page(i))]

        # Removing every page would leave nothing to save, so a document made
        # only of blank pages is kept as it is.
        if not blanks or len(blanks) == pdf.page_count:
            return 0

        # Delete from the end so the remaining indices stay valid.
        for i in reversed(blanks):
            pdf.delete_page(i)

        tmp = pdf_path.with_suffix(".tmp.pdf")
        pdf.save(str(tmp))
    finally:
        # Always release the document, including when rendering or saving fails.
        pdf.close()

    # The document is closed at this point, so the original can be replaced.
    tmp.replace(pdf_path)
    return len(blanks)
def docx_to_pdf(docx_path: Path, pdf_path: Path, *, drop_blank_pages: bool = True) -> PdfConvertResult:
    """Convert a DOCX file to PDF with the best converter available.

    LibreOffice is used when its executable is found. Otherwise docx2pdf is
    tried and, if it fails for any reason, PyMuPDF is used as a fallback.

    Args:
        docx_path: Source Word document.
        pdf_path: Destination PDF.
        drop_blank_pages: When true, visually blank pages are removed from the
            result; a failure of this step is ignored.

    Returns:
        A ``PdfConvertResult`` naming the backend used and the number of
        blank pages removed.
    """
    source = Path(docx_path)
    target = Path(pdf_path)

    if _loffice_available():
        _convert_with_libreoffice(source, target)
        used_backend = "libreoffice"
    else:
        # docx2pdf works only with MS Word
        try:
            _convert_with_docx2pdf(source, target)
        except Exception:
            _convert_with_pymupdf(source, target)
            used_backend = "pymupdf_fallback"
        else:
            used_backend = "docx2pdf"

    blank_count = 0
    if drop_blank_pages:
        try:
            blank_count = _drop_blank_pages(target)
        except Exception:
            # Best-effort cleanup: the converted PDF is still returned as is.
            blank_count = 0

    return PdfConvertResult(backend=used_backend, removed_blank_pages=blank_count)