# MatteoScript's picture
# Upload 6 files
# 7687049 verified
"""DOCX -> PDF conversion (no Aspose).

Goal: produce a PDF that is *faithful* to the Word template (graphics, tables,
images, headers/footers).

Converter selection, in order of preference:
1) LibreOffice headless (recommended on Linux / HF Spaces) ✅ high fidelity
2) docx2pdf (Windows/macOS only, requires MS Word)
3) PyMuPDF fallback (low fidelity: last resort only)

On HuggingFace Spaces, LibreOffice can be installed through `packages.txt`:
- libreoffice
- libreoffice-writer
- fonts-dejavu-core (or any other fonts the template requires)

None of this requires Aspose.
"""
from __future__ import annotations
import shutil
import subprocess
from dataclasses import dataclass
from pathlib import Path
from typing import Optional
import fitz # PyMuPDF
@dataclass
class PdfConvertResult:
    """Outcome of a DOCX -> PDF conversion run."""

    # Identifier of the converter that produced the PDF.
    backend: str
    # Number of blank pages stripped from the PDF after conversion.
    removed_blank_pages: int = 0
def _loffice_available() -> Optional[str]:
    """Return the path of a LibreOffice launcher found on PATH, or None.

    ``soffice`` is tried first, then ``libreoffice``.
    """
    for launcher in ("soffice", "libreoffice"):
        located = shutil.which(launcher)
        if located:
            return located
    return None
def _convert_with_libreoffice(docx_path: Path, pdf_path: Path) -> None:
    """Convert a DOCX file to PDF by running LibreOffice in headless mode.

    The output is expected under the name ``<docx stem>.pdf`` inside the
    directory given to ``--outdir``. To avoid overwriting an unrelated file of
    that name, or picking up a stale one left by an earlier run, the conversion
    writes into a private scratch directory created inside the destination
    directory; the result is then moved onto ``pdf_path``.

    Args:
        docx_path: Source Word document.
        pdf_path: Destination PDF; its parent directory is created if missing.

    Raises:
        RuntimeError: If no LibreOffice executable is found, or if the process
            succeeds but the expected PDF is not produced.
        subprocess.CalledProcessError: If LibreOffice exits with a non-zero
            status (captured stdout/stderr are attached to the exception).
    """
    import tempfile  # local import: only this backend needs a scratch directory

    docx_path = Path(docx_path)
    pdf_path = Path(pdf_path)

    executable = _loffice_available()
    if not executable:
        # Fail with a clear message instead of passing None into the argv list.
        raise RuntimeError(
            "LibreOffice non è disponibile (soffice/libreoffice non trovato nel PATH)."
        )

    outdir = pdf_path.parent
    outdir.mkdir(parents=True, exist_ok=True)

    # The scratch directory lives inside outdir so the final move stays on the
    # same filesystem.
    with tempfile.TemporaryDirectory(dir=str(outdir), prefix=".lo-convert-") as scratch:
        scratch_dir = Path(scratch)
        cmd = [
            executable,
            "--headless",
            "--nologo",
            "--nofirststartwizard",
            "--convert-to",
            "pdf",
            "--outdir",
            str(scratch_dir),
            str(docx_path),
        ]
        subprocess.run(cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

        # LibreOffice sometimes produces an upper-case ".PDF" extension.
        candidates = (scratch_dir / (docx_path.stem + ext) for ext in (".pdf", ".PDF"))
        produced = next((c for c in candidates if c.exists()), None)
        if produced is None:
            raise RuntimeError("LibreOffice non ha prodotto il PDF atteso.")
        produced.replace(pdf_path)
def _convert_with_docx2pdf(docx_path: Path, pdf_path: Path) -> None:
    """Convert a DOCX file to PDF through the ``docx2pdf`` package.

    The package is imported lazily, so a missing installation surfaces as an
    import error raised from this call. The destination directory is created
    if it does not exist yet.
    """
    import docx2pdf  # type: ignore

    target_dir = pdf_path.parent
    target_dir.mkdir(parents=True, exist_ok=True)
    docx2pdf.convert(str(docx_path), str(pdf_path))
def _convert_with_pymupdf(docx_path: Path, pdf_path: Path) -> None:
    """Convert a DOCX file to PDF with PyMuPDF (last-resort backend).

    Both PyMuPDF documents are closed even when conversion or saving fails,
    and the destination directory is created if missing, as the other
    backends do.

    Args:
        docx_path: Source Word document.
        pdf_path: Destination PDF path.
    """
    pdf_path = Path(pdf_path)
    pdf_path.parent.mkdir(parents=True, exist_ok=True)

    source = fitz.open(str(docx_path))
    try:
        pdf_bytes = source.convert_to_pdf()
    finally:
        source.close()

    # Re-open the converted bytes as a PDF document so they can be saved.
    pdf = fitz.open("pdf", pdf_bytes)
    try:
        pdf.save(str(pdf_path))
    finally:
        pdf.close()
def _rgb_samples_look_blank(samples, *, white_level: int = 245, max_ink_ratio: float = 0.002) -> bool:
    """Return True when packed 3-byte pixels contain almost no non-white pixel.

    A pixel counts as non-white when any of its three channels is below
    ``white_level``; the page is blank while the share of such pixels stays
    strictly below ``max_ink_ratio``. An empty buffer is never blank.
    """
    total = len(samples) // 3
    if not total:
        return False
    # Fast path: if no byte is below the threshold, no pixel can be non-white.
    if min(samples) >= white_level:
        return True
    nonwhite = 0
    for j in range(0, len(samples), 3):
        if samples[j] < white_level or samples[j + 1] < white_level or samples[j + 2] < white_level:
            nonwhite += 1
            # Stop scanning as soon as the page is known not to be blank.
            if nonwhite / total >= max_ink_ratio:
                return False
    return True


def _drop_blank_pages(pdf_path: Path) -> int:
    """Remove visually blank pages (rare, but can happen with complex templates).

    Each page is rendered at half scale without alpha and classified from its
    pixel samples. The file is rewritten through a temporary sibling only when
    at least one page is removed. If every page looks blank the file is left
    untouched, since removing all pages would leave nothing to save.

    Args:
        pdf_path: PDF file to clean up in place.

    Returns:
        The number of pages removed (0 when the file was left unchanged).
    """
    pdf_path = Path(pdf_path)
    pdf = fitz.open(str(pdf_path))
    try:
        blanks = []
        for i in range(pdf.page_count):
            pix = pdf.load_page(i).get_pixmap(matrix=fitz.Matrix(0.5, 0.5), alpha=False)
            if _rgb_samples_look_blank(pix.samples):
                blanks.append(i)

        if not blanks or len(blanks) == pdf.page_count:
            return 0

        # Delete from the end so earlier indices stay valid.
        for i in reversed(blanks):
            pdf.delete_page(i)

        tmp = pdf_path.with_suffix(".tmp.pdf")
        try:
            pdf.save(str(tmp))
        except Exception:
            # Do not leave a partial temporary file behind.
            if tmp.exists():
                tmp.unlink()
            raise
    finally:
        pdf.close()

    tmp.replace(pdf_path)
    return len(blanks)
def docx_to_pdf(docx_path: Path, pdf_path: Path, *, drop_blank_pages: bool = True) -> PdfConvertResult:
    """Convert a DOCX file to PDF with the best converter available.

    LibreOffice is used when its executable is found. Otherwise docx2pdf is
    tried, and if that fails for any reason PyMuPDF is used as a fallback.
    A LibreOffice failure is not retried with another backend; the error
    propagates to the caller.

    Args:
        docx_path: Source Word document.
        pdf_path: Destination PDF path.
        drop_blank_pages: When true, blank pages are removed from the result
            on a best-effort basis (a failure there never fails the call).

    Returns:
        A ``PdfConvertResult`` naming the backend used ("libreoffice",
        "docx2pdf" or "pymupdf_fallback") and the number of pages removed.
    """
    import logging  # local import: the module header does not import logging

    log = logging.getLogger(__name__)
    docx_path = Path(docx_path)
    pdf_path = Path(pdf_path)

    if _loffice_available():
        backend = "libreoffice"
        _convert_with_libreoffice(docx_path, pdf_path)
    else:
        # docx2pdf works only with MS Word
        try:
            _convert_with_docx2pdf(docx_path, pdf_path)
            backend = "docx2pdf"
        except Exception:
            # Deliberate best-effort: record why docx2pdf failed, then fall back.
            log.warning("docx2pdf conversion failed; falling back to PyMuPDF", exc_info=True)
            backend = "pymupdf_fallback"
            _convert_with_pymupdf(docx_path, pdf_path)

    removed = 0
    if drop_blank_pages:
        try:
            removed = _drop_blank_pages(pdf_path)
        except Exception:
            # Blank-page cleanup is optional; keep the converted PDF as is.
            log.warning("Blank page removal failed for %s", pdf_path, exc_info=True)
            removed = 0
    return PdfConvertResult(backend=backend, removed_blank_pages=removed)