Spaces:

MatteoScript
/

BilancioCompetenze

Sleeping

App Files Files Community

BilancioCompetenze / src /reporter /pdf_convert.py

MatteoScript

Upload 6 files

7687049 verified 13 days ago

raw

history blame contribute delete

4.03 kB

	"""DOCX -> PDF conversion (no Aspose).

	Obiettivo: ottenere un PDF fedele al template Word (grafica, tabelle, immagini, header/footer).

	Scelta converter:
	1) LibreOffice headless (consigliato su Linux / HF Spaces) ✅ alta fedeltà
	2) docx2pdf (solo Windows/macOS con MS Word)
	3) fallback PyMuPDF (bassa fedeltà: solo come ultima spiaggia)

	Su HuggingFace Spaces puoi installare LibreOffice con `packages.txt`:
	- libreoffice
	- libreoffice-writer
	- fonts-dejavu-core (o altri font richiesti dal template)

	Questo non richiede Aspose.
	"""

	from __future__ import annotations

	import shutil
	import subprocess
	from dataclasses import dataclass
	from pathlib import Path
	from typing import Optional

	import fitz # PyMuPDF


	@dataclass
	class PdfConvertResult:
	    """Summary of one DOCX -> PDF conversion run."""

	    # Identifier of the converter that produced the PDF.
	    backend: str
	    # How many blank pages were stripped from the PDF after conversion.
	    removed_blank_pages: int = 0


	def _loffice_available() -> Optional[str]:
	    """Return the path of the LibreOffice executable found on PATH, if any.

	    ``soffice`` is looked up first, then ``libreoffice``. ``None`` is returned
	    when neither name resolves.
	    """
	    for candidate in ("soffice", "libreoffice"):
	        located = shutil.which(candidate)
	        if located:
	            return located
	    return None


	def _convert_with_libreoffice(docx_path: Path, pdf_path: Path) -> None:
	    """Convert a DOCX file to PDF by running LibreOffice in headless mode.

	    The command is only given an output directory (``--outdir``), so the PDF
	    is first written next to ``pdf_path`` as ``<docx stem>.pdf`` and then
	    renamed to ``pdf_path`` when the two differ.

	    Args:
	        docx_path: Source DOCX document.
	        pdf_path: Destination PDF; its parent directory is created if missing.

	    Raises:
	        RuntimeError: If no LibreOffice executable is found on PATH, or if the
	            run finished without producing the expected PDF.
	        subprocess.CalledProcessError: If LibreOffice exits with a non-zero
	            status.
	    """
	    executable = _loffice_available()
	    if not executable:
	        # Without this guard ``None`` would be placed in the argv list and
	        # subprocess would fail with an opaque TypeError.
	        raise RuntimeError(
	            "LibreOffice non è disponibile: eseguibile 'soffice'/'libreoffice' "
	            "non trovato nel PATH."
	        )

	    docx_path = Path(docx_path)
	    pdf_path = Path(pdf_path)
	    outdir = pdf_path.parent
	    outdir.mkdir(parents=True, exist_ok=True)

	    cmd = [
	        executable,
	        "--headless",
	        "--nologo",
	        "--nofirststartwizard",
	        "--convert-to",
	        "pdf",
	        "--outdir",
	        str(outdir),
	        str(docx_path),
	    ]
	    # Output is captured so LibreOffice chatter does not leak to the console;
	    # it stays available on the CalledProcessError if the run fails.
	    subprocess.run(cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

	    produced = outdir / (docx_path.stem + ".pdf")
	    if not produced.exists():
	        # LibreOffice sometimes produces an upper-case .PDF extension.
	        produced = outdir / (docx_path.stem + ".PDF")
	    if not produced.exists():
	        raise RuntimeError("LibreOffice non ha prodotto il PDF atteso.")
	    if produced != pdf_path:
	        produced.replace(pdf_path)


	def _convert_with_docx2pdf(docx_path: Path, pdf_path: Path) -> None:
	    """Convert a DOCX file to PDF through the ``docx2pdf`` package.

	    The package is imported inside the function, so a missing installation
	    surfaces as an ImportError from this call rather than at module import.

	    Args:
	        docx_path: Source DOCX document.
	        pdf_path: Destination PDF; its parent directory is created if missing.
	    """
	    from docx2pdf import convert as run_docx2pdf  # type: ignore

	    target_dir = pdf_path.parent
	    target_dir.mkdir(parents=True, exist_ok=True)
	    source, target = str(docx_path), str(pdf_path)
	    run_docx2pdf(source, target)


	def _convert_with_pymupdf(docx_path: Path, pdf_path: Path) -> None:
	    """Convert a DOCX file to PDF with PyMuPDF.

	    The source document is opened with PyMuPDF, converted to PDF bytes in
	    memory, and those bytes are re-opened as a PDF and saved to ``pdf_path``.

	    Args:
	        docx_path: Source DOCX document.
	        pdf_path: Destination PDF; its parent directory is created if missing.
	    """
	    # This converter can run after docx2pdf failed at import time, i.e. before
	    # anything created the output directory, so make sure it exists here.
	    Path(pdf_path).parent.mkdir(parents=True, exist_ok=True)

	    # Context managers guarantee both documents are closed even when the
	    # conversion or the save raises.
	    with fitz.open(str(docx_path)) as doc:
	        pdf_bytes = doc.convert_to_pdf()
	    with fitz.open("pdf", pdf_bytes) as pdf:
	        pdf.save(str(pdf_path))


	def _drop_blank_pages(pdf_path: Path) -> int:
	    """Remove visually blank pages from the PDF at ``pdf_path``, in place.

	    Each page is rendered at half scale without alpha. A page counts as blank
	    when fewer than 0.2% of its pixels have any colour channel below 245.
	    The cleaned document is written to a sibling ``.tmp.pdf`` file which then
	    replaces the original.

	    If every page is classified as blank the file is left untouched, because
	    removing them all would leave no document to save.

	    Args:
	        pdf_path: PDF file to clean.

	    Returns:
	        The number of pages removed (0 when the file was not modified).
	    """
	    pdf_path = Path(pdf_path)
	    tmp = pdf_path.with_suffix(".tmp.pdf")
	    half_scale = fitz.Matrix(0.5, 0.5)  # low resolution is enough to spot ink

	    # The context manager closes the document on every exit path, including
	    # errors raised while rendering or saving.
	    with fitz.open(str(pdf_path)) as pdf:
	        blanks = []
	        for index in range(pdf.page_count):
	            pix = pdf.load_page(index).get_pixmap(matrix=half_scale, alpha=False)
	            samples = pix.samples
	            # Samples are read as consecutive 3-byte pixels.
	            total = len(samples) // 3
	            nonwhite = sum(
	                1
	                for r, g, b in zip(samples[0::3], samples[1::3], samples[2::3])
	                if r < 245 or g < 245 or b < 245
	            )
	            if total and (nonwhite / total) < 0.002:
	                blanks.append(index)

	        if not blanks or len(blanks) == pdf.page_count:
	            return 0

	        # Delete from the end so earlier indices stay valid.
	        for index in reversed(blanks):
	            pdf.delete_page(index)
	        pdf.save(str(tmp))

	    # Swap the files only after the document handle has been released.
	    tmp.replace(pdf_path)
	    return len(blanks)


	def docx_to_pdf(docx_path: Path, pdf_path: Path, *, drop_blank_pages: bool = True) -> PdfConvertResult:
	    """Convert a DOCX file to PDF with the best converter available.

	    LibreOffice is used when its executable is found. Otherwise ``docx2pdf``
	    is tried, and if that fails for any reason PyMuPDF is used as a fallback.
	    Blank-page removal afterwards is best effort: a failure there is logged
	    and the converted PDF is kept as is.

	    Args:
	        docx_path: Source DOCX document.
	        pdf_path: Destination PDF.
	        drop_blank_pages: When true, strip visually blank pages from the
	            result after conversion.

	    Returns:
	        A PdfConvertResult whose ``backend`` is ``"libreoffice"``,
	        ``"docx2pdf"`` or ``"pymupdf_fallback"`` and whose
	        ``removed_blank_pages`` is the number of pages stripped.

	    Raises:
	        Exception: Errors from the LibreOffice converter or from the PyMuPDF
	            fallback are propagated unchanged.
	    """
	    import logging  # local import keeps this block independent of the file header

	    log = logging.getLogger(__name__)
	    docx_path = Path(docx_path)
	    pdf_path = Path(pdf_path)

	    if _loffice_available():
	        backend = "libreoffice"
	        _convert_with_libreoffice(docx_path, pdf_path)
	    else:
	        # docx2pdf works only with MS Word
	        try:
	            _convert_with_docx2pdf(docx_path, pdf_path)
	        except Exception as exc:
	            # Deliberate catch-all: any docx2pdf problem (including the package
	            # not being installed) triggers the fallback, but leave a trace.
	            log.warning("docx2pdf conversion failed (%s); falling back to PyMuPDF", exc)
	            backend = "pymupdf_fallback"
	            _convert_with_pymupdf(docx_path, pdf_path)
	        else:
	            backend = "docx2pdf"

	    removed = 0
	    if drop_blank_pages:
	        try:
	            removed = _drop_blank_pages(pdf_path)
	        except Exception:
	            # Best effort: the PDF already exists, so report zero removals
	            # instead of failing the whole conversion.
	            log.warning(
	                "Blank-page removal failed for %s; keeping the PDF as converted",
	                pdf_path,
	                exc_info=True,
	            )

	    return PdfConvertResult(backend=backend, removed_blank_pages=removed)