|
|
|
|
|
"""
|
|
|
DOCX to PDF Converter with Perfect Formatting Preservation
|
|
|
Optimized for Hugging Face Spaces with LibreOffice headless mode
|
|
|
Supports Arabic RTL text and preserves all original formatting
|
|
|
"""
|
|
|
|
|
|
import subprocess
|
|
|
import tempfile
|
|
|
import shutil
|
|
|
import os
|
|
|
from pathlib import Path
|
|
|
import zipfile
|
|
|
import re
|
|
|
import json
|
|
|
import xml.etree.ElementTree as ET
|
|
|
from xml.dom import minidom
|
|
|
|
|
|
import threading
|
|
|
import time
|
|
|
|
|
|
def internal_keepalive():
    """Emit a heartbeat log line every five minutes, forever.

    Intended to run on a daemon thread so the Hugging Face Space sees
    periodic activity and does not idle the container out.
    """
    heartbeat_seconds = 300
    while True:
        print("[KeepAlive] ✅ Still alive and running...")
        time.sleep(heartbeat_seconds)
|
|
|
|
|
|
|
|
|
# Start the keep-alive heartbeat in the background at import time;
# daemon=True so the thread never blocks interpreter shutdown.
threading.Thread(target=internal_keepalive, daemon=True).start()
|
|
|
|
|
|
|
|
|
|
|
|
def setup_libreoffice():
    """Ensure LibreOffice is properly configured for headless operation with optimal font setup.

    Prepares the font environment first, then probes the `libreoffice`
    binary with `--version`. Returns True when the binary responds,
    False (after logging) on any failure.
    """
    try:
        # Fonts must be staged before LibreOffice is first invoked.
        setup_font_environment()

        version_probe = subprocess.run(
            ["libreoffice", "--version"],
            capture_output=True,
            text=True,
            timeout=10,
        )
        if version_probe.returncode != 0:
            raise Exception("LibreOffice not found or not working")

        print(f"LibreOffice version: {version_probe.stdout.strip()}")
        return True
    except Exception as e:
        print(f"LibreOffice setup error: {e}")
        return False
|
|
|
|
|
|
|
|
|
def setup_font_environment():
    """Setup optimal font environment using local Arial font and Arabic RTL support.

    Stages the bundled Arial and downloaded Arabic fonts, refreshes the
    fontconfig cache, then reports which critical fonts are actually
    visible to `fc-list`. All failures are logged, never raised.
    """
    try:
        # Stage fonts first so the cache refresh below picks them up.
        setup_local_arial_font()
        install_arabic_fonts()

        print("Updating font cache...")
        cache_proc = subprocess.run(["fc-cache", "-fv"], capture_output=True, timeout=30)
        if cache_proc.returncode == 0:
            print("Font cache updated successfully")
        else:
            print(f"Font cache update warning: {cache_proc.stderr.decode('utf-8', errors='ignore')}")

        # Ask fontconfig what is actually installed.
        listing = subprocess.run(["fc-list"], capture_output=True, text=True, timeout=10)
        installed = listing.stdout
        installed_lower = installed.lower()

        required = ["Arial", "Liberation Sans", "Carlito", "Caladea", "DejaVu Sans", "Noto Sans",
                    "Noto Naskh Arabic", "Noto Kufi Arabic", "Amiri", "Scheherazade New"]
        absent = [name for name in required if name.lower() not in installed_lower]

        if absent:
            print(f"Warning: Missing critical fonts: {absent}")
        else:
            print("All critical fonts including local Arial and Arabic fonts are available")

        rtl_candidates = ["Noto Naskh Arabic", "Noto Kufi Arabic", "Amiri", "Scheherazade New", "Traditional Arabic"]
        rtl_present = [name for name in rtl_candidates if name.lower() in installed_lower]
        print(f"Available Arabic fonts: {rtl_present}")

        if "arial" in installed_lower:
            print("✅ Local Arial font is available and ready for use")
        else:
            print("⚠️ Local Arial font not detected - will use fallback fonts")

        print(f"Total fonts available: {len(installed.splitlines())}")

    except Exception as e:
        print(f"Font environment setup warning: {e}")
|
|
|
|
|
|
|
|
|
def setup_local_arial_font():
    """Setup local Arial font from same directory as this Python file.

    Looks for `arial.ttf` next to this script; if absent, falls back to a
    well-known system sans font. Whichever source is found is copied into
    the shared /tmp font directory so fontconfig can register it.

    Returns:
        bool: True when a font was installed (or already present),
        False when no usable source font could be found or on error.
    """
    def _install(src):
        # Idempotently copy *src* into the shared /tmp font dir as arial.ttf.
        # chmod failures are tolerated: the dir/file may belong to another user.
        font_dir = Path("/tmp/fonts/truetype/local-arial")
        font_dir.mkdir(parents=True, exist_ok=True)
        try:
            font_dir.chmod(0o777)
        except PermissionError:
            pass
        dest = font_dir / "arial.ttf"
        if dest.exists():
            print("✅ Local Arial font already installed")
            return
        print("📥 Installing local Arial font...")
        shutil.copy2(src, dest)
        try:
            dest.chmod(0o644)
        except PermissionError:
            pass
        print("✅ Local Arial font installed successfully")

    try:
        script_dir = Path(__file__).parent.absolute()
        arial_font_path = script_dir / "arial.ttf"

        if arial_font_path.exists():
            _install(arial_font_path)
            return True

        print(f"⚠️ Arial font not found at {arial_font_path}")
        print(f" Script directory: {script_dir}")
        print(f" Looking for: arial.ttf")

        # Fall back to metrically-similar system sans fonts.
        system_arial_paths = [
            "/usr/share/fonts/truetype/freefont/FreeSans.ttf",
            "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf",
            "/usr/share/fonts/truetype/liberation/LiberationSans-Regular.ttf"
        ]
        for path in system_arial_paths:
            if os.path.exists(path):
                print(f"✅ Using system font as Arial fallback: {path}")
                _install(path)
                return True

        return False

    except Exception as e:
        print(f"❌ Local Arial font setup failed: {e}")
        return False
|
|
|
|
|
|
|
|
|
def install_arabic_fonts():
    """Install additional Arabic fonts for better RTL support.

    Downloads Amiri, Scheherazade New, Noto Sans Arabic and Cairo into
    /tmp/fonts/truetype/arabic-custom, then refreshes the font cache.
    Everything is best-effort: each font install failure is logged and
    skipped, and the function never raises.
    """
    try:
        import urllib.request
        import zipfile
        import tempfile

        fonts_dir = Path("/tmp/fonts/truetype/arabic-custom")
        fonts_dir.mkdir(parents=True, exist_ok=True)
        try:
            fonts_dir.chmod(0o777)
        except PermissionError:
            pass  # best-effort: dir may already exist with other ownership

        def _place(src, dst):
            # Copy a font file into fonts_dir, making it world-readable when allowed.
            shutil.copy2(src, dst)
            try:
                dst.chmod(0o644)
            except PermissionError:
                pass

        def _install_zip_release(name, url, extracted_dirname):
            # Download a release ZIP and copy every .ttf it contains.
            print(f"📥 Installing {name} font...")
            try:
                with tempfile.TemporaryDirectory() as tmp_dir:
                    archive = os.path.join(tmp_dir, "font.zip")
                    urllib.request.urlretrieve(url, archive)
                    with zipfile.ZipFile(archive, 'r') as zip_ref:
                        zip_ref.extractall(tmp_dir)
                    release_dir = os.path.join(tmp_dir, extracted_dirname)
                    if os.path.exists(release_dir):
                        for file in os.listdir(release_dir):
                            if file.endswith('.ttf'):
                                _place(os.path.join(release_dir, file), fonts_dir / file)
                        print(f"✅ {name} font installed successfully")
                    else:
                        print(f"❌ {name} font directory not found")
            except Exception as e:
                print(f"❌ {name} font installation failed: {e}")

        def _install_single_ttf(name, url, filename):
            # Download a single TTF; returns True on success.
            print(f"📥 Installing {name} font...")
            try:
                with tempfile.TemporaryDirectory() as tmp_dir:
                    local_copy = os.path.join(tmp_dir, filename)
                    urllib.request.urlretrieve(url, local_copy)
                    _place(local_copy, fonts_dir / filename)
                    print(f"✅ {name} font installed successfully")
                return True
            except Exception as e:
                print(f"❌ {name} font installation failed: {e}")
                return False

        print("🔤 Installing Arabic fonts for RTL support...")

        _install_zip_release(
            "Amiri",
            "https://github.com/aliftype/amiri/releases/download/0.117/Amiri-0.117.zip",
            "Amiri-0.117")
        _install_zip_release(
            "Scheherazade New",
            "https://github.com/silnrsi/font-scheherazade/releases/download/v3.300/ScheherazadeNew-3.300.zip",
            "ScheherazadeNew-3.300")
        _install_single_ttf(
            "Noto Sans Arabic",
            "https://github.com/notofonts/notofonts.github.io/raw/main/fonts/NotoSansArabic/hinted/ttf/NotoSansArabic-Regular.ttf",
            "NotoSansArabic-Regular.ttf")

        cairo_ok = _install_single_ttf(
            "Cairo",
            "https://github.com/Gue3bara/Cairo/raw/master/fonts/Cairo-Regular.ttf",
            "Cairo-Regular.ttf")
        if not cairo_ok:
            print("⚠️ Continuing without Cairo font - using alternative Arabic fonts")
            # Alternative source ships WOFF2; fontTools would be needed to
            # convert it to TTF, so its absence is reported, not fatal.
            try:
                with tempfile.TemporaryDirectory() as tmp_dir:
                    cairo_url = "https://fonts.gstatic.com/s/cairo/v21/SLXgc14kyrzQ6fYy3Q60fTh5Tf44DXYvbqo6vPQ3ZyM.woff2"
                    cairo_file = os.path.join(tmp_dir, "Cairo-Regular.woff2")
                    urllib.request.urlretrieve(cairo_url, cairo_file)
                    try:
                        from fontTools.ttLib import TTFont
                        print("✅ Cairo font (alternative source) downloaded successfully")
                    except ImportError:
                        print("ℹ️ Cairo font downloaded but font conversion tools not available")
                        print("✅ Cairo font installed successfully (alternative source)")
            except Exception as e2:
                print(f"❌ Cairo font installation failed (alternative source): {e2}")

        print("🔄 Updating font cache...")
        try:
            subprocess.run(["fc-cache", "-f"], capture_output=True, timeout=30)
            print("✅ Font cache updated successfully")
        except Exception as e:
            print(f"❌ Font cache update failed: {e}")
        print("🎯 Enhanced Arabic fonts setup completed!")

    except Exception as e:
        print(f"Arabic fonts installation warning: {e}")
|
|
|
|
|
|
|
|
|
def create_fontconfig(temp_path):
    """Create fontconfig configuration for optimal font matching with local Arial and Arabic RTL support"""
    # fonts.conf is written under temp_path/.config/fontconfig — presumably the
    # caller points HOME/XDG_CONFIG_HOME at temp_path for the LibreOffice run;
    # TODO confirm against the caller.
    fontconfig_dir = temp_path / ".config" / "fontconfig"
    fontconfig_dir.mkdir(parents=True, exist_ok=True)

    fonts_conf = fontconfig_dir / "fonts.conf"

    # The script's own directory is registered as a font dir so a bundled
    # arial.ttf sitting next to this file is picked up directly.
    script_dir = Path(__file__).parent.absolute()

    # Substitution policy: map common MS fonts to metric-compatible free fonts,
    # and force Arabic-capable families for anything touching Arabic text
    # (by family name, by lang=ar, or by the U+0600–U+06FF charset).
    fontconfig_content = f'''<?xml version="1.0"?>
<!DOCTYPE fontconfig SYSTEM "fonts.dtd">
<fontconfig>
    <!-- Add system fonts directories -->
    <dir>/usr/share/fonts</dir>
    <dir>/usr/local/share/fonts</dir>
    <dir>~/.fonts</dir>

    <!-- Add local fonts directory (same as Python script) -->
    <dir>/tmp/fonts/truetype/local-arial</dir>
    <dir>{script_dir}</dir>

    <!-- Font substitution rules with local Arial as priority -->
    <alias>
        <family>Arial</family>
        <prefer>
            <family>Arial</family>
            <family>Liberation Sans</family>
            <family>DejaVu Sans</family>
            <family>Noto Sans</family>
        </prefer>
    </alias>

    <alias>
        <family>Calibri</family>
        <prefer>
            <family>Liberation Sans</family>
            <family>Arimo</family>
            <family>DejaVu Sans</family>
        </prefer>
    </alias>

    <alias>
        <family>Cambria</family>
        <prefer>
            <family>Liberation Serif</family>
            <family>Tinos</family>
            <family>DejaVu Serif</family>
        </prefer>
    </alias>

    <alias>
        <family>Times New Roman</family>
        <prefer>
            <family>Liberation Serif</family>
            <family>DejaVu Serif</family>
            <family>Noto Serif</family>
        </prefer>
    </alias>

    <alias>
        <family>Courier New</family>
        <prefer>
            <family>Liberation Mono</family>
            <family>DejaVu Sans Mono</family>
            <family>Noto Sans Mono</family>
        </prefer>
    </alias>

    <!-- Enhanced Arabic font substitution rules for perfect RTL support -->
    <alias>
        <family>Traditional Arabic</family>
        <prefer>
            <family>Amiri</family>
            <family>Noto Naskh Arabic</family>
            <family>Scheherazade New</family>
            <family>Cairo</family>
            <family>Noto Sans Arabic</family>
            <family>DejaVu Sans</family>
        </prefer>
    </alias>

    <alias>
        <family>Arabic Typesetting</family>
        <prefer>
            <family>Amiri</family>
            <family>Noto Naskh Arabic</family>
            <family>Scheherazade New</family>
            <family>Cairo</family>
            <family>Noto Sans Arabic</family>
        </prefer>
    </alias>

    <alias>
        <family>Simplified Arabic</family>
        <prefer>
            <family>Noto Sans Arabic</family>
            <family>Cairo</family>
            <family>Noto Naskh Arabic</family>
            <family>Amiri</family>
            <family>DejaVu Sans</family>
        </prefer>
    </alias>

    <!-- Additional Arabic font mappings for maximum compatibility -->
    <alias>
        <family>Arial Unicode MS</family>
        <prefer>
            <family>Noto Sans Arabic</family>
            <family>Cairo</family>
            <family>Liberation Sans</family>
            <family>DejaVu Sans</family>
        </prefer>
    </alias>

    <alias>
        <family>Microsoft Sans Serif</family>
        <prefer>
            <family>Noto Sans Arabic</family>
            <family>Liberation Sans</family>
            <family>DejaVu Sans</family>
        </prefer>
    </alias>

    <alias>
        <family>Segoe UI</family>
        <prefer>
            <family>Noto Sans Arabic</family>
            <family>Cairo</family>
            <family>Liberation Sans</family>
            <family>DejaVu Sans</family>
        </prefer>
    </alias>

    <alias>
        <family>Tahoma</family>
        <prefer>
            <family>DejaVu Sans</family>
            <family>Liberation Sans</family>
            <family>Noto Sans</family>
        </prefer>
    </alias>

    <!-- Generic Arabic font fallback -->
    <alias>
        <family>serif</family>
        <prefer>
            <family>Liberation Serif</family>
            <family>DejaVu Serif</family>
            <family>Amiri</family>
            <family>Noto Naskh Arabic</family>
        </prefer>
    </alias>

    <alias>
        <family>sans-serif</family>
        <prefer>
            <family>Liberation Sans</family>
            <family>DejaVu Sans</family>
            <family>Noto Sans</family>
            <family>Noto Naskh Arabic</family>
        </prefer>
    </alias>

    <alias>
        <family>monospace</family>
        <prefer>
            <family>Liberation Mono</family>
            <family>DejaVu Sans Mono</family>
            <family>Noto Sans Mono</family>
        </prefer>
    </alias>

    <!-- Ensure consistent font rendering with Arabic support -->
    <match target="font">
        <edit name="antialias" mode="assign">
            <bool>true</bool>
        </edit>
        <edit name="hinting" mode="assign">
            <bool>true</bool>
        </edit>
        <edit name="hintstyle" mode="assign">
            <const>hintslight</const>
        </edit>
        <edit name="rgba" mode="assign">
            <const>rgb</const>
        </edit>
        <edit name="lcdfilter" mode="assign">
            <const>lcddefault</const>
        </edit>
    </match>

    <!-- Enhanced Arabic script handling with strong binding -->
    <match target="pattern">
        <test name="lang" compare="contains">
            <string>ar</string>
        </test>
        <edit name="family" mode="prepend" binding="strong">
            <string>Amiri</string>
            <string>Noto Naskh Arabic</string>
            <string>Scheherazade New</string>
            <string>Cairo</string>
            <string>Noto Sans Arabic</string>
        </edit>
    </match>

    <!-- Force Arabic fonts for any Arabic-containing text -->
    <match target="pattern">
        <test name="family" compare="contains">
            <string>Arabic</string>
        </test>
        <edit name="family" mode="prepend" binding="strong">
            <string>Amiri</string>
            <string>Noto Naskh Arabic</string>
            <string>Scheherazade New</string>
            <string>Cairo</string>
        </edit>
    </match>

    <!-- Ensure proper spacing and kerning for Arabic -->
    <match target="font">
        <test name="family" compare="contains">
            <string>Arabic</string>
        </test>
        <edit name="spacing" mode="assign">
            <const>proportional</const>
        </edit>
        <edit name="antialias" mode="assign">
            <bool>true</bool>
        </edit>
        <edit name="hinting" mode="assign">
            <bool>true</bool>
        </edit>
        <edit name="hintstyle" mode="assign">
            <const>hintslight</const>
        </edit>
    </match>

    <!-- Specific handling for RTL text -->
    <match target="pattern">
        <test name="charset">
            <charset>
                <range>
                    <int>0x0600</int>
                    <int>0x06FF</int>
                </range>
            </charset>
        </test>
        <edit name="family" mode="prepend" binding="strong">
            <string>Amiri</string>
            <string>Noto Naskh Arabic</string>
            <string>Scheherazade New</string>
            <string>Cairo</string>
        </edit>
    </match>
</fontconfig>'''

    with open(fonts_conf, 'w', encoding='utf-8') as f:
        f.write(fontconfig_content)

    # Return the ".config" directory (parent of fontconfig/) as a string.
    return str(fontconfig_dir.parent)
|
|
|
|
|
|
|
|
|
def analyze_template_font_sizes(docx_path):
    """Analyze template.docx to extract specific font size requirements.

    Reads word/document.xml from the DOCX (a ZIP container), walks every
    run (<w:r>), and maps each non-empty text snippet to the point size it
    should be rendered at. Known template placeholders get hard-coded
    sizes; all other text keeps its declared size capped at 10pt.

    Args:
        docx_path: Path to a .docx file.

    Returns:
        dict[str, int]: stripped text -> font size in points; {} on error.
    """
    # WordprocessingML namespace, used both for lookups and attribute keys.
    w_ns = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'
    namespaces = {'w': w_ns}
    try:
        font_size_mapping = {}

        with zipfile.ZipFile(docx_path, 'r') as docx:
            if 'word/document.xml' in docx.namelist():
                doc_content = docx.read('word/document.xml').decode('utf-8')
                root = ET.fromstring(doc_content)

                for run in root.findall('.//w:r', namespaces):
                    # Word stores sizes in half-points; default 10pt when absent.
                    font_size = 10
                    rpr = run.find('w:rPr', namespaces)
                    if rpr is not None:
                        sz_elem = rpr.find('w:sz', namespaces)
                        if sz_elem is not None:
                            font_size = int(sz_elem.get(f'{{{w_ns}}}val', '20')) // 2

                    for text_elem in run.findall('.//w:t', namespaces):
                        text_content = text_elem.text
                        if not (text_content and text_content.strip()):
                            continue
                        text_content = text_content.strip()

                        # Template-specific overrides by placeholder/label.
                        if any(p in text_content for p in ['{{serial_number}}', '{{t_11}}', '{{t_}}', '{{date}}']):
                            font_size_mapping[text_content] = 9
                        elif any(p in text_content for p in ['{{name_1}}', '{{name_2}}', '{{id_1}}', '{{name_3}}', '{{id_2}}']):
                            font_size_mapping[text_content] = 10
                        elif any(p in text_content for p in ['{{location_1}}', '{{location_2}}', '{{phone_1}}', '{{location_3}}', '{{phone_2}}']):
                            font_size_mapping[text_content] = 10
                        elif any(p in text_content for p in ['الطرف البائع', 'الطرف المشتري']):
                            font_size_mapping[text_content] = 11
                        else:
                            # Everything else keeps its size, capped at 10pt.
                            font_size_mapping[text_content] = min(font_size, 10)

        print(f"📏 Font size analysis completed: {len(font_size_mapping)} text patterns mapped")
        return font_size_mapping

    except Exception as e:
        print(f"❌ Font size analysis failed: {e}")
        return {}
|
|
|
|
|
|
|
|
|
def validate_docx_structure(docx_path):
    """Advanced DOCX structure analysis and preprocessing for perfect formatting preservation.

    Inspects word/document.xml inside the DOCX container and reports which
    features are present (tables, images, textboxes, SmartArt, shapes, RTL
    text, {{placeholders}}, declared fonts) so the converter can choose
    appropriate handling.

    Returns:
        dict: analysis findings. On failure the same keys are returned with
        whatever was gathered so far plus 'error' set to the message.
    """
    validation_info = {
        'page_count': 1,
        'has_tables': False,
        'has_images': False,
        'text_content_length': 0,
        'font_families': set(),
        'has_textboxes': False,
        'has_smartart': False,
        'has_complex_shapes': False,
        'table_structure_issues': [],
        'rtl_content_detected': False,
        'placeholder_count': 0,
        'font_size_mapping': {},
        'error': None
    }
    try:
        # Template files additionally get a per-text font-size map.
        if 'template.docx' in docx_path:
            validation_info['font_size_mapping'] = analyze_template_font_sizes(docx_path)

        with zipfile.ZipFile(docx_path, 'r') as docx:
            if 'word/document.xml' in docx.namelist():
                doc_content = docx.read('word/document.xml').decode('utf-8')

                table_count = doc_content.count('<w:tbl>')
                validation_info['has_tables'] = table_count > 0

                if validation_info['has_tables']:
                    # BUGFIX: the previous check (open-tag count minus close-tag
                    # count) is always 0 for well-formed XML. Track nesting
                    # depth instead: a <w:tbl> opening at depth > 1 is nested.
                    depth = 0
                    for tag in re.finditer(r'<w:tbl>|</w:tbl>', doc_content):
                        if tag.group() == '<w:tbl>':
                            depth += 1
                            if depth > 1:
                                validation_info['table_structure_issues'].append("Nested tables detected")
                                break
                        else:
                            depth -= 1

                    if '<w:gridSpan' in doc_content or '<w:vMerge' in doc_content:
                        validation_info['table_structure_issues'].append("Complex cell merging detected")

                validation_info['has_textboxes'] = '<w:textbox>' in doc_content or '<w:txbxContent>' in doc_content
                validation_info['has_smartart'] = '<w:smartTag>' in doc_content or 'smartart' in doc_content.lower()
                validation_info['has_complex_shapes'] = '<w:shape>' in doc_content or '<w:group>' in doc_content

                validation_info['has_images'] = ('<w:drawing>' in doc_content or
                                                 '<w:pict>' in doc_content or
                                                 '<w:object>' in doc_content)

                # Arabic blocks + presentation forms => RTL content present.
                arabic_pattern = r'[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDFF\uFE70-\uFEFF]'
                validation_info['rtl_content_detected'] = bool(re.search(arabic_pattern, doc_content))

                placeholder_pattern = r'\{\{[^}]+\}\}'
                validation_info['placeholder_count'] = len(re.findall(placeholder_pattern, doc_content))

                # Rough plain-text length: strip every XML tag.
                text_content = re.sub(r'<[^>]+>', '', doc_content)
                validation_info['text_content_length'] = len(text_content.strip())

                # Collect declared font families (latin, east-asian, complex-script).
                font_matches = re.findall(r'w:ascii="([^"]+)"', doc_content)
                eastasia_fonts = re.findall(r'w:eastAsia="([^"]+)"', doc_content)
                cs_fonts = re.findall(r'w:cs="([^"]+)"', doc_content)
                validation_info['font_families'] = set(font_matches + eastasia_fonts + cs_fonts)

                print(f"🔍 Advanced DOCX Analysis:")
                print(f"   • Tables: {table_count} (Issues: {len(validation_info['table_structure_issues'])})")
                print(f"   • Images: {validation_info['has_images']}")
                print(f"   • TextBoxes: {validation_info['has_textboxes']}")
                print(f"   • SmartArt: {validation_info['has_smartart']}")
                print(f"   • Complex Shapes: {validation_info['has_complex_shapes']}")
                print(f"   • RTL Content: {validation_info['rtl_content_detected']}")
                print(f"   • Placeholders: {validation_info['placeholder_count']}")
                print(f"   • Text Length: {validation_info['text_content_length']}")
                print(f"   • Fonts: {list(validation_info['font_families'])[:5]}...")

        return validation_info

    except Exception as e:
        print(f"❌ DOCX validation error: {e}")
        # CONSISTENCY FIX: keep the same key set as the success path
        # (the old fallback dict omitted 'font_size_mapping').
        validation_info['error'] = str(e)
        return validation_info
|
|
|
|
|
|
|
|
|
def calculate_optimal_font_size(text_content, max_width_chars=20, base_font_size=10):
    """Return a font size (pt) that lets *text_content* fit a fixed-width slot.

    Text at or under *max_width_chars* keeps *base_font_size*; longer text
    is scaled down proportionally, but never below 7pt, so that long names
    do not break the template layout.
    """
    if not text_content:
        return base_font_size

    # Measure only the visible payload, without {{ }} placeholder markers.
    visible = text_content.replace('{{', '').replace('}}', '').strip()
    visible_length = len(visible)

    if visible_length <= max_width_chars:
        return base_font_size

    # Scale proportionally to the overflow, with a 7pt readability floor.
    scaled = base_font_size * (max_width_chars / visible_length)
    return int(scaled if scaled > 7 else 7)
|
|
|
|
|
|
|
|
|
def extract_placeholder_contexts(doc_content):
    """Locate every {{placeholder}} run in the document XML and record the
    layout constraints around it.

    For each run (<w:r>) whose text contains a placeholder, captures the
    declared font size (half-points -> points, default 10), whether the run
    appears to sit inside a table cell, and a character budget derived from
    that (15 chars in a cell, 25 in a paragraph).
    """
    run_re = re.compile(r'(<w:r[^>]*>.*?<w:t[^>]*>.*?\{\{[^}]+\}\}.*?</w:t>.*?</w:r>)', re.DOTALL)
    name_re = re.compile(r'\{\{([^}]+)\}\}')
    size_re = re.compile(r'<w:sz w:val="(\d+)"/>')

    contexts = {}
    for run_xml in run_re.findall(doc_content):
        name_match = name_re.search(run_xml)
        if name_match is None:
            continue

        size_match = size_re.search(run_xml)
        declared_size = int(size_match.group(1)) // 2 if size_match else 10

        inside_table = '<w:tc>' in run_xml or 'w:tcPr' in run_xml
        contexts[name_match.group(1)] = {
            'current_font_size': declared_size,
            'max_width_chars': 15 if inside_table else 25,
            'is_in_table': inside_table,
            'xml_context': run_xml,
        }

    return contexts
|
|
|
|
|
|
|
|
|
def apply_template_font_settings(docx_path, validation_info):
    """Apply specific font sizes and Arial font to template.docx content with smart sizing.

    Rewrites word/document.xml so every run declares Arial, known
    placeholders get their template-specific sizes (names are smart-sized
    against a long sample name), and oversized text is scaled down.
    The input file is never modified; a patched temp copy is returned.

    Args:
        docx_path: Path to the source .docx.
        validation_info: Output of validate_docx_structure(); must carry a
            non-empty 'font_size_mapping' for any work to happen.

    Returns:
        str: path to the patched temp copy, or *docx_path* on skip/error.
    """
    try:
        if not validation_info.get('font_size_mapping'):
            print("ℹ️ No font size mapping found - skipping font optimization")
            return docx_path

        print("🔤 Applying template-specific font settings with smart sizing...")

        # FIX: tempfile.mktemp is deprecated and race-prone; mkstemp is safe.
        temp_fd, temp_docx = tempfile.mkstemp(suffix='.docx')
        os.close(temp_fd)

        # FIX: the old code appended document.xml to a copied archive opened
        # in 'a' mode, which creates a DUPLICATE 'word/document.xml' entry in
        # the ZIP. Read everything, patch in memory, write a clean archive.
        with zipfile.ZipFile(docx_path, 'r') as src_zip:
            entries = {name: src_zip.read(name) for name in src_zip.namelist()}

        if 'word/document.xml' in entries:
            doc_content = entries['word/document.xml'].decode('utf-8')

            # Force Arial for both ASCII and high-ANSI character ranges.
            doc_content = re.sub(r'w:ascii="[^"]*"', 'w:ascii="Arial"', doc_content)
            doc_content = re.sub(r'w:hAnsi="[^"]*"', 'w:hAnsi="Arial"', doc_content)

            placeholder_contexts = extract_placeholder_contexts(doc_content)
            print(f"📍 Found {len(placeholder_contexts)} placeholders with context")

            # Smart sizing for name fields, calibrated with a long sample name.
            name_placeholders = ['name_1', 'name_2', 'name_3']
            for placeholder in name_placeholders:
                if placeholder in placeholder_contexts:
                    context = placeholder_contexts[placeholder]
                    optimal_size = calculate_optimal_font_size(
                        "محمد عبدالله أحمد الخالدي",
                        max_width_chars=context['max_width_chars'],
                        base_font_size=context['current_font_size']
                    )
                    # Word stores sizes in half-points.
                    optimal_size_half_points = int(optimal_size * 2)
                    pattern = f'{{{{{placeholder}}}}}'
                    if pattern in doc_content:
                        doc_content = re.sub(
                            r'(<w:r[^>]*>.*?' + re.escape(pattern) + r'.*?<w:sz w:val=")[^"]*(")',
                            f'\\g<1>{optimal_size_half_points}\\g<2>',
                            doc_content,
                            flags=re.DOTALL
                        )
                        print(f"🎯 Applied smart sizing to {placeholder}: {optimal_size}pt")

            # Fixed 9pt (18 half-points) for serial/time/date fields.
            for pattern in ['{{serial_number}}', '{{t_11}}', '{{t_}}', '{{date}}', 'الرقم التسلسلي', 'الساعة', 'التاريخ']:
                if pattern in doc_content:
                    doc_content = re.sub(
                        r'(<w:r[^>]*>.*?' + re.escape(pattern) + r'.*?<w:sz w:val=")[^"]*(")',
                        r'\g<1>18\g<2>',
                        doc_content,
                        flags=re.DOTALL
                    )

            # Fixed 10pt (20 half-points) for id/location/phone fields.
            for pattern in ['{{id_1}}', '{{id_2}}',
                            '{{location_1}}', '{{location_2}}', '{{phone_1}}', '{{location_3}}', '{{phone_2}}',
                            'رقم الهوية', 'يسكن', 'رقم الهاتف']:
                if pattern in doc_content:
                    doc_content = re.sub(
                        r'(<w:r[^>]*>.*?' + re.escape(pattern) + r'.*?<w:sz w:val=")[^"]*(")',
                        r'\g<1>20\g<2>',
                        doc_content,
                        flags=re.DOTALL
                    )

            # Fixed 11pt (22 half-points) for the seller/buyer party headers.
            for pattern in ['الطرف البائع', 'الطرف المشتري']:
                if pattern in doc_content:
                    doc_content = re.sub(
                        r'(<w:r[^>]*>.*?' + re.escape(pattern) + r'.*?<w:sz w:val=")[^"]*(")',
                        r'\g<1>22\g<2>',
                        doc_content,
                        flags=re.DOTALL
                    )

            print("🔤 Applying general font size optimization...")

            font_size_pattern = r'<w:sz w:val="(\d+)"/>'

            def reduce_font_size(match):
                # Shrink oversized runs: >12pt -> 80% capped at 12pt;
                # 10-12pt -> 90%; smaller sizes are left alone.
                size = int(match.group(1))
                size_in_points = size // 2
                if size_in_points > 12:
                    new_size_points = min(size_in_points * 0.8, 12)
                    new_size_half_points = int(new_size_points * 2)
                    return f'<w:sz w:val="{new_size_half_points}"/>'
                elif size_in_points > 10:
                    new_size_points = size_in_points * 0.9
                    new_size_half_points = int(new_size_points * 2)
                    return f'<w:sz w:val="{new_size_half_points}"/>'
                else:
                    return match.group(0)

            doc_content = re.sub(font_size_pattern, reduce_font_size, doc_content)

            entries['word/document.xml'] = doc_content.encode('utf-8')
            print("✅ Template font settings with smart sizing applied successfully")

        with zipfile.ZipFile(temp_docx, 'w', zipfile.ZIP_DEFLATED) as out_zip:
            for name, data in entries.items():
                out_zip.writestr(name, data)

        return temp_docx

    except Exception as e:
        print(f"❌ Font settings application failed: {e}")
        return docx_path
|
|
|
|
|
|
|
|
|
def create_dynamic_font_sizing_rules(docx_path):
    """Create dynamic font sizing rules based on actual content analysis.

    Scans word/document.xml for {{placeholders}} and, for each one, decides
    a character budget: placeholders inside a table cell get a budget
    estimated from the cell width (twips), everything else gets paragraph
    defaults.

    Returns:
        dict[str, dict]: placeholder -> {max_chars, context,
        base_font_size, min_font_size}; {} on error.
    """
    try:
        dynamic_rules = {}

        with zipfile.ZipFile(docx_path, 'r') as docx:
            if 'word/document.xml' in docx.namelist():
                doc_content = docx.read('word/document.xml').decode('utf-8')

                placeholder_pattern = r'\{\{([^}]+)\}\}'
                placeholders = re.findall(placeholder_pattern, doc_content)

                for placeholder in placeholders:
                    # BUGFIX: the old pattern ended with r'\\}}}}', which in a
                    # regex requires a literal backslash before '}}' — so the
                    # table-cell branch never matched and every placeholder got
                    # paragraph defaults. '\{\{ ... \}\}' matches the real markers.
                    context_pattern = r'(<w:tc[^>]*>.*?\{\{' + re.escape(placeholder) + r'\}\}.*?</w:tc>)'
                    table_cell_match = re.search(context_pattern, doc_content, re.DOTALL)

                    if table_cell_match:
                        cell_content = table_cell_match.group(1)

                        width_match = re.search(r'w:w="(\d+)"', cell_content)
                        if width_match:
                            cell_width = int(width_match.group(1))
                            # ~144 twips per character as a rough glyph width.
                            estimated_chars = max(cell_width // 144, 10)
                        else:
                            estimated_chars = 15

                        # Budget left after the cell's fixed text (placeholder
                        # marker itself excluded from the count).
                        text_elements = re.findall(r'<w:t[^>]*>([^<]+)</w:t>', cell_content)
                        total_text_length = sum(len(text.replace(f'{{{{{placeholder}}}}}', '')) for text in text_elements)
                        available_chars = max(estimated_chars - total_text_length, 8)

                        dynamic_rules[placeholder] = {
                            'max_chars': available_chars,
                            'context': 'table_cell',
                            'base_font_size': 10,
                            'min_font_size': 7
                        }
                    else:
                        dynamic_rules[placeholder] = {
                            'max_chars': 25,
                            'context': 'paragraph',
                            'base_font_size': 11,
                            'min_font_size': 8
                        }

        print(f"📏 Created dynamic sizing rules for {len(dynamic_rules)} placeholders")
        return dynamic_rules

    except Exception as e:
        print(f"❌ Dynamic rules creation failed: {e}")
        return {}
|
|
|
|
|
|
|
|
|
def apply_dynamic_font_sizing(docx_path, dynamic_rules, sample_data=None):
    """
    Apply dynamic font sizing based on actual or sample data.

    This ensures that when placeholders are replaced, the text fits
    perfectly: for each rule the optimal size is computed against sample
    text, then the run containing the placeholder is rewritten with that
    size and bound to the Arial font family.

    Args:
        docx_path: Path to the source .docx file.
        dynamic_rules: Output of create_dynamic_font_sizing_rules().
        sample_data: Optional placeholder -> sample text mapping; a built-in
            Arabic sample set is used when omitted or empty.

    Returns:
        str: Path to a new temporary .docx with sizing applied, or the
        original ``docx_path`` when there are no rules or on failure.
    """
    if not dynamic_rules:
        return docx_path

    try:
        print("🎯 Applying dynamic font sizing based on content analysis...")

        # Representative worst-case sample values (long Arabic names and
        # addresses) used when the caller provides no data.
        if not sample_data:
            sample_data = {
                'name_1': 'محمد عبدالله أحمد الخالدي',
                'name_2': 'فاطمة سعد محمد العتيبي',
                'name_3': 'عبدالرحمن خالد سليمان',
                'id_1': '1234567890',
                'id_2': '0987654321',
                'location_1': 'الرياض - حي الملك فهد - شارع الأمير محمد بن عبدالعزيز',
                'location_2': 'جدة - حي الصفا - طريق الملك عبدالعزيز',
                'phone_1': '+966501234567',
                'phone_2': '+966509876543'
            }

        # BUG FIX: the previous implementation copied the archive and then
        # opened it with ZipFile(..., 'a'), calling writestr() for
        # 'word/document.xml' which already exists — producing a DOCX with
        # duplicate entries. We now read all entries and rebuild the archive
        # so each name appears exactly once.
        with zipfile.ZipFile(docx_path, 'r') as src_zip:
            entries = {name: src_zip.read(name) for name in src_zip.namelist()}

        if 'word/document.xml' in entries:
            doc_content = entries['word/document.xml'].decode('utf-8')

            for placeholder, rules in dynamic_rules.items():
                if placeholder in sample_data:
                    sample_text = sample_data[placeholder]

                    # Size that fits the sample text into the available width.
                    optimal_size = calculate_optimal_font_size(
                        sample_text,
                        max_width_chars=rules['max_chars'],
                        base_font_size=rules['base_font_size']
                    )

                    # Never shrink below the rule's readability floor.
                    optimal_size = max(optimal_size, rules['min_font_size'])

                    # OOXML stores font sizes in half-points.
                    optimal_size_half_points = int(optimal_size * 2)

                    pattern = f'{{{{{placeholder}}}}}'
                    if pattern in doc_content:
                        # Rewrite the <w:sz> of the run containing the placeholder.
                        placeholder_pattern = r'(<w:r[^>]*>.*?' + re.escape(pattern) + r'.*?<w:sz w:val=")[^"]*(")'
                        doc_content = re.sub(
                            placeholder_pattern,
                            f'\\g<1>{optimal_size_half_points}\\g<2>',
                            doc_content,
                            flags=re.DOTALL
                        )

                        # Force Arial on runs that already declare w:rFonts.
                        placeholder_font_pattern = r'(<w:r[^>]*>.*?' + re.escape(pattern) + r'.*?<w:rFonts[^>]*w:ascii=")[^"]*(")'
                        doc_content = re.sub(
                            placeholder_font_pattern,
                            r'\g<1>Arial\g<2>',
                            doc_content,
                            flags=re.DOTALL
                        )

                        # Inject an Arial w:rFonts element into runs lacking one.
                        placeholder_run_pattern = r'(<w:r[^>]*>)(.*?' + re.escape(pattern) + r'.*?)(</w:r>)'

                        def add_font_binding(match):
                            run_start = match.group(1)
                            run_content = match.group(2)
                            run_end = match.group(3)

                            if '<w:rPr>' in run_content:
                                # Run has properties: add fonts only if absent.
                                if '<w:rFonts' not in run_content:
                                    run_content = run_content.replace(
                                        '<w:rPr>',
                                        '<w:rPr><w:rFonts w:ascii="Arial" w:hAnsi="Arial" w:cs="Arial"/>'
                                    )
                            else:
                                # No run properties at all: prepend a full rPr block.
                                run_content = '<w:rPr><w:rFonts w:ascii="Arial" w:hAnsi="Arial" w:cs="Arial"/></w:rPr>' + run_content

                            return run_start + run_content + run_end

                        doc_content = re.sub(placeholder_run_pattern, add_font_binding, doc_content, flags=re.DOTALL)

                        print(f"🎯 {placeholder}: {optimal_size}pt Arial (max chars: {rules['max_chars']}, context: {rules['context']})")

            entries['word/document.xml'] = doc_content.encode('utf-8')
            print("✅ Dynamic font sizing applied successfully")

        # Write the (possibly modified) entries into a fresh archive.
        temp_docx = tempfile.mktemp(suffix='.docx')
        with zipfile.ZipFile(temp_docx, 'w', zipfile.ZIP_DEFLATED) as dst_zip:
            for name, data in entries.items():
                dst_zip.writestr(name, data)

        return temp_docx

    except Exception as e:
        print(f"❌ Dynamic font sizing failed: {e}")
        return docx_path
|
|
|
|
|
|
|
|
|
def preprocess_docx_for_perfect_conversion(docx_path, validation_info):
    """
    Advanced DOCX preprocessing to ensure maximum formatting preservation.

    Removes problematic elements and optimizes structure for LibreOffice:
    TextBoxes become plain paragraphs, SmartArt is stripped, complex shapes
    are flattened, and fragile table markup is normalized.

    Args:
        docx_path: Path to the .docx to preprocess.
        validation_info: Analysis dict with boolean flags ('has_textboxes',
            'has_smartart', 'has_complex_shapes') and a
            'table_structure_issues' list.

    Returns:
        str: Path to a preprocessed temporary .docx, or the (possibly
        font-adjusted) input path when no preprocessing is needed or a
        failure occurs.
    """
    # Template documents first get their font settings normalized.
    if 'template.docx' in docx_path:
        docx_path = apply_template_font_settings(docx_path, validation_info)

    # Size placeholders so substituted text will fit its container.
    dynamic_rules = create_dynamic_font_sizing_rules(docx_path)
    if dynamic_rules:
        docx_path = apply_dynamic_font_sizing(docx_path, dynamic_rules)

    # NOTE(review): this early return does not consider
    # 'table_structure_issues', so the table fixes below only run when other
    # complex elements are also present — confirm this is intentional.
    if not validation_info.get('has_textboxes') and not validation_info.get('has_smartart') and not validation_info.get('has_complex_shapes'):
        print("✅ DOCX structure is optimal - no additional preprocessing needed")
        return docx_path

    try:
        print("🔧 Preprocessing DOCX for perfect conversion...")

        # BUG FIX: previously the copy was opened with ZipFile(..., 'a') and
        # writestr() re-added 'word/document.xml' next to the existing entry,
        # leaving a duplicate in the archive. Rebuild the ZIP instead so every
        # entry name is unique.
        with zipfile.ZipFile(docx_path, 'r') as src_zip:
            entries = {name: src_zip.read(name) for name in src_zip.namelist()}

        if 'word/document.xml' in entries:
            doc_content = entries['word/document.xml'].decode('utf-8')

            modifications_made = False

            # TextBoxes frequently shift position during conversion; keep
            # their text content as regular paragraphs instead.
            if validation_info.get('has_textboxes'):
                print(" • Converting TextBoxes to regular paragraphs...")
                textbox_pattern = r'<w:textbox[^>]*>.*?</w:textbox>'
                textboxes = re.findall(textbox_pattern, doc_content, re.DOTALL)

                for textbox in textboxes:
                    # Strip all markup, keep the raw text.
                    text_content = re.sub(r'<[^>]+>', '', textbox)
                    if text_content.strip():
                        paragraph = f'<w:p><w:r><w:t>{text_content.strip()}</w:t></w:r></w:p>'
                        doc_content = doc_content.replace(textbox, paragraph)
                        modifications_made = True

            # SmartArt rarely converts cleanly — remove it entirely.
            if validation_info.get('has_smartart'):
                print(" • Removing SmartArt elements...")
                smartart_pattern = r'<w:smartTag[^>]*>.*?</w:smartTag>'
                doc_content = re.sub(smartart_pattern, '', doc_content, flags=re.DOTALL)
                modifications_made = True

            # Complex shapes: drop shape groups, keep any embedded text.
            if validation_info.get('has_complex_shapes'):
                print(" • Simplifying complex shapes...")
                shape_group_pattern = r'<w:group[^>]*>.*?</w:group>'
                doc_content = re.sub(shape_group_pattern, '', doc_content, flags=re.DOTALL)

                shape_pattern = r'<w:shape[^>]*>.*?</w:shape>'
                shapes = re.findall(shape_pattern, doc_content, re.DOTALL)

                for shape in shapes:
                    # Keep shape text as a paragraph; drop empty shapes.
                    text_content = re.sub(r'<[^>]+>', '', shape)
                    if text_content.strip():
                        paragraph = f'<w:p><w:r><w:t>{text_content.strip()}</w:t></w:r></w:p>'
                        doc_content = doc_content.replace(shape, paragraph)
                    else:
                        doc_content = doc_content.replace(shape, '')
                modifications_made = True

            # Table fixes: give zero-width tables full page width and pad
            # completely empty cells so LibreOffice keeps the grid intact.
            if validation_info.get('table_structure_issues'):
                print(" • Optimizing table structure...")
                doc_content = re.sub(
                    r'<w:tblW w:w="0"[^>]*/>',
                    '<w:tblW w:w="5000" w:type="pct"/>',
                    doc_content
                )

                empty_cell_pattern = r'<w:tc>\s*</w:tc>'
                doc_content = re.sub(
                    empty_cell_pattern,
                    '<w:tc><w:p><w:r><w:t> </w:t></w:r></w:p></w:tc>',
                    doc_content
                )
                modifications_made = True

            if modifications_made:
                entries['word/document.xml'] = doc_content.encode('utf-8')
                print("✅ DOCX preprocessing completed successfully")
            else:
                print("ℹ️ No modifications were needed")

        # Write the (possibly modified) entries into a fresh archive.
        temp_docx = tempfile.mktemp(suffix='.docx')
        with zipfile.ZipFile(temp_docx, 'w', zipfile.ZIP_DEFLATED) as dst_zip:
            for name, data in entries.items():
                dst_zip.writestr(name, data)

        return temp_docx

    except Exception as e:
        print(f"❌ DOCX preprocessing failed: {e}")
        print(" • Continuing with original file...")
        return docx_path
|
|
|
|
|
|
|
|
|
def validate_pdf_output(pdf_path, expected_info):
    """Validate PDF output against expected metrics.

    Checks basic sanity of the generated file (existence, plausible size)
    and records which preservation-critical features of the source DOCX
    were in play.

    Args:
        pdf_path: Path to the generated PDF file.
        expected_info: DOCX analysis dict; keys 'has_tables', 'has_images'
            and 'font_families' are consulted when present.

    Returns:
        dict with 'file_size_mb', 'file_exists', 'size_reasonable',
        'warnings' and 'success_metrics'. Never raises — on error a zeroed
        dict carrying the message in 'warnings' is returned.
    """
    try:
        pdf_size = os.path.getsize(pdf_path)

        validation_results = {
            'file_size_mb': round(pdf_size / (1024 * 1024), 2),
            'file_exists': True,
            # Sanity band: 0.1 MB – 100 MB
            'size_reasonable': 0.1 <= pdf_size / (1024 * 1024) <= 100,
            'warnings': [],
            'success_metrics': []
        }

        # Flag pathological sizes; anything in between is counted a success.
        if pdf_size < 1024:
            validation_results['warnings'].append("PDF file size is suspiciously small")
        elif pdf_size > 100 * 1024 * 1024:
            validation_results['warnings'].append("PDF file size is very large")
        else:
            validation_results['success_metrics'].append("PDF file size is reasonable")

        # ROBUSTNESS FIX: use .get() so a partial analysis dict does not
        # raise KeyError — the rest of this module reads these keys the
        # same way.
        if expected_info.get('has_tables'):
            validation_results['success_metrics'].append("Document contains tables - formatting preservation critical")

        if expected_info.get('has_images'):
            validation_results['success_metrics'].append("Document contains images - quality preservation applied")

        if expected_info.get('font_families'):
            validation_results['success_metrics'].append(f"Font substitution applied for {len(expected_info['font_families'])} font families")

        print(f"PDF Validation: Size={validation_results['file_size_mb']}MB, "
              f"Warnings={len(validation_results['warnings'])}, "
              f"Success_metrics={len(validation_results['success_metrics'])}")

        return validation_results

    except Exception as e:
        print(f"PDF validation error: {e}")
        return {'file_size_mb': 0, 'file_exists': False, 'size_reasonable': False,
                'warnings': [f"Validation error: {e}"], 'success_metrics': []}
|
|
|
|
|
|
|
|
|
def post_process_pdf_for_perfect_formatting(pdf_path, docx_info):
    """
    Advanced PDF post-processing to ensure perfect formatting preservation

    Uses PyMuPDF to verify and correct any layout issues

    Walks every page of the generated PDF and checks that content from the
    source DOCX survived conversion: ``{{placeholder}}`` tokens, Arabic RTL
    characters, and table structure. Findings accumulate into counters,
    warnings and success messages.

    Args:
        pdf_path: Path to the converted PDF file.
        docx_info: DOCX analysis dict; keys consulted here are
            'placeholder_count', 'rtl_content_detected' and 'has_tables'.

    Returns:
        dict: counters ('pages_processed', 'placeholders_verified',
        'tables_verified', 'arabic_text_verified', 'layout_issues_fixed')
        plus 'warnings' and 'success_metrics' lists. Never raises — any
        failure yields a zeroed dict with the error text in 'warnings'.
        NOTE(review): 'layout_issues_fixed' is reported but never
        incremented anywhere in this function — confirm whether fixing was
        intended here.
    """
    try:
        # PyMuPDF has been importable under several names across releases;
        # try each in turn and degrade gracefully when none is installed.
        fitz = None
        try:
            import fitz
        except ImportError:
            try:
                from pymupdf import fitz
            except ImportError:
                try:
                    import pymupdf as fitz
                except ImportError:
                    fitz = None

        if fitz is None:
            # No PyMuPDF: return a neutral result instead of failing.
            print("⚠️ PyMuPDF not available - skipping advanced post-processing")
            return {
                'pages_processed': 0,
                'placeholders_verified': 0,
                'tables_verified': 0,
                'arabic_text_verified': 0,
                'layout_issues_fixed': 0,
                'warnings': ['PyMuPDF not available for advanced verification'],
                'success_metrics': ['Basic PDF validation completed']
            }

        print("🔍 Post-processing PDF for perfect formatting...")

        doc = fitz.open(pdf_path)

        post_process_results = {
            'pages_processed': len(doc),
            'placeholders_verified': 0,
            'tables_verified': 0,
            'arabic_text_verified': 0,
            'layout_issues_fixed': 0,
            'warnings': [],
            'success_metrics': []
        }

        for page_num in range(len(doc)):
            page = doc[page_num]

            # Structured text (blocks/lines/spans) for the font-size scan below.
            text_dict = page.get_text("dict")

            # 1) Placeholder survival: {{...}} tokens should still render.
            if docx_info.get('placeholder_count', 0) > 0:
                placeholder_pattern = r'\{\{[^}]+\}\}'
                page_text = page.get_text()
                found_placeholders = re.findall(placeholder_pattern, page_text)
                post_process_results['placeholders_verified'] += len(found_placeholders)

                # NOTE(review): this compares a single page's count against the
                # whole document's expected total, so multi-page documents will
                # always warn here — confirm intent.
                if len(found_placeholders) != docx_info.get('placeholder_count', 0):
                    post_process_results['warnings'].append(
                        f"Page {page_num + 1}: Placeholder count mismatch "
                        f"(found {len(found_placeholders)}, expected {docx_info.get('placeholder_count', 0)})"
                    )

            # 2) Arabic RTL text: count characters in the Arabic Unicode blocks.
            if docx_info.get('rtl_content_detected', False):
                arabic_pattern = r'[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDFF\uFE70-\uFEFF]'
                page_text = page.get_text()
                arabic_chars = len(re.findall(arabic_pattern, page_text))
                post_process_results['arabic_text_verified'] += arabic_chars

                if arabic_chars > 0:
                    post_process_results['success_metrics'].append(
                        f"Page {page_num + 1}: {arabic_chars} Arabic characters rendered correctly"
                    )

            # 3) Table structure: prefer PyMuPDF's table detector; fall back
            #    to a whitespace heuristic on versions that lack find_tables.
            if docx_info.get('has_tables', False):
                try:
                    tables = page.find_tables()
                    if tables and hasattr(tables, '__len__'):
                        table_count = len(tables)
                        post_process_results['tables_verified'] += table_count
                        post_process_results['success_metrics'].append(
                            f"Page {page_num + 1}: {table_count} tables preserved"
                        )
                    elif tables:
                        # Truthy but not sized: count as one detected table.
                        post_process_results['tables_verified'] += 1
                        post_process_results['success_metrics'].append(
                            f"Page {page_num + 1}: Table structure detected"
                        )
                except Exception:
                    # Fallback: look for lines with tab/space separators.
                    page_text = page.get_text()

                    lines = page_text.split('\n')
                    # NOTE(review): a single-space membership test matches
                    # nearly every text line; a multi-space column-gap test
                    # was probably intended — confirm against the original.
                    table_like_lines = [line for line in lines if '\t' in line or ' ' in line]
                    if len(table_like_lines) > 2:
                        post_process_results['tables_verified'] += 1
                        post_process_results['success_metrics'].append(
                            f"Page {page_num + 1}: Table-like structure detected (fallback method)"
                        )
                    post_process_results['warnings'].append(
                        f"Page {page_num + 1}: Table detection method failed, used fallback"
                    )

            # 4) Sanity-scan span font sizes for near-invisible text.
            blocks = text_dict.get("blocks", [])
            for block in blocks:
                if "lines" in block:
                    for line in block["lines"]:
                        for span in line.get("spans", []):
                            font_size = span.get("size", 0)
                            if font_size < 1:
                                post_process_results['warnings'].append(
                                    f"Page {page_num + 1}: Suspiciously small text detected (size: {font_size})"
                                )

        doc.close()

        # Document-level summary metrics derived from the page totals.
        if post_process_results['placeholders_verified'] > 0:
            post_process_results['success_metrics'].append(
                f"All {post_process_results['placeholders_verified']} placeholders preserved"
            )

        if post_process_results['arabic_text_verified'] > 0:
            post_process_results['success_metrics'].append(
                f"Arabic RTL text verified: {post_process_results['arabic_text_verified']} characters"
            )

        if post_process_results['tables_verified'] > 0:
            post_process_results['success_metrics'].append(
                f"Table structure preserved: {post_process_results['tables_verified']} tables"
            )

        print(f"✅ PDF post-processing completed:")
        print(f" • Pages processed: {post_process_results['pages_processed']}")
        print(f" • Placeholders verified: {post_process_results['placeholders_verified']}")
        print(f" • Arabic characters verified: {post_process_results['arabic_text_verified']}")
        print(f" • Tables verified: {post_process_results['tables_verified']}")
        print(f" • Warnings: {len(post_process_results['warnings'])}")

        return post_process_results

    except Exception as e:
        # Any failure (bad PDF, fitz error) is converted into a neutral
        # result so the surrounding pipeline keeps going.
        print(f"❌ PDF post-processing error: {e}")
        return {
            'pages_processed': 0,
            'placeholders_verified': 0,
            'tables_verified': 0,
            'arabic_text_verified': 0,
            'layout_issues_fixed': 0,
            'warnings': [f'Post-processing error: {e}'],
            'success_metrics': []
        }
|
|
|
|
|
|
|
|
|
def analyze_conversion_error(stderr, stdout, docx_info):
    """Analyze conversion errors and provide helpful diagnostics.

    Scans LibreOffice's stderr/stdout for known failure signatures and
    cross-references them with the DOCX analysis to build a human-readable
    diagnostic message.

    Args:
        stderr: Captured standard error text from the converter.
        stdout: Captured standard output text from the converter.
        docx_info: DOCX analysis dict; every key is optional.

    Returns:
        str: Multi-line diagnostic text ending with general troubleshooting
        suggestions.
    """
    error_analysis = []

    # Keyword signatures for each failure class we know how to explain.
    error_patterns = {
        'font': ['font', 'typeface', 'glyph'],
        'memory': ['memory', 'heap', 'out of memory'],
        'file_access': ['permission', 'access', 'file not found', 'cannot open'],
        'format': ['format', 'corrupt', 'invalid', 'malformed'],
        'timeout': ['timeout', 'time out', 'expired'],
        'display': ['display', 'x11', 'xvfb', 'screen']
    }

    stderr_lower = stderr.lower()
    stdout_lower = stdout.lower()
    combined_output = stderr_lower + " " + stdout_lower

    for error_type, keywords in error_patterns.items():
        if any(keyword in combined_output for keyword in keywords):
            if error_type == 'font':
                error_analysis.append("🔤 Font-related issue detected:")
                error_analysis.append(" • Possible missing font substitution")
                error_analysis.append(" • Enhanced font packages should resolve this")
                # BUG FIX: use .get() here — every other docx_info access in
                # this function tolerates a missing key, but direct indexing
                # raised KeyError when 'font_families' was absent.
                if docx_info.get('font_families'):
                    error_analysis.append(f" • Document uses fonts: {list(docx_info['font_families'])[:3]}")

            elif error_type == 'memory':
                error_analysis.append("💾 Memory issue detected:")
                error_analysis.append(" • Document may be too large or complex")
                error_analysis.append(" • Try with a smaller document first")

            elif error_type == 'file_access':
                error_analysis.append("📁 File access issue detected:")
                error_analysis.append(" • Temporary file permissions problem")
                error_analysis.append(" • This should resolve on retry")

            elif error_type == 'format':
                error_analysis.append("📄 Document format issue detected:")
                error_analysis.append(" • DOCX file may be corrupted or invalid")
                error_analysis.append(" • Try opening in Word and re-saving")

            elif error_type == 'timeout':
                error_analysis.append("⏱️ Timeout issue detected:")
                error_analysis.append(" • Document conversion took too long")
                error_analysis.append(" • Complex documents may need more time")

            elif error_type == 'display':
                error_analysis.append("🖥️ Display/Graphics issue detected:")
                error_analysis.append(" • Headless display configuration problem")
                error_analysis.append(" • This is a system configuration issue")

    # Cross-reference the DOCX analysis for complexity factors that make
    # conversion failures more likely.
    if docx_info.get('has_tables'):
        error_analysis.append("📊 Document contains tables - may need special handling")
        if docx_info.get('table_structure_issues'):
            error_analysis.append(f" • Table issues detected: {', '.join(docx_info['table_structure_issues'])}")

    if docx_info.get('has_images'):
        error_analysis.append("🖼️ Document contains images - may affect processing")

    if docx_info.get('has_textboxes'):
        error_analysis.append("📦 Document contains TextBoxes - these may cause layout issues")

    if docx_info.get('has_smartart'):
        error_analysis.append("🎨 Document contains SmartArt - these elements may not convert properly")

    if docx_info.get('has_complex_shapes'):
        error_analysis.append("🔷 Document contains complex shapes - these may affect layout")

    if docx_info.get('text_content_length', 0) > 50000:
        error_analysis.append("📝 Large document detected - may need more processing time")

    if docx_info.get('rtl_content_detected'):
        error_analysis.append("🌍 Arabic RTL content detected - ensure Arabic fonts are properly installed")

    if docx_info.get('placeholder_count', 0) > 0:
        error_analysis.append(f"🏷️ Document contains {docx_info['placeholder_count']} placeholders - these must be preserved")

    # Flag legacy Arabic fonts that commonly require substitution.
    if docx_info.get('font_families'):
        problematic_fonts = []
        for font in docx_info['font_families']:
            if any(keyword in font.lower() for keyword in ['traditional arabic', 'arabic typesetting', 'simplified arabic']):
                problematic_fonts.append(font)

        if problematic_fonts:
            error_analysis.append(f"🔤 Arabic fonts detected: {', '.join(problematic_fonts[:3])}")
            error_analysis.append(" • Ensure Arabic font substitution is working correctly")

    # Nothing matched: generic guidance.
    if not error_analysis:
        error_analysis.append("❓ Unknown error - check LibreOffice installation")
        error_analysis.append(" • Verify all system dependencies are installed")
        error_analysis.append(" • Try with a simpler test document")

    error_analysis.append("\n💡 Advanced troubleshooting suggestions:")
    error_analysis.append(" • Ensure DOCX file is valid and not corrupted")
    error_analysis.append(" • Try with a smaller or simpler document")
    error_analysis.append(" • Check that all required fonts are available")
    error_analysis.append(" • Verify LibreOffice Arabic language support is installed")
    error_analysis.append(" • Consider preprocessing the document to remove problematic elements")

    return "\n".join(error_analysis)
|
|
|
|
|
|
|
|
|
def generate_comprehensive_quality_report(docx_info, pdf_validation, post_process_results):
    """
    Generate a comprehensive quality report for the conversion

    Combines the DOCX analysis, the PDF validation results and the
    post-processing metrics into one human-readable multi-line report,
    including an overall score and improvement suggestions.

    Args:
        docx_info: Source DOCX analysis dict.
        pdf_validation: Result dict from validate_pdf_output().
        post_process_results: Result dict from
            post_process_pdf_for_perfect_formatting().

    Returns:
        str: The formatted report text.
    """
    report = []

    # Report header.
    report.append("📋 COMPREHENSIVE CONVERSION QUALITY REPORT")
    report.append("=" * 50)

    # Source document characteristics.
    report.append("\n📄 DOCUMENT ANALYSIS:")
    report.append(f" • Text Content: {docx_info.get('text_content_length', 0):,} characters")
    report.append(f" • Font Families: {len(docx_info.get('font_families', set()))} detected")
    report.append(f" • Tables: {'Yes' if docx_info.get('has_tables') else 'No'}")
    report.append(f" • Images: {'Yes' if docx_info.get('has_images') else 'No'}")
    report.append(f" • Arabic RTL Content: {'Yes' if docx_info.get('rtl_content_detected') else 'No'}")
    report.append(f" • Placeholders: {docx_info.get('placeholder_count', 0)}")

    # Collect elements known to convert poorly.
    issues = []
    if docx_info.get('has_textboxes'):
        issues.append("TextBoxes detected")
    if docx_info.get('has_smartart'):
        issues.append("SmartArt elements detected")
    if docx_info.get('has_complex_shapes'):
        issues.append("Complex shapes detected")
    if docx_info.get('table_structure_issues'):
        issues.extend(docx_info['table_structure_issues'])

    if issues:
        report.append(f" • Potential Issues: {', '.join(issues)}")
    else:
        report.append(" • Potential Issues: None detected")

    # Output PDF metrics.
    report.append("\n📊 PDF QUALITY METRICS:")
    report.append(f" • File Size: {pdf_validation.get('file_size_mb', 0)} MB")
    report.append(f" • Pages Processed: {post_process_results.get('pages_processed', 0)}")

    # Verification counters from post-processing.
    report.append("\n✅ VERIFICATION RESULTS:")
    if post_process_results.get('placeholders_verified', 0) > 0:
        # max(..., 1) guards against division by zero when the expected
        # placeholder count is 0.
        placeholder_accuracy = (post_process_results['placeholders_verified'] /
                                max(docx_info.get('placeholder_count', 1), 1)) * 100
        report.append(f" • Placeholder Preservation: {placeholder_accuracy:.1f}% "
                      f"({post_process_results['placeholders_verified']}/{docx_info.get('placeholder_count', 0)})")

    if post_process_results.get('arabic_text_verified', 0) > 0:
        report.append(f" • Arabic Text Verified: {post_process_results['arabic_text_verified']:,} characters")

    if post_process_results.get('tables_verified', 0) > 0:
        report.append(f" • Tables Preserved: {post_process_results['tables_verified']}")

    # Merge success metrics from both validation stages.
    all_success_metrics = (pdf_validation.get('success_metrics', []) +
                           post_process_results.get('success_metrics', []))
    if all_success_metrics:
        report.append("\n🎯 SUCCESS METRICS:")
        for metric in all_success_metrics:
            report.append(f" ✓ {metric}")

    # Merge warnings from both validation stages.
    all_warnings = (pdf_validation.get('warnings', []) +
                    post_process_results.get('warnings', []))
    if all_warnings:
        report.append("\n⚠️ WARNINGS:")
        for warning in all_warnings:
            report.append(f" • {warning}")

    # Overall score and its verdict tier.
    quality_score = calculate_quality_score(docx_info, pdf_validation, post_process_results)
    report.append(f"\n🏆 OVERALL QUALITY SCORE: {quality_score:.1f}%")

    if quality_score >= 99:
        report.append("🌟 EXCELLENT: Pixel-perfect conversion achieved!")
    elif quality_score >= 95:
        report.append("✅ VERY GOOD: High-quality conversion with minor variations")
    elif quality_score >= 90:
        report.append("👍 GOOD: Acceptable conversion quality")
    elif quality_score >= 80:
        report.append("⚠️ FAIR: Some quality issues detected")
    elif quality_score >= 70:
        report.append("❌ POOR: Significant quality issues")
    else:
        report.append("🚨 CRITICAL: Major conversion problems")

    # Actionable follow-ups, if any.
    suggestions = suggest_quality_improvements(docx_info, pdf_validation, post_process_results, quality_score)
    if suggestions:
        report.append("\n" + "\n".join(suggestions))

    return "\n".join(report)
|
|
|
|
|
|
|
|
|
def calculate_quality_score(docx_info, pdf_validation, post_process_results):
    """
    Calculate an overall quality score for the conversion with enhanced accuracy.

    Starts from 100 and applies weighted penalties (warnings, lost
    placeholders, unverified Arabic text or tables, problematic elements,
    abnormal output size) and bonuses (verified content, processed pages,
    success metrics). The result is clamped to [0, 100].

    Args:
        docx_info: Source DOCX analysis dict.
        pdf_validation: Result dict from validate_pdf_output().
        post_process_results: Result dict from
            post_process_pdf_for_perfect_formatting().

    Returns:
        Numeric score in [0, 100].
    """
    score = 100.0

    # FIX: removed a dead 'warning_count' computation that was never read —
    # warnings are classified individually below instead.

    # Classify warnings by severity: critical ones cost 5 points, minor 2.
    critical_warnings = 0
    minor_warnings = 0

    all_warnings = (pdf_validation.get('warnings', []) +
                    post_process_results.get('warnings', []))

    for warning in all_warnings:
        warning_lower = warning.lower()
        if any(keyword in warning_lower for keyword in ['error', 'failed', 'missing', 'corrupted']):
            critical_warnings += 1
        else:
            minor_warnings += 1

    score -= critical_warnings * 5
    score -= minor_warnings * 2

    # Placeholder preservation: total loss costs 15 points, scaled linearly.
    expected_placeholders = docx_info.get('placeholder_count', 0)
    verified_placeholders = post_process_results.get('placeholders_verified', 0)
    if expected_placeholders > 0:
        placeholder_accuracy = verified_placeholders / expected_placeholders
        score -= (1 - placeholder_accuracy) * 15
    else:
        # No placeholders expected and none found: small cleanliness bonus.
        if verified_placeholders == 0:
            score += 2

    # Arabic RTL: bonus when confirmed, penalty when expected but absent.
    if docx_info.get('rtl_content_detected', False):
        arabic_chars = post_process_results.get('arabic_text_verified', 0)
        if arabic_chars > 0:
            score += 5
        else:
            score -= 10

    # Table preservation.
    if docx_info.get('has_tables', False):
        tables_verified = post_process_results.get('tables_verified', 0)
        if tables_verified > 0:
            score += 3
        else:
            score -= 8

    # Images present in the source earn a small bonus.
    if docx_info.get('has_images', False):
        score += 2

    # Elements known to convert poorly.
    if docx_info.get('has_textboxes'):
        score -= 3
    if docx_info.get('has_smartart'):
        score -= 3
    if docx_info.get('has_complex_shapes'):
        score -= 2

    # Structural table issues found during DOCX analysis.
    table_issues = docx_info.get('table_structure_issues', [])
    if table_issues:
        score -= len(table_issues) * 3

    # Output size sanity: plausible sizes gain, extremes lose.
    pdf_size = pdf_validation.get('file_size_mb', 0)
    if pdf_size > 0:
        if 0.01 <= pdf_size <= 50:
            score += 2
        elif pdf_size > 50:
            score -= 3
        elif pdf_size < 0.01:
            score -= 5

    # Up to 5 bonus points from accumulated success metrics.
    success_count = len(pdf_validation.get('success_metrics', [])) + len(post_process_results.get('success_metrics', []))
    score += min(success_count * 0.5, 5)

    # Pages actually processed is a strong health signal.
    pages_processed = post_process_results.get('pages_processed', 0)
    if pages_processed > 0:
        score += 3
    else:
        score -= 5

    # Clamp to the reportable range.
    return max(0, min(100, score))
|
|
|
|
|
|
|
|
|
def suggest_quality_improvements(docx_info, pdf_validation, post_process_results, quality_score):
    """
    Suggest specific improvements based on quality analysis.

    Args:
        docx_info: Source DOCX analysis dict.
        pdf_validation: Result dict from validate_pdf_output().
        post_process_results: Result dict from
            post_process_pdf_for_perfect_formatting().
        quality_score: Overall score from calculate_quality_score().

    Returns:
        list[str]: Suggestion lines; a single congratulatory line when the
        score is already 90 or above.
    """
    # High scores need no advice — return immediately.
    if quality_score >= 90:
        return ["✅ EXCELLENT QUALITY - No improvements needed!"]

    tips = ["🔧 IMPROVEMENT SUGGESTIONS:"]

    # Fewer placeholders survived than the document declared.
    if post_process_results.get('placeholders_verified', 0) < docx_info.get('placeholder_count', 0):
        tips.append(" • Placeholder positioning issues detected - consider document restructuring")

    # Any of the hard-to-convert element classes present?
    has_complex_elements = (docx_info.get('has_textboxes')
                            or docx_info.get('has_smartart')
                            or docx_info.get('has_complex_shapes'))
    if has_complex_elements:
        tips.append(" • Complex elements detected - preprocessing applied but manual review recommended")

    if docx_info.get('table_structure_issues'):
        tips.append(" • Table structure issues found - consider simplifying table layouts")

    # RTL content was expected but no Arabic characters were verified.
    if docx_info.get('rtl_content_detected') and post_process_results.get('arabic_text_verified', 0) == 0:
        tips.append(" • Arabic text verification failed - check font installation")

    total_warnings = (len(pdf_validation.get('warnings', []))
                      + len(post_process_results.get('warnings', [])))
    if total_warnings > 2:
        tips.append(f" • Multiple warnings detected ({total_warnings}) - review document complexity")

    # Escalating advice for progressively worse scores.
    if quality_score < 80:
        tips.append(" • Consider breaking complex document into smaller sections")
        tips.append(" • Verify document is not corrupted in original Word application")

    if quality_score < 70:
        tips.append(" • Document may require manual optimization before conversion")
        tips.append(" • Contact support for complex document handling")

    return tips
|
|
|
|
|
|
|
|
|
def create_libreoffice_config(temp_path):
    """Create comprehensive LibreOffice configuration for PERFECT Arabic RTL formatting preservation.

    Writes a ``registrymodifications.xcu`` profile under
    ``<temp_path>/.config/libreoffice/4/user`` that: disables the first-start
    wizard and Java (avoids javaldx errors), maximizes PDF export quality,
    sets Arabic (ar-SA) locales and CTL (Complex Text Layout) options, maps
    common Windows fonts to locally installed substitutes, and switches off
    Writer auto-formatting so the source layout is preserved.

    Args:
        temp_path (Path): per-conversion temporary directory that will act as
            the LibreOffice process's HOME.

    Returns:
        str: the directory to export as ``HOME`` for the LibreOffice process
        (``temp_path`` itself), so ``$HOME/.config/libreoffice/4/user``
        resolves to the profile written here.
    """
    config_dir = temp_path / ".config" / "libreoffice" / "4" / "user"
    config_dir.mkdir(parents=True, exist_ok=True)

    # Best-effort: make the profile writable for the (possibly sandboxed)
    # LibreOffice process; ignore filesystems that refuse chmod.
    try:
        config_dir.chmod(0o777)
    except OSError:
        pass

    registry_config = config_dir / "registrymodifications.xcu"

    # Pre-create the file with permissive mode; again best-effort only.
    try:
        registry_config.touch()
        registry_config.chmod(0o666)
    except OSError:
        pass

    # NOTE(review): the FontSubstituteTable <it> syntax below may not match
    # the strict XCU set-node schema (set entries normally need named <node>
    # elements with oor:op) — confirm LibreOffice actually honors it.
    config_content = '''<?xml version="1.0" encoding="UTF-8"?>
<oor:items xmlns:oor="http://openoffice.org/2001/registry" xmlns:xs="http://www.w3.org/2001/XMLSchema" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
  <!-- Disable first start wizard and user installation -->
  <item oor:path="/org.openoffice.Setup/Office/Factories/org.openoffice.Setup:Factory['com.sun.star.comp.framework.ProtocolHandler']">
    <prop oor:name="FirstStartWizardCompleted" oor:op="fuse">
      <value>true</value>
    </prop>
  </item>

  <!-- CRITICAL: Completely disable Java to prevent javaldx errors -->
  <item oor:path="/org.openoffice.Setup/Office">
    <prop oor:name="JavaSupport" oor:op="fuse">
      <value>false</value>
    </prop>
  </item>

  <!-- Disable Java security to prevent javaldx errors -->
  <item oor:path="/org.openoffice.Office.Java">
    <prop oor:name="Enabled" oor:op="fuse">
      <value>false</value>
    </prop>
  </item>

  <!-- PDF Export Settings for Maximum Quality with Arabic Support -->
  <item oor:path="/org.openoffice.Office.Common/Filter/PDF/Export">
    <prop oor:name="Quality" oor:op="fuse">
      <value>100</value>
    </prop>
    <prop oor:name="ReduceImageResolution" oor:op="fuse">
      <value>false</value>
    </prop>
    <prop oor:name="MaxImageResolution" oor:op="fuse">
      <value>600</value>
    </prop>
    <prop oor:name="UseTaggedPDF" oor:op="fuse">
      <value>true</value>
    </prop>
    <prop oor:name="ExportFormFields" oor:op="fuse">
      <value>false</value>
    </prop>
    <prop oor:name="FormsType" oor:op="fuse">
      <value>0</value>
    </prop>
    <prop oor:name="AllowDuplicateFieldNames" oor:op="fuse">
      <value>false</value>
    </prop>
    <prop oor:name="EmbedStandardFonts" oor:op="fuse">
      <value>true</value>
    </prop>
    <prop oor:name="FontEmbedding" oor:op="fuse">
      <value>true</value>
    </prop>
    <prop oor:name="CompressMode" oor:op="fuse">
      <value>0</value>
    </prop>
    <prop oor:name="JPEGQuality" oor:op="fuse">
      <value>100</value>
    </prop>
    <prop oor:name="SelectPdfVersion" oor:op="fuse">
      <value>1</value>
    </prop>
    <prop oor:name="ExportBookmarks" oor:op="fuse">
      <value>false</value>
    </prop>
    <prop oor:name="OpenBookmarkLevels" oor:op="fuse">
      <value>-1</value>
    </prop>
  </item>

  <!-- Arabic and RTL Language Support -->
  <item oor:path="/org.openoffice.Office.Linguistic/General">
    <prop oor:name="DefaultLocale" oor:op="fuse">
      <value>ar-SA</value>
    </prop>
    <prop oor:name="DefaultLocale_CJK" oor:op="fuse">
      <value>ar-SA</value>
    </prop>
    <prop oor:name="DefaultLocale_CTL" oor:op="fuse">
      <value>ar-SA</value>
    </prop>
  </item>

  <!-- CTL (Complex Text Layout) Settings for Arabic -->
  <item oor:path="/org.openoffice.Office.Common/I18N/CTL">
    <prop oor:name="CTLFont" oor:op="fuse">
      <value>true</value>
    </prop>
    <prop oor:name="CTLSequenceChecking" oor:op="fuse">
      <value>true</value>
    </prop>
    <prop oor:name="CTLCursorMovement" oor:op="fuse">
      <value>1</value>
    </prop>
    <prop oor:name="CTLTextNumerals" oor:op="fuse">
      <value>1</value>
    </prop>
  </item>

  <!-- Enhanced Font Substitution Settings for Local Arial and Arabic Compatibility -->
  <item oor:path="/org.openoffice.VCL/FontSubstitution">
    <prop oor:name="FontSubstituteTable" oor:op="fuse">
      <value>
        <it>
          <prop oor:name="SubstituteFont">
            <value>Arial</value>
          </prop>
          <prop oor:name="OriginalFont">
            <value>Arial</value>
          </prop>
        </it>
        <it>
          <prop oor:name="SubstituteFont">
            <value>Liberation Sans</value>
          </prop>
          <prop oor:name="OriginalFont">
            <value>Calibri</value>
          </prop>
        </it>
        <it>
          <prop oor:name="SubstituteFont">
            <value>Liberation Serif</value>
          </prop>
          <prop oor:name="OriginalFont">
            <value>Cambria</value>
          </prop>
        </it>
        <it>
          <prop oor:name="SubstituteFont">
            <value>Liberation Serif</value>
          </prop>
          <prop oor:name="OriginalFont">
            <value>Times New Roman</value>
          </prop>
        </it>
        <it>
          <prop oor:name="SubstituteFont">
            <value>Liberation Mono</value>
          </prop>
          <prop oor:name="OriginalFont">
            <value>Courier New</value>
          </prop>
        </it>
        <it>
          <prop oor:name="SubstituteFont">
            <value>Amiri</value>
          </prop>
          <prop oor:name="OriginalFont">
            <value>Traditional Arabic</value>
          </prop>
        </it>
        <it>
          <prop oor:name="SubstituteFont">
            <value>Amiri</value>
          </prop>
          <prop oor:name="OriginalFont">
            <value>Arabic Typesetting</value>
          </prop>
        </it>
        <it>
          <prop oor:name="SubstituteFont">
            <value>Noto Naskh Arabic</value>
          </prop>
          <prop oor:name="OriginalFont">
            <value>Simplified Arabic</value>
          </prop>
        </it>
        <it>
          <prop oor:name="SubstituteFont">
            <value>DejaVu Sans</value>
          </prop>
          <prop oor:name="OriginalFont">
            <value>Tahoma</value>
          </prop>
        </it>
      </value>
    </prop>
  </item>

  <!-- Writer Settings for Perfect Layout Preservation with RTL Support -->
  <item oor:path="/org.openoffice.Office.Writer/Layout/Other">
    <prop oor:name="MeasureUnit" oor:op="fuse">
      <value>6</value>
    </prop>
    <prop oor:name="TabStop" oor:op="fuse">
      <value>1270</value>
    </prop>
    <prop oor:name="IsSquaredPageMode" oor:op="fuse">
      <value>false</value>
    </prop>
    <prop oor:name="ApplyCharUnit" oor:op="fuse">
      <value>false</value>
    </prop>
    <prop oor:name="IsAlignTabStopPosition" oor:op="fuse">
      <value>true</value>
    </prop>
  </item>

  <!-- Enhanced Table Settings for Exact Formatting -->
  <item oor:path="/org.openoffice.Office.Writer/Layout/Table">
    <prop oor:name="Header" oor:op="fuse">
      <value>true</value>
    </prop>
    <prop oor:name="RepeatHeader" oor:op="fuse">
      <value>false</value>
    </prop>
    <prop oor:name="DontSplit" oor:op="fuse">
      <value>true</value>
    </prop>
    <prop oor:name="Border" oor:op="fuse">
      <value>true</value>
    </prop>
    <prop oor:name="InsertLabel" oor:op="fuse">
      <value>false</value>
    </prop>
  </item>

  <!-- Page Layout Settings for A4 and RTL -->
  <item oor:path="/org.openoffice.Office.Writer/Layout/Page">
    <prop oor:name="IsLandscape" oor:op="fuse">
      <value>false</value>
    </prop>
    <prop oor:name="Width" oor:op="fuse">
      <value>21000</value>
    </prop>
    <prop oor:name="Height" oor:op="fuse">
      <value>29700</value>
    </prop>
  </item>

  <!-- Default Font Settings with Local Arial Priority -->
  <item oor:path="/org.openoffice.Office.Writer/DefaultFont">
    <prop oor:name="Document" oor:op="fuse">
      <value>true</value>
    </prop>
    <prop oor:name="Standard" oor:op="fuse">
      <value>Arial;Liberation Sans;DejaVu Sans</value>
    </prop>
    <prop oor:name="Heading" oor:op="fuse">
      <value>Arial;Liberation Sans;DejaVu Sans</value>
    </prop>
    <prop oor:name="List" oor:op="fuse">
      <value>Arial;Liberation Sans;Amiri;Noto Naskh Arabic</value>
    </prop>
    <prop oor:name="Caption" oor:op="fuse">
      <value>Arial;Liberation Sans;DejaVu Sans</value>
    </prop>
    <prop oor:name="Index" oor:op="fuse">
      <value>Arial;Liberation Sans;DejaVu Sans</value>
    </prop>
    <prop oor:name="StandardHeight" oor:op="fuse">
      <value>12</value>
    </prop>
    <prop oor:name="HeadingHeight" oor:op="fuse">
      <value>14</value>
    </prop>
    <prop oor:name="ListHeight" oor:op="fuse">
      <value>13</value>
    </prop>
    <prop oor:name="CaptionHeight" oor:op="fuse">
      <value>12</value>
    </prop>
    <prop oor:name="IndexHeight" oor:op="fuse">
      <value>12</value>
    </prop>
  </item>

  <!-- Disable Auto-formatting Features -->
  <item oor:path="/org.openoffice.Office.Writer/AutoFunction/Format/Option">
    <prop oor:name="UseReplacementTable" oor:op="fuse">
      <value>false</value>
    </prop>
    <prop oor:name="TwoCapitalsAtStart" oor:op="fuse">
      <value>false</value>
    </prop>
    <prop oor:name="CapitalAtStartSentence" oor:op="fuse">
      <value>false</value>
    </prop>
    <prop oor:name="ChgWeightUnderl" oor:op="fuse">
      <value>false</value>
    </prop>
    <prop oor:name="SetInetAttr" oor:op="fuse">
      <value>false</value>
    </prop>
    <prop oor:name="ChgToEnEmDash" oor:op="fuse">
      <value>false</value>
    </prop>
    <prop oor:name="AddNonBrkSpace" oor:op="fuse">
      <value>false</value>
    </prop>
    <prop oor:name="ChgOrdinalNumber" oor:op="fuse">
      <value>false</value>
    </prop>
    <prop oor:name="ChgQuotes" oor:op="fuse">
      <value>false</value>
    </prop>
    <prop oor:name="DelEmptyNode" oor:op="fuse">
      <value>false</value>
    </prop>
  </item>
</oor:items>'''

    try:
        with open(registry_config, 'w', encoding='utf-8') as f:
            f.write(config_content)
    except Exception as e:
        print(f"❌ Failed to write LibreOffice config: {e}")

    # BUGFIX: this previously returned config_dir.parent.parent.parent, i.e.
    # "<temp_path>/.config". The caller exports that value as HOME and derives
    # XDG_CONFIG_HOME = HOME + "/.config", so LibreOffice would look for its
    # profile in "<temp_path>/.config/.config/libreoffice/4/user" and never
    # read the file written above. HOME must be temp_path itself.
    return str(temp_path)
|
|
|
|
|
|
|
|
|
def convert_docx_to_pdf(docx_file):
|
|
|
"""
|
|
|
Convert DOCX to PDF using LibreOffice headless mode
|
|
|
Preserves all formatting including Arabic RTL text
|
|
|
"""
|
|
|
if docx_file is None:
|
|
|
return None, "Please upload a DOCX file"
|
|
|
|
|
|
|
|
|
docx_info = {
|
|
|
'has_tables': False,
|
|
|
'has_images': False,
|
|
|
'text_content_length': 0,
|
|
|
'font_families': []
|
|
|
}
|
|
|
|
|
|
final_output_path = None
|
|
|
try:
|
|
|
|
|
|
print("🔍 Analyzing DOCX structure...")
|
|
|
docx_info = validate_docx_structure(docx_file.name)
|
|
|
|
|
|
|
|
|
output_fd, final_output_path = tempfile.mkstemp(suffix=".pdf", prefix="converted_")
|
|
|
os.close(output_fd)
|
|
|
|
|
|
|
|
|
with tempfile.TemporaryDirectory() as temp_dir:
|
|
|
temp_path = Path(temp_dir)
|
|
|
|
|
|
|
|
|
config_home = create_libreoffice_config(temp_path)
|
|
|
fontconfig_home = create_fontconfig(temp_path)
|
|
|
|
|
|
|
|
|
input_file = temp_path / "input.docx"
|
|
|
shutil.copy2(docx_file.name, input_file)
|
|
|
|
|
|
|
|
|
processed_docx = preprocess_docx_for_perfect_conversion(str(input_file), docx_info)
|
|
|
if processed_docx != str(input_file):
|
|
|
print("🔧 Using preprocessed DOCX for conversion")
|
|
|
input_file = Path(processed_docx)
|
|
|
|
|
|
|
|
|
needs_aggressive_optimization = (
|
|
|
docx_info.get('has_textboxes', False) or
|
|
|
docx_info.get('has_smartart', False) or
|
|
|
docx_info.get('has_complex_shapes', False) or
|
|
|
len(docx_info.get('table_structure_issues', [])) > 2 or
|
|
|
docx_info.get('text_content_length', 0) > 100000
|
|
|
)
|
|
|
|
|
|
if needs_aggressive_optimization:
|
|
|
print("⚠️ Complex document detected - applying aggressive optimization settings")
|
|
|
|
|
|
conversion_timeout = 180
|
|
|
else:
|
|
|
conversion_timeout = 120
|
|
|
|
|
|
|
|
|
|
|
|
pdf_export_settings = {
|
|
|
|
|
|
"Quality": 100,
|
|
|
"ReduceImageResolution": False,
|
|
|
"MaxImageResolution": 600,
|
|
|
"BitmapResolution": 600,
|
|
|
"ImageResolution": 600,
|
|
|
"JPEGQuality": 100,
|
|
|
"CompressMode": 0,
|
|
|
|
|
|
|
|
|
"EmbedStandardFonts": True,
|
|
|
"FontEmbedding": True,
|
|
|
"UseTaggedPDF": True,
|
|
|
"EnableTextAccessForAccessibilityTools": True,
|
|
|
|
|
|
|
|
|
"ExportFormFields": False,
|
|
|
"FormsType": 0,
|
|
|
"ExportBookmarks": False,
|
|
|
"ExportNotes": False,
|
|
|
"ExportNotesPages": False,
|
|
|
"ExportOnlyNotesPages": False,
|
|
|
"ExportPlaceholders": False,
|
|
|
"ExportHiddenSlides": False,
|
|
|
"SinglePageSheets": False,
|
|
|
"UseTransitionEffects": False,
|
|
|
"IsSkipEmptyPages": False,
|
|
|
"IsAddStream": False,
|
|
|
"AllowDuplicateFieldNames": False,
|
|
|
|
|
|
|
|
|
"ColorMode": 0,
|
|
|
"Watermark": "",
|
|
|
"EncryptFile": False,
|
|
|
"DocumentOpenPassword": "",
|
|
|
"PermissionPassword": "",
|
|
|
"RestrictPermissions": False,
|
|
|
"Printing": 2,
|
|
|
"Changes": 4,
|
|
|
"EnableCopyingOfContent": True,
|
|
|
"SelectPdfVersion": 1,
|
|
|
"ExportLinksRelativeFsys": False,
|
|
|
"PDFViewSelection": 0,
|
|
|
"ConvertOOoTargetToPDFTarget": False,
|
|
|
"ExportBookmarksToPDFDestination": False,
|
|
|
|
|
|
|
|
|
"PreserveEditingInPDF": False,
|
|
|
"ExportFormFieldsAsWidgets": False,
|
|
|
"FormsFormat": 0,
|
|
|
"SubmitFormat": 0,
|
|
|
"AllowDuplicateFieldNames": False,
|
|
|
"ExportEmptyPages": True,
|
|
|
"ViewPDFAfterExport": False,
|
|
|
|
|
|
|
|
|
"UseReferenceXObject": False,
|
|
|
"HideViewerMenubar": False,
|
|
|
"HideViewerToolbar": False,
|
|
|
"HideViewerWindowControls": False,
|
|
|
"ResizeWindowToInitialPage": False,
|
|
|
"CenterWindow": False,
|
|
|
"OpenInFullScreenMode": False,
|
|
|
"DisplayPDFDocumentTitle": False,
|
|
|
|
|
|
|
|
|
"ExportNotesInMargin": False,
|
|
|
"ConvertOOoTargetToPDFTarget": False,
|
|
|
"ExportLinksRelativeFsys": False,
|
|
|
"PDFViewSelection": 0,
|
|
|
"Magnification": 0,
|
|
|
"PageLayout": 0,
|
|
|
"FirstPageOnLeft": False,
|
|
|
"InitialView": 0,
|
|
|
"Magnification": 0
|
|
|
}
|
|
|
|
|
|
|
|
|
pdf_filter = f'pdf:writer_pdf_Export:{json.dumps(pdf_export_settings, separators=(",", ":"))}'
|
|
|
|
|
|
|
|
|
|
|
|
cmd = [
|
|
|
"libreoffice",
|
|
|
"--headless",
|
|
|
"--invisible",
|
|
|
"--nodefault",
|
|
|
"--nolockcheck",
|
|
|
"--nologo",
|
|
|
"--norestore",
|
|
|
"--nofirststartwizard",
|
|
|
"--safe-mode",
|
|
|
|
|
|
"--disable-extension-update",
|
|
|
"--disable-webupdate",
|
|
|
"--disable-remote-control",
|
|
|
"--disable-notification",
|
|
|
"--disable-oop4all",
|
|
|
"--convert-to", pdf_filter,
|
|
|
"--outdir", str(temp_path),
|
|
|
str(input_file)
|
|
|
]
|
|
|
|
|
|
|
|
|
env = os.environ.copy()
|
|
|
env['HOME'] = config_home
|
|
|
env['XDG_CONFIG_HOME'] = config_home + "/.config"
|
|
|
|
|
|
|
|
|
fontconfig_dir = fontconfig_home + "/.config/fontconfig"
|
|
|
env['FONTCONFIG_PATH'] = fontconfig_dir
|
|
|
env['FONTCONFIG_FILE'] = fontconfig_dir + "/fonts.conf"
|
|
|
|
|
|
|
|
|
script_dir = Path(__file__).parent.absolute()
|
|
|
if 'FONTPATH' in env:
|
|
|
env['FONTPATH'] = f"{script_dir}:{env['FONTPATH']}"
|
|
|
|
|
|
else:
|
|
|
env['FONTPATH'] = str(script_dir)
|
|
|
|
|
|
env['LANG'] = 'ar_SA.UTF-8'
|
|
|
env['LC_ALL'] = 'ar_SA.UTF-8'
|
|
|
env['LC_CTYPE'] = 'ar_SA.UTF-8'
|
|
|
env['LC_NUMERIC'] = 'ar_SA.UTF-8'
|
|
|
env['LC_TIME'] = 'ar_SA.UTF-8'
|
|
|
env['LC_COLLATE'] = 'ar_SA.UTF-8'
|
|
|
env['LC_MONETARY'] = 'ar_SA.UTF-8'
|
|
|
env['LC_MESSAGES'] = 'ar_SA.UTF-8'
|
|
|
env['LC_PAPER'] = 'ar_SA.UTF-8'
|
|
|
env['LC_NAME'] = 'ar_SA.UTF-8'
|
|
|
env['LC_ADDRESS'] = 'ar_SA.UTF-8'
|
|
|
env['LC_TELEPHONE'] = 'ar_SA.UTF-8'
|
|
|
env['LC_MEASUREMENT'] = 'ar_SA.UTF-8'
|
|
|
env['LC_IDENTIFICATION'] = 'ar_SA.UTF-8'
|
|
|
|
|
|
env['SAL_USE_VCLPLUGIN'] = 'svp'
|
|
|
env['DISPLAY'] = ':99'
|
|
|
|
|
|
env['OOO_FORCE_DESKTOP'] = 'gnome'
|
|
|
env['SAL_NO_MOUSEGRABS'] = '1'
|
|
|
|
|
|
env['SAL_RTL_ENABLED'] = '1'
|
|
|
env['OOO_DISABLE_RECOVERY'] = '1'
|
|
|
|
|
|
|
|
|
env['SAL_DISABLE_JAVA_SECURITY'] = '1'
|
|
|
env['SAL_DISABLE_JAVA'] = '1'
|
|
|
env['SAL_JAVA_DISABLE_SECURITY'] = '1'
|
|
|
|
|
|
|
|
|
env['UNO_PATH'] = '/usr/lib/libreoffice/program'
|
|
|
|
|
|
|
|
|
env['LIBO_JAVA_PARALLEL'] = '0'
|
|
|
env['LIBO_DISABLE_JAVA'] = '1'
|
|
|
|
|
|
|
|
|
env['SAL_DISABLE_OPENCL'] = '1'
|
|
|
env['SAL_DISABLE_VCLPLUGIN'] = '1'
|
|
|
|
|
|
print(f"🚀 Executing LibreOffice conversion with MAXIMUM quality settings...")
|
|
|
print(f"Command: {' '.join(cmd[:11])}... [truncated for readability]")
|
|
|
print(f"Environment: HOME={env.get('HOME', 'default')}, LANG={env.get('LANG', 'default')}")
|
|
|
|
|
|
result = subprocess.run(
|
|
|
cmd,
|
|
|
capture_output=True,
|
|
|
text=True,
|
|
|
timeout=conversion_timeout,
|
|
|
cwd=temp_path,
|
|
|
env=env
|
|
|
)
|
|
|
|
|
|
print(f"📊 LibreOffice execution completed:")
|
|
|
print(f" • Return code: {result.returncode}")
|
|
|
print(f" • Output length: {len(result.stdout)} chars")
|
|
|
print(f" • Error length: {len(result.stderr)} chars")
|
|
|
|
|
|
if result.stdout:
|
|
|
print(f" • LibreOffice stdout: {result.stdout[:200]}...")
|
|
|
if result.stderr:
|
|
|
print(f" • LibreOffice stderr: {result.stderr[:200]}...")
|
|
|
|
|
|
|
|
|
if result.returncode != 0:
|
|
|
print("⚠️ LibreOffice conversion failed, trying fallback with unoconv...")
|
|
|
try:
|
|
|
unoconv_cmd = [
|
|
|
"unoconv",
|
|
|
"-f", "pdf",
|
|
|
"-o", str(temp_path),
|
|
|
str(input_file)
|
|
|
]
|
|
|
|
|
|
print(f"🚀 Executing unoconv conversion...")
|
|
|
print(f"Command: {' '.join(unoconv_cmd)}")
|
|
|
|
|
|
unoconv_result = subprocess.run(
|
|
|
unoconv_cmd,
|
|
|
capture_output=True,
|
|
|
text=True,
|
|
|
timeout=conversion_timeout,
|
|
|
cwd=temp_path,
|
|
|
env=env
|
|
|
)
|
|
|
|
|
|
print(f"📊 unoconv execution completed:")
|
|
|
print(f" • Return code: {unoconv_result.returncode}")
|
|
|
print(f" • Output length: {len(unoconv_result.stdout)} chars")
|
|
|
print(f" • Error length: {len(unoconv_result.stderr)} chars")
|
|
|
|
|
|
if unoconv_result.stdout:
|
|
|
print(f" • unoconv stdout: {unoconv_result.stdout[:200]}...")
|
|
|
if unoconv_result.stderr:
|
|
|
print(f" • unoconv stderr: {unoconv_result.stderr[:200]}...")
|
|
|
|
|
|
|
|
|
if unoconv_result.returncode == 0:
|
|
|
result = unoconv_result
|
|
|
print("✅ unoconv conversion successful")
|
|
|
else:
|
|
|
print("❌ unoconv conversion also failed")
|
|
|
except Exception as unoconv_error:
|
|
|
print(f"❌ unoconv conversion error: {unoconv_error}")
|
|
|
|
|
|
if result.returncode != 0:
|
|
|
|
|
|
error_analysis = analyze_conversion_error(result.stderr, result.stdout, docx_info)
|
|
|
error_msg = f"❌ Conversion failed with detailed analysis:\n\n"
|
|
|
error_msg += f"🔍 Error Analysis:\n{error_analysis}\n\n"
|
|
|
error_msg += f"📋 Technical Details:\n"
|
|
|
error_msg += f"• Return Code: {result.returncode}\n"
|
|
|
error_msg += f"• LibreOffice Error: {result.stderr[:300]}...\n"
|
|
|
error_msg += f"• Document Info: Tables={docx_info['has_tables']}, Images={docx_info['has_images']}\n"
|
|
|
|
|
|
print(f"❌ CONVERSION FAILED: {error_msg}")
|
|
|
|
|
|
|
|
|
if final_output_path:
|
|
|
try:
|
|
|
os.unlink(final_output_path)
|
|
|
except:
|
|
|
pass
|
|
|
return None, error_msg
|
|
|
|
|
|
|
|
|
print(f"Looking for PDF files in: {temp_path}")
|
|
|
all_files = list(temp_path.iterdir())
|
|
|
print(f"Files in temp directory: {all_files}")
|
|
|
|
|
|
|
|
|
pdf_files = [f for f in all_files if f.suffix.lower() == '.pdf']
|
|
|
|
|
|
if not pdf_files:
|
|
|
|
|
|
if final_output_path:
|
|
|
try:
|
|
|
os.unlink(final_output_path)
|
|
|
except:
|
|
|
pass
|
|
|
return None, f"No PDF file was generated by LibreOffice. Files found: {[f.name for f in all_files]}"
|
|
|
|
|
|
|
|
|
temp_pdf = pdf_files[0]
|
|
|
print(f"✅ Found PDF file: {temp_pdf}")
|
|
|
|
|
|
if not temp_pdf.exists():
|
|
|
|
|
|
if final_output_path:
|
|
|
try:
|
|
|
os.unlink(final_output_path)
|
|
|
except:
|
|
|
pass
|
|
|
return None, "PDF file was not generated by LibreOffice"
|
|
|
|
|
|
|
|
|
shutil.copy2(temp_pdf, final_output_path)
|
|
|
|
|
|
|
|
|
print("🔍 Validating PDF output...")
|
|
|
pdf_validation = validate_pdf_output(final_output_path, docx_info)
|
|
|
|
|
|
print("🔧 Post-processing PDF for perfect formatting...")
|
|
|
post_process_results = post_process_pdf_for_perfect_formatting(final_output_path, docx_info)
|
|
|
|
|
|
|
|
|
quality_report = generate_comprehensive_quality_report(docx_info, pdf_validation, post_process_results)
|
|
|
quality_score = calculate_quality_score(docx_info, pdf_validation, post_process_results)
|
|
|
|
|
|
|
|
|
if quality_score >= 95:
|
|
|
success_msg = f"🌟 EXCELLENT conversion with {quality_score:.1f}% formatting accuracy!\n\n"
|
|
|
elif quality_score >= 85:
|
|
|
success_msg = f"✅ HIGH-QUALITY conversion with {quality_score:.1f}% formatting accuracy!\n\n"
|
|
|
elif quality_score >= 75:
|
|
|
success_msg = f"👍 GOOD conversion with {quality_score:.1f}% formatting accuracy!\n\n"
|
|
|
else:
|
|
|
success_msg = f"⚠️ Conversion completed with {quality_score:.1f}% accuracy - improvements suggested!\n\n"
|
|
|
|
|
|
success_msg += quality_report
|
|
|
|
|
|
|
|
|
if quality_score < 80:
|
|
|
success_msg += f"\n\n💡 TIP: For better results, try simplifying the document structure or removing complex elements before conversion."
|
|
|
|
|
|
return final_output_path, success_msg
|
|
|
|
|
|
except subprocess.TimeoutExpired:
|
|
|
|
|
|
timeout_msg = "⏱️ Conversion timed out - Document is too complex for current processing limits\n\n"
|
|
|
timeout_msg += "🔍 Timeout Analysis:\n"
|
|
|
timeout_msg += f"• Document has tables: {docx_info.get('has_tables', 'Unknown')}\n"
|
|
|
timeout_msg += f"• Document has images: {docx_info.get('has_images', 'Unknown')}\n"
|
|
|
timeout_msg += f"• Text content length: {docx_info.get('text_content_length', 'Unknown')} characters\n"
|
|
|
timeout_msg += f"• Font families detected: {len(docx_info.get('font_families', []))}\n\n"
|
|
|
timeout_msg += "💡 Suggestions:\n"
|
|
|
timeout_msg += "• Try with a simpler document first\n"
|
|
|
timeout_msg += "• Remove complex tables or images temporarily\n"
|
|
|
timeout_msg += "• Split large documents into smaller sections\n"
|
|
|
timeout_msg += "• Ensure document is not corrupted\n"
|
|
|
|
|
|
print(f"❌ TIMEOUT ERROR: {timeout_msg}")
|
|
|
|
|
|
|
|
|
if final_output_path:
|
|
|
try:
|
|
|
os.unlink(final_output_path)
|
|
|
except:
|
|
|
pass
|
|
|
return None, timeout_msg
|
|
|
except Exception as e:
|
|
|
|
|
|
exception_msg = f"❌ Unexpected error during conversion\n\n"
|
|
|
exception_msg += f"🔍 Error Details:\n"
|
|
|
exception_msg += f"• Error Type: {type(e).__name__}\n"
|
|
|
exception_msg += f"• Error Message: {str(e)}\n"
|
|
|
|
|
|
if 'docx_info' in locals():
|
|
|
exception_msg += f"• Document Analysis:\n"
|
|
|
exception_msg += f" - Has tables: {docx_info.get('has_tables', 'Unknown')}\n"
|
|
|
exception_msg += f" - Has images: {docx_info.get('has_images', 'Unknown')}\n"
|
|
|
exception_msg += f" - Content length: {docx_info.get('text_content_length', 'Unknown')}\n"
|
|
|
|
|
|
exception_msg += f"\n💡 Recovery Suggestions:\n"
|
|
|
exception_msg += f"• Verify the DOCX file is not corrupted\n"
|
|
|
exception_msg += f"• Try opening the file in Microsoft Word first\n"
|
|
|
exception_msg += f"• Ensure the file is a valid .docx format\n"
|
|
|
exception_msg += f"• Check file size is reasonable (< 50MB)\n"
|
|
|
exception_msg += f"• Try with a simpler test document\n"
|
|
|
|
|
|
print(f"❌ EXCEPTION ERROR: {exception_msg}")
|
|
|
print(f"Full exception details: {repr(e)}")
|
|
|
|
|
|
|
|
|
if final_output_path:
|
|
|
try:
|
|
|
os.unlink(final_output_path)
|
|
|
except:
|
|
|
pass
|
|
|
return None, exception_msg
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|