|
|
|
|
|
""" |
|
|
Enhanced DOCX to PDF Converter |
|
|
Professional FastAPI Backend with Docker Support |
|
|
""" |
|
|
|
|
|
import os |
|
|
import tempfile |
|
|
import shutil |
|
|
import subprocess |
|
|
import logging |
|
|
import uuid |
|
|
from pathlib import Path |
|
|
from typing import Optional, List |
|
|
import base64 |
|
|
import json |
|
|
|
|
|
from fastapi import FastAPI, File, UploadFile, Form, HTTPException, BackgroundTasks |
|
|
from fastapi.responses import FileResponse, JSONResponse |
|
|
from fastapi.middleware.cors import CORSMiddleware |
|
|
from pydantic import BaseModel |
|
|
|
|
|
|
|
|
# Root logging configuration for the service: INFO level, with timestamp,
# logger name and severity in front of every message.
_LOG_FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
logging.basicConfig(format=_LOG_FORMAT, level=logging.INFO)

# Module-level logger used by every function in this file.
logger = logging.getLogger(__name__)
|
|
|
|
|
# The ASGI application object; the metadata below is passed straight to FastAPI.
app = FastAPI(
    version="2.0.0",
    title="Enhanced DOCX to PDF Converter",
    description="Professional API for converting DOCX files to PDF with perfect formatting preservation",
)
|
|
|
|
|
|
|
|
# CORS policy: every origin, method and header is accepted, with credentials
# enabled.
# NOTE(review): wildcard origins combined with allow_credentials=True is very
# permissive -- confirm this is intended outside of development.
_cors_options = dict(
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
app.add_middleware(CORSMiddleware, **_cors_options)
|
|
|
|
|
|
|
|
# Upper bound on an accepted document, in bytes (50 MiB).
MAX_FILE_SIZE = 50 * 1024 ** 2

# MIME types accepted as DOCX input.
SUPPORTED_MIME_TYPES = [
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
]
|
|
|
|
|
class ConversionRequest(BaseModel):
    """One document submitted as base64 text together with its file name."""

    # Base64-encoded bytes of the DOCX document.
    file_content: str
    # Name of the document; the converters require a .docx extension.
    filename: str
|
|
|
|
|
class BatchConversionRequest(BaseModel):
    """Payload of the batch endpoint: a list of base64 conversion requests."""

    # Documents to convert; each one is processed independently.
    files: List[ConversionRequest]
|
|
|
|
|
class ConversionResponse(BaseModel):
    """Outcome of a single conversion attempt."""

    # True when a PDF was produced.
    success: bool
    # Relative download URL of the PDF; set by the endpoints on success.
    pdf_url: Optional[str] = None
    # Human-readable status text; set by the endpoints on success.
    message: Optional[str] = None
    # Failure description; set by the endpoints when success is False.
    error: Optional[str] = None
|
|
|
|
|
def setup_libreoffice():
    """Probe the ``libreoffice`` binary and report whether it is usable.

    Runs ``libreoffice --version`` with a 10 second timeout and logs the
    reported version.

    Returns:
        True when the command exits with status 0, False on a non-zero exit
        status or on any exception (missing binary, timeout, ...).
    """
    probe_cmd = ["libreoffice", "--version"]
    try:
        probe = subprocess.run(probe_cmd, capture_output=True, text=True, timeout=10)
        # A non-zero exit is routed through the same error path as a crash.
        if probe.returncode != 0:
            raise Exception("LibreOffice not found or not working")
        version_text = probe.stdout.strip()
        logger.info(f"LibreOffice version: {version_text}")
    except Exception as e:
        logger.error(f"LibreOffice setup error: {e}")
        return False
    return True
|
|
|
|
|
def convert_docx_to_pdf(input_path: str, output_path: str) -> bool:
    """Convert a DOCX file to PDF using headless LibreOffice.

    LibreOffice only accepts an output *directory* and names the result after
    the input file's stem, so the produced file is moved to ``output_path``
    when the requested file name differs.

    Args:
        input_path: Path of the DOCX file to convert.
        output_path: Path where the resulting PDF must end up.

    Returns:
        True when the PDF exists at ``output_path`` afterwards; False when
        LibreOffice exits non-zero, produces no PDF, times out (120 s) or any
        other error occurs. Failures are logged, never raised.
    """
    # A bare file name has no directory part; use the current directory
    # instead of handing LibreOffice an empty --outdir argument.
    out_dir = os.path.dirname(output_path) or "."
    input_stem = os.path.splitext(os.path.basename(input_path))[0]
    # Where LibreOffice will actually write the PDF.
    produced_path = os.path.join(out_dir, input_stem + ".pdf")

    cmd = [
        "libreoffice",
        "--headless",
        "--convert-to", "pdf",
        "--outdir", out_dir,
        input_path,
    ]

    try:
        result = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            timeout=120,
        )

        if result.returncode != 0:
            logger.error(f"Conversion failed: {result.stderr}")
            return False

        if not os.path.exists(produced_path):
            logger.error("PDF file was not created")
            return False

        # Honour the caller's requested file name when it differs from the
        # name LibreOffice derived from the input.
        if os.path.abspath(produced_path) != os.path.abspath(output_path):
            shutil.move(produced_path, output_path)

        logger.info(f"Successfully converted {input_path} to {output_path}")
        return True

    except subprocess.TimeoutExpired:
        logger.error("Conversion timed out")
        return False
    except Exception as e:
        logger.error(f"Conversion error: {e}")
        return False
|
|
|
|
|
def validate_file(file_path: str, mime_type: str) -> bool:
    """Check that a file on disk is an acceptable DOCX upload.

    Args:
        file_path: Path of the file on disk; must end in ``.docx``
            (case-insensitive).
        mime_type: MIME type reported for the file; must be listed in
            ``SUPPORTED_MIME_TYPES``.

    Returns:
        True when the extension and MIME type are accepted and the file is no
        larger than ``MAX_FILE_SIZE``; False otherwise, including when the
        file does not exist or cannot be inspected.
    """
    # Cheap string checks first so an obviously wrong request never touches
    # the filesystem.
    if mime_type not in SUPPORTED_MIME_TYPES:
        return False

    if not file_path.lower().endswith('.docx'):
        return False

    try:
        size = os.path.getsize(file_path)
    except OSError:
        # A missing or unreadable file is invalid, not a server error.
        return False

    return size <= MAX_FILE_SIZE
|
|
|
|
|
@app.on_event("startup")
async def startup_event():
    """Prepare the service when the application starts.

    Probes LibreOffice (a failure is only logged as a warning) and makes sure
    the ``/tmp/conversions`` working directory exists.
    """
    logger.info("Starting Enhanced DOCX to PDF Converter...")

    libreoffice_ready = setup_libreoffice()
    if not libreoffice_ready:
        logger.warning("LibreOffice setup failed - conversions may not work")

    # Working directory shared by the conversion and download endpoints.
    os.makedirs("/tmp/conversions", exist_ok=True)

    logger.info("Application started successfully")
|
|
|
|
|
@app.get("/health")
async def health_check():
    """Report a static health status together with the service version."""
    payload = dict(status="healthy", version="2.0.0")
    return payload
|
|
|
|
|
@app.post("/convert", response_model=ConversionResponse)
async def convert_docx(
    background_tasks: BackgroundTasks,
    file: Optional[UploadFile] = File(None),
    file_content: Optional[str] = Form(None),
    filename: Optional[str] = Form(None)
):
    """
    Convert DOCX to PDF

    Supports two input methods:
    1. Multipart file upload (file parameter)
    2. Base64 encoded content (file_content and filename parameters)

    Args:
        background_tasks: Injected by FastAPI; currently unused and kept for
            interface compatibility.
        file: Uploaded DOCX document (multipart).
        file_content: Base64-encoded DOCX bytes (form field).
        filename: Name of the base64 document; must end in ``.docx``.

    Returns:
        ConversionResponse carrying the relative download URL of the PDF.

    Raises:
        HTTPException: 400 for missing or invalid input, 500 when the
            conversion fails or an unexpected error occurs.
    """
    # Imported at function scope so the handler needs no new file-level import.
    from fastapi.concurrency import run_in_threadpool

    temp_dir = None
    converted = False

    try:
        # Per-request working directory; its name becomes the download id.
        temp_dir = tempfile.mkdtemp(dir="/tmp/conversions")

        if file and file.filename:
            # Keep only the final path component of the client-supplied name
            # so it cannot escape the working directory.
            safe_name = os.path.basename(file.filename.replace("\\", "/"))
            if not safe_name.lower().endswith('.docx'):
                raise HTTPException(status_code=400, detail="Invalid file type or size")

            # Read one byte past the limit: enough to detect an oversized
            # upload without buffering all of it.
            content = await file.read(MAX_FILE_SIZE + 1)
            if len(content) > MAX_FILE_SIZE:
                raise HTTPException(status_code=400, detail="Invalid file type or size")

            input_path = os.path.join(temp_dir, safe_name)
            with open(input_path, "wb") as buffer:
                buffer.write(content)

            # Validate the file that now exists on disk, not the bare
            # client-side file name.
            if not validate_file(input_path, file.content_type or ""):
                raise HTTPException(status_code=400, detail="Invalid file type or size")

        elif file_content and filename:
            safe_name = os.path.basename(filename.replace("\\", "/"))
            if not safe_name.lower().endswith('.docx'):
                raise HTTPException(status_code=400, detail="Filename must have .docx extension")

            try:
                file_data = base64.b64decode(file_content)
            except ValueError:
                # binascii.Error is a ValueError subclass.
                raise HTTPException(status_code=400, detail="Invalid base64 content") from None

            input_path = os.path.join(temp_dir, safe_name)
            with open(input_path, "wb") as buffer:
                buffer.write(file_data)

            if not validate_file(input_path, "application/vnd.openxmlformats-officedocument.wordprocessingml.document"):
                raise HTTPException(status_code=400, detail="Invalid file content")

        else:
            raise HTTPException(status_code=400, detail="Either file or file_content+filename must be provided")

        output_filename = os.path.splitext(safe_name)[0] + ".pdf"
        output_path = os.path.join(temp_dir, output_filename)

        # The conversion blocks on a subprocess for up to two minutes; run it
        # in the thread pool so the event loop keeps serving other requests.
        converted = await run_in_threadpool(convert_docx_to_pdf, input_path, output_path)
        if not converted:
            raise HTTPException(status_code=500, detail="Conversion failed")

        pdf_url = f"/download/{os.path.basename(temp_dir)}/{output_filename}"
        return ConversionResponse(
            success=True,
            pdf_url=pdf_url,
            message="Conversion successful"
        )

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Conversion error: {e}")
        raise HTTPException(status_code=500, detail=f"Conversion failed: {str(e)}")
    finally:
        # Keep the directory only when a PDF is waiting to be downloaded;
        # every failure path removes it so rejected requests leave nothing
        # behind.
        if temp_dir and not converted:
            shutil.rmtree(temp_dir, ignore_errors=True)
|
|
|
|
|
@app.get("/download/{temp_id}/{filename}")
async def download_pdf(temp_id: str, filename: str):
    """Download converted PDF file

    Args:
        temp_id: Name of the per-conversion directory under
            ``/tmp/conversions``.
        filename: Name of the PDF inside that directory.

    Returns:
        FileResponse streaming the PDF.

    Raises:
        HTTPException: 404 when the path leaves the conversions directory, is
            not a ``.pdf`` or is not an existing regular file; 500 on
            unexpected errors.
    """
    try:
        base_dir = os.path.realpath("/tmp/conversions")
        # Resolve "..", symlinks and similar before trusting the path.
        file_path = os.path.realpath(os.path.join(base_dir, temp_id, filename))

        # Both path parameters are client-controlled: refuse anything that
        # resolves outside the conversions directory.
        inside_base = os.path.commonpath([base_dir, file_path]) == base_dir
        # The working directory also holds the uploaded DOCX; only PDFs are
        # served here.
        is_pdf = file_path.lower().endswith('.pdf')

        if not (inside_base and is_pdf and os.path.isfile(file_path)):
            raise HTTPException(status_code=404, detail="File not found")

        return FileResponse(
            path=file_path,
            filename=os.path.basename(file_path),
            media_type='application/pdf'
        )
    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Download error: {e}")
        raise HTTPException(status_code=500, detail="Download failed")
|
|
|
|
|
@app.post("/convert/batch", response_model=List[ConversionResponse])
async def batch_convert(request: BatchConversionRequest):
    """
    Batch convert multiple DOCX files to PDF

    Each entry is processed independently; a failure is reported in that
    entry's result and does not abort the rest of the batch.

    Args:
        request: Batch payload holding base64 documents and their file names.

    Returns:
        One ConversionResponse per submitted file, in request order.
    """
    # Imported at function scope so the handler needs no new file-level import.
    from fastapi.concurrency import run_in_threadpool

    docx_mime = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    results = []

    for file_req in request.files:
        temp_dir = None
        converted = False
        try:
            # Decode before touching the disk so bad input creates no directory.
            try:
                file_data = base64.b64decode(file_req.file_content)
            except ValueError:
                # binascii.Error is a ValueError subclass.
                results.append(ConversionResponse(
                    success=False,
                    error="Invalid base64 content"
                ))
                continue

            # Keep only the final path component of the client-supplied name
            # so it cannot point outside the working directory.
            safe_name = os.path.basename(file_req.filename.replace("\\", "/"))
            if not safe_name.lower().endswith('.docx') or len(file_data) > MAX_FILE_SIZE:
                results.append(ConversionResponse(
                    success=False,
                    error="Invalid file content"
                ))
                continue

            temp_dir = tempfile.mkdtemp(dir="/tmp/conversions")
            input_path = os.path.join(temp_dir, safe_name)
            with open(input_path, "wb") as buffer:
                buffer.write(file_data)

            if not validate_file(input_path, docx_mime):
                results.append(ConversionResponse(
                    success=False,
                    error="Invalid file content"
                ))
                continue

            output_filename = os.path.splitext(safe_name)[0] + ".pdf"
            output_path = os.path.join(temp_dir, output_filename)

            # Run the blocking LibreOffice call in the thread pool so the
            # event loop stays responsive during a long batch.
            converted = await run_in_threadpool(convert_docx_to_pdf, input_path, output_path)
            if converted:
                pdf_url = f"/download/{os.path.basename(temp_dir)}/{output_filename}"
                results.append(ConversionResponse(
                    success=True,
                    pdf_url=pdf_url,
                    message="Conversion successful"
                ))
            else:
                results.append(ConversionResponse(
                    success=False,
                    error="Conversion failed"
                ))

        except Exception as e:
            logger.error(f"Batch conversion error: {e}")
            results.append(ConversionResponse(
                success=False,
                error=str(e)
            ))
        finally:
            # Only a successful conversion keeps its directory, because the
            # download endpoint serves the PDF from it.
            if temp_dir and not converted:
                shutil.rmtree(temp_dir, ignore_errors=True)

    return results
|
|
|
|
|
# Script entry point: serve the app with uvicorn on all interfaces, port 8000.
if __name__ == "__main__":
    from uvicorn import run as serve

    serve(app, host="0.0.0.0", port=8000)