Spaces:

mobenta
/

doc2pdf

Sleeping

App Files Community

mobenta committed on Oct 19, 2025

Commit

e45c224

verified ·

1 Parent(s): bcfb2da

Update app.py

Browse files

Files changed (1) hide show

app.py +70 -55

app.py CHANGED Viewed

@@ -1,117 +1,132 @@
 # app.py
 import gradio as gr
-from docx2pdf import convert
 import os
 import zipfile
 import shutil
 from pathlib import Path
 # Set up temporary directories
-TEMP_DIR = Path("./temp_files")
 TEMP_INPUT_DIR = TEMP_DIR / "input"
 TEMP_OUTPUT_DIR = TEMP_DIR / "output"
-# Ensure directories exist
-for d in [TEMP_INPUT_DIR, TEMP_OUTPUT_DIR]:
-    d.mkdir(parents=True, exist_ok=True)
 def convert_docs_to_pdf(doc_files):
     """
-    Takes a list of uploaded docx files, converts them to PDF,
     and zips the results for download.
     """
     if not doc_files:
-        return None, "Please upload one or more .docx or .doc files."
-    # 1. Clean up and prepare directories for a new conversion run
-    try:
-        if TEMP_INPUT_DIR.exists(): shutil.rmtree(TEMP_INPUT_DIR)
-        if TEMP_OUTPUT_DIR.exists(): shutil.rmtree(TEMP_OUTPUT_DIR)
-        TEMP_INPUT_DIR.mkdir(parents=True)
-        TEMP_OUTPUT_DIR.mkdir(parents=True)
-    except Exception as e:
-        return None, f"Error preparing directories: {e}"
     success_count = 0
-    # 2. Convert each file
     for file_obj in doc_files:
         original_filepath = file_obj.name
         filename = Path(original_filepath).name
-        # Determine the target output path for the PDF
-        output_filename = filename.rsplit('.', 1)[0] + '.pdf'
-        output_filepath = TEMP_OUTPUT_DIR / output_filename
-        try:
-            # Copy file to a temp input dir, which can be useful if docx2pdf
-            # has issues with temporary Gradio paths on some systems.
-            input_file_copy = TEMP_INPUT_DIR / filename
-            shutil.copy(original_filepath, input_file_copy)
-            # Perform the conversion
-            # The 'output_file' parameter specifies the single output PDF path.
-            # When converting a single file, this works.
-            # Note: docx2pdf handles doc and docx automatically.
-            convert(input_file_copy, output_filepath)
             success_count += 1
-        except Exception as e:
-            print(f"Error converting {filename}: {e}")
-            # Optionally, you could write a placeholder PDF to inform the user of the failure
     if success_count == 0:
-        return None, "No files were converted successfully. Ensure they are valid .docx or .doc files."
-    # 3. Zip the results
     zip_filename = TEMP_DIR / "converted_pdfs.zip"
-    # Check if the zip file already exists and remove it
-    if zip_filename.exists():
-        os.remove(zip_filename)
-    with zipfile.ZipFile(zip_filename, 'w') as zipf:
         for file in TEMP_OUTPUT_DIR.iterdir():
-            # Add files from the output directory to the zip file
             zipf.write(file, arcname=file.name)
-    # Return the path to the zip file for Gradio to offer as a download
-    return str(zip_filename), f"Successfully converted {success_count} files and zipped them."
 # --- Gradio Interface Definition ---
-# Use gr.Blocks for a more flexible layout
 with gr.Blocks(title="Multi DOC/DOCX to PDF Converter") as demo:
     gr.Markdown(
         """
         # Multi DOC/DOCX to PDF Converter 📄➡️📜
         Upload multiple Microsoft Word files (.doc or .docx) and get them all converted to PDF in a single downloadable ZIP file.
-        **Note:** This app relies on the `docx2pdf` library and LibreOffice on the backend for accurate formatting preservation.
         """
     )
     with gr.Row():
-        # Input component: File component set to accept multiple files
         file_input = gr.File(
             file_count="multiple",
             label="Upload Word Files (.docx or .doc)",
             file_types=[".doc", ".docx"]
         )
-        # Output components
         with gr.Column():
-            download_zip = gr.File(label="Download Converted PDFs (ZIP)", visible=False)
             status_message = gr.Textbox(label="Status", value="Upload your files and click Convert.", interactive=False)
-    convert_button = gr.Button("Convert to PDF", variant="primary")
     # Connect the button click to the conversion function
     convert_button.click(
         fn=convert_docs_to_pdf,
         inputs=[file_input],
-        outputs=[download_zip, status_message],
-        # Show the download component only after successful conversion
-        postprocess=[lambda x: gr.update(visible=True)]
     )
 if __name__ == "__main__":

 # app.py
 import gradio as gr
 import os
 import zipfile
 import shutil
+import subprocess
 from pathlib import Path
+# --- Configuration ---
 # Set up temporary directories
+TEMP_DIR = Path("./temp_conversion_data")
 TEMP_INPUT_DIR = TEMP_DIR / "input"
 TEMP_OUTPUT_DIR = TEMP_DIR / "output"
+def setup_conversion_dirs():
+    """Cleans up and ensures all required directories exist before a new conversion."""
+    try:
+        if TEMP_DIR.exists():
+            shutil.rmtree(TEMP_DIR)
+        TEMP_INPUT_DIR.mkdir(parents=True, exist_ok=True)
+        TEMP_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
+        return True
+    except Exception as e:
+        print(f"Error setting up directories: {e}")
+        return False
+def convert_single_file_with_unoconv(input_path, output_dir):
+    """
+    Converts a single DOC/DOCX to PDF using the unoconv command-line utility.
+    This is the most reliable method on a Linux environment like Hugging Face Spaces.
+    """
+    filename = Path(input_path).name
+    try:
+        # Command: unoconv -f pdf -o [output_dir] [input_file]
+        result = subprocess.run(
+            ['unoconv', '-f', 'pdf', '-o', str(output_dir), str(input_path)],
+            check=True,  # Raises CalledProcessError on non-zero exit code
+            capture_output=True,
+            text=True,
+            timeout=60 # Timeout for a single conversion (60 seconds)
+        )
+        print(f"Successfully converted {filename}. Output: {result.stdout}")
+        return True
+    except subprocess.CalledProcessError as e:
+        # Detailed error log for debugging
+        print(f"UNOCONV FAILED for {filename}. Stderr: {e.stderr}, Stdout: {e.stdout}")
+        return False
+    except subprocess.TimeoutExpired:
+        print(f"Conversion of {filename} timed out.")
+        return False
+    except Exception as e:
+        print(f"An unexpected error occurred during conversion of {filename}: {e}")
+        return False
 def convert_docs_to_pdf(doc_files):
     """
+    Takes a list of uploaded docx files, converts them to PDF using unoconv,
     and zips the results for download.
     """
     if not doc_files:
+        return gr.update(visible=False), "Please upload one or more .docx or .doc files."
+    if not setup_conversion_dirs():
+        return gr.update(visible=False), "Error: Could not set up temporary directories."
     success_count = 0
+    total_count = len(doc_files)
+    # 1. Process each uploaded file
     for file_obj in doc_files:
         original_filepath = file_obj.name
         filename = Path(original_filepath).name
+        # Copy the file to the clean input directory for unoconv
+        input_file_copy = TEMP_INPUT_DIR / filename
+        shutil.copy(original_filepath, input_file_copy)
+        if convert_single_file_with_unoconv(input_file_copy, TEMP_OUTPUT_DIR):
             success_count += 1
     if success_count == 0:
+        # Hide download component if conversion failed
+        return gr.update(visible=False), "No files were converted successfully. Check the Space logs for details."
+    # 2. Zip the successful results
     zip_filename = TEMP_DIR / "converted_pdfs.zip"
+    with zipfile.ZipFile(zip_filename, 'w', zipfile.ZIP_DEFLATED) as zipf:
+        # Only zip the successfully created PDF files
         for file in TEMP_OUTPUT_DIR.iterdir():
+            # The PDF file name will be the original name with a .pdf extension
             zipf.write(file, arcname=file.name)
+    # 3. Return the results
+    status = f"✅ Successfully converted {success_count} of {total_count} files and zipped them. Download ready."
+    # Show download component and return zip path
+    return gr.update(value=str(zip_filename), visible=True), status
 # --- Gradio Interface Definition ---
 with gr.Blocks(title="Multi DOC/DOCX to PDF Converter") as demo:
     gr.Markdown(
         """
         # Multi DOC/DOCX to PDF Converter 📄➡️📜
         Upload multiple Microsoft Word files (.doc or .docx) and get them all converted to PDF in a single downloadable ZIP file.
+        **Fixes**: This version uses the **`unoconv`** utility with LibreOffice for reliable conversion on Hugging Face's Linux backend, resolving the `docx2pdf` error.
         """
     )
     with gr.Row():
         file_input = gr.File(
             file_count="multiple",
             label="Upload Word Files (.docx or .doc)",
             file_types=[".doc", ".docx"]
         )
         with gr.Column():
+            convert_button = gr.Button("Convert to PDF", variant="primary")
             status_message = gr.Textbox(label="Status", value="Upload your files and click Convert.", interactive=False)
+            download_zip = gr.File(label="Download Converted PDFs (ZIP)", visible=False)
     # Connect the button click to the conversion function
     convert_button.click(
         fn=convert_docs_to_pdf,
         inputs=[file_input],
+        outputs=[download_zip, status_message]
     )
 if __name__ == "__main__":