mobenta committed on
Commit
e45c224
·
verified ·
1 Parent(s): bcfb2da

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +70 -55
app.py CHANGED
@@ -1,117 +1,132 @@
1
  # app.py
2
  import gradio as gr
3
- from docx2pdf import convert
4
  import os
5
  import zipfile
6
  import shutil
 
7
  from pathlib import Path
8
 
 
# Set up temporary directories
# All scratch data lives under one root, relative to the working directory.
TEMP_DIR = Path("./temp_files")
TEMP_INPUT_DIR = TEMP_DIR / "input"    # copies of the uploaded Word files
TEMP_OUTPUT_DIR = TEMP_DIR / "output"  # PDFs produced by the conversion

# Ensure directories exist
# parents=True also creates TEMP_DIR; exist_ok=True makes re-imports harmless.
for d in (TEMP_INPUT_DIR, TEMP_OUTPUT_DIR):
    d.mkdir(parents=True, exist_ok=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
 
def convert_docs_to_pdf(doc_files):
    """
    Convert uploaded Word files to PDF with docx2pdf and zip the results.

    Args:
        doc_files: Uploaded files from the Gradio file input. Each item may be
            an object exposing its path as ``.name`` or a plain path string.
            ``None`` or an empty list means nothing was uploaded.

    Returns:
        A ``(zip_path, status_message)`` tuple. ``zip_path`` is the archive
        path as a string, or ``None`` when nothing could be converted.
    """
    if not doc_files:
        return None, "Please upload one or more .docx or .doc files."

    # 1. Clean up and prepare directories for a new conversion run
    try:
        for directory in (TEMP_INPUT_DIR, TEMP_OUTPUT_DIR):
            if directory.exists():
                shutil.rmtree(directory)
            directory.mkdir(parents=True)
    except Exception as e:
        return None, f"Error preparing directories: {e}"

    success_count = 0
    used_stems = set()

    # 2. Convert each file
    for file_obj in doc_files:
        # Accept both file-like uploads (path in .name) and plain path strings.
        original_filepath = getattr(file_obj, "name", file_obj)
        source = Path(original_filepath)
        filename = source.name

        # Two uploads sharing a stem (e.g. a.doc and a.docx) would both map to
        # a.pdf and overwrite each other, so later ones get a numeric suffix.
        stem = source.stem
        duplicate_index = 1
        while stem.casefold() in used_stems:
            stem = f"{source.stem}_{duplicate_index}"
            duplicate_index += 1
        used_stems.add(stem.casefold())

        # Copy into our own input directory; docx2pdf can have issues with
        # temporary Gradio paths on some systems.
        input_file_copy = TEMP_INPUT_DIR / f"{stem}{source.suffix}"
        output_filepath = TEMP_OUTPUT_DIR / f"{stem}.pdf"

        try:
            shutil.copy(original_filepath, input_file_copy)
            convert(input_file_copy, output_filepath)
        except Exception as e:
            # Best effort: log and carry on with the remaining files.
            print(f"Error converting {filename}: {e}")
            continue

        # Only count conversions that actually left a PDF behind.
        if output_filepath.is_file():
            success_count += 1
        else:
            print(f"Error converting {filename}: no PDF was produced")

    if success_count == 0:
        return None, "No files were converted successfully. Ensure they are valid .docx or .doc files."

    # 3. Zip the results
    zip_filename = TEMP_DIR / "converted_pdfs.zip"

    # Mode 'w' truncates an archive left over from a previous run, so no
    # explicit removal is needed.
    with zipfile.ZipFile(zip_filename, 'w') as zipf:
        for file in sorted(TEMP_OUTPUT_DIR.iterdir()):
            if file.is_file():
                zipf.write(file, arcname=file.name)

    # Return the path to the zip file for Gradio to offer as a download
    return str(zip_filename), f"Successfully converted {success_count} files and zipped them."
 
 
 
80
 
81
  # --- Gradio Interface Definition ---
82
- # Use gr.Blocks for a more flexible layout
def _convert_and_reveal(doc_files):
    """Run the conversion and show the download box only when a ZIP exists."""
    zip_path, message = convert_docs_to_pdf(doc_files)
    return gr.update(value=zip_path, visible=zip_path is not None), message


# NOTE(review): the nesting of Row/Column below is reconstructed; confirm the
# intended layout against the running app.
with gr.Blocks(title="Multi DOC/DOCX to PDF Converter") as demo:
    gr.Markdown(
        """
        # Multi DOC/DOCX to PDF Converter 📄➡️📜
        Upload multiple Microsoft Word files (.doc or .docx) and get them all converted to PDF in a single downloadable ZIP file.

        **Note:** This app relies on the `docx2pdf` library and LibreOffice on the backend for accurate formatting preservation.
        """
    )

    with gr.Row():
        # Input component: File component set to accept multiple files
        file_input = gr.File(
            file_count="multiple",
            label="Upload Word Files (.docx or .doc)",
            file_types=[".doc", ".docx"]
        )

        # Output components
        with gr.Column():
            # Hidden until a conversion succeeds; revealed by the click handler.
            download_zip = gr.File(label="Download Converted PDFs (ZIP)", visible=False)
            status_message = gr.Textbox(label="Status", value="Upload your files and click Convert.", interactive=False)

    convert_button = gr.Button("Convert to PDF", variant="primary")

    # Connect the button click to the conversion function.
    # Visibility is driven by the handler's return value: the event listener's
    # `postprocess` argument is a boolean flag, not a list of callbacks, so a
    # lambda passed there is never invoked.
    convert_button.click(
        fn=_convert_and_reveal,
        inputs=[file_input],
        outputs=[download_zip, status_message],
    )
116
 
117
  if __name__ == "__main__":
 
1
  # app.py
2
  import gradio as gr
 
3
  import os
4
  import zipfile
5
  import shutil
6
+ import subprocess
7
  from pathlib import Path
8
 
# --- Configuration ---
# Set up temporary directories
# Everything for one conversion run lives under TEMP_DIR, which is resolved
# relative to the process's working directory.
TEMP_DIR = Path("./temp_conversion_data")
TEMP_INPUT_DIR = TEMP_DIR / "input"    # copies of the uploaded Word files
TEMP_OUTPUT_DIR = TEMP_DIR / "output"  # PDFs written by the converter
14
 
def setup_conversion_dirs():
    """
    Reset the scratch area so a conversion run starts from empty directories.

    Removes TEMP_DIR and everything under it (including any ZIP stored there
    by an earlier run), then recreates the input and output directories.

    Returns:
        True when the directories are ready, False if the filesystem
        operations failed (the error is printed to the log).
    """
    try:
        if TEMP_DIR.exists():
            shutil.rmtree(TEMP_DIR)

        for directory in (TEMP_INPUT_DIR, TEMP_OUTPUT_DIR):
            directory.mkdir(parents=True, exist_ok=True)
    except OSError as e:
        # rmtree and mkdir report failures as OSError subclasses; anything
        # else would be a programming error and should not be hidden.
        print(f"Error setting up directories: {e}")
        return False
    return True
27
+
def convert_single_file_with_unoconv(input_path, output_dir, timeout=60):
    """
    Convert a single DOC/DOCX file to PDF with the unoconv command-line tool.

    Args:
        input_path: Path of the Word document to convert.
        output_dir: Directory that should receive the generated PDF.
        timeout: Maximum number of seconds to wait for unoconv (default 60).

    Returns:
        True if unoconv exited successfully and the PDF exists in
        ``output_dir``; False otherwise. Failures are printed, never raised.
    """
    source = Path(input_path)
    filename = source.name
    # With a directory as -o target, unoconv names the result after the input
    # file's stem, so this is where the PDF should appear.
    expected_pdf = Path(output_dir) / f"{source.stem}.pdf"

    try:
        # Command: unoconv -f pdf -o [output_dir] [input_file]
        result = subprocess.run(
            ['unoconv', '-f', 'pdf', '-o', str(output_dir), str(input_path)],
            check=True,  # Raises CalledProcessError on non-zero exit code
            capture_output=True,
            text=True,
            timeout=timeout,
        )
    except FileNotFoundError:
        # The process could not be started at all, typically because unoconv
        # is not installed or not on PATH.
        print(f"UNOCONV FAILED for {filename}. The 'unoconv' executable could not be launched.")
        return False
    except subprocess.CalledProcessError as e:
        # Detailed error log for debugging
        print(f"UNOCONV FAILED for {filename}. Stderr: {e.stderr}, Stdout: {e.stdout}")
        return False
    except subprocess.TimeoutExpired:
        print(f"Conversion of {filename} timed out.")
        return False
    except Exception as e:
        print(f"An unexpected error occurred during conversion of {filename}: {e}")
        return False

    # A zero exit code alone does not prove a file was written.
    if not expected_pdf.is_file():
        print(f"UNOCONV FAILED for {filename}. No PDF was created at {expected_pdf}.")
        return False

    print(f"Successfully converted {filename}. Output: {result.stdout}")
    return True
56
 
def convert_docs_to_pdf(doc_files):
    """
    Convert uploaded Word files to PDF using unoconv and zip the results.

    Args:
        doc_files: Uploaded files from the Gradio file input. Each item may be
            an object exposing its path as ``.name`` or a plain path string.
            ``None`` or an empty list means nothing was uploaded.

    Returns:
        A ``(download_update, status_message)`` tuple. ``download_update`` is a
        ``gr.update`` for the download component: hidden on failure, or
        visible and pointing at the ZIP archive on success.
    """
    if not doc_files:
        return gr.update(visible=False), "Please upload one or more .docx or .doc files."

    if not setup_conversion_dirs():
        return gr.update(visible=False), "Error: Could not set up temporary directories."

    success_count = 0
    total_count = len(doc_files)
    used_stems = set()

    # 1. Process each uploaded file
    for file_obj in doc_files:
        # Accept both file-like uploads (path in .name) and plain path strings.
        original_filepath = getattr(file_obj, "name", file_obj)
        source = Path(original_filepath)

        # Uploads sharing a stem (e.g. a.doc and a.docx) would produce the same
        # PDF name and overwrite each other, so later ones get a numeric suffix.
        stem = source.stem
        duplicate_index = 1
        while stem.casefold() in used_stems:
            stem = f"{source.stem}_{duplicate_index}"
            duplicate_index += 1
        used_stems.add(stem.casefold())

        # Copy the file to the clean input directory for unoconv
        input_file_copy = TEMP_INPUT_DIR / f"{stem}{source.suffix}"
        try:
            shutil.copy(original_filepath, input_file_copy)
        except OSError as e:
            # One unreadable upload must not abort the rest of the batch.
            print(f"Could not copy {source.name} for conversion: {e}")
            continue

        if convert_single_file_with_unoconv(input_file_copy, TEMP_OUTPUT_DIR):
            success_count += 1

    if success_count == 0:
        # Hide download component if conversion failed
        return gr.update(visible=False), "No files were converted successfully. Check the Space logs for details."

    # 2. Zip the successful results
    zip_filename = TEMP_DIR / "converted_pdfs.zip"

    with zipfile.ZipFile(zip_filename, 'w', zipfile.ZIP_DEFLATED) as zipf:
        # The output directory was emptied at the start of this run, so it
        # holds only this run's results; sorted() keeps the order stable.
        for file in sorted(TEMP_OUTPUT_DIR.iterdir()):
            if file.is_file():
                zipf.write(file, arcname=file.name)

    # 3. Return the results
    status = f"Successfully converted {success_count} of {total_count} files and zipped them. Download ready."

    # Show download component and return zip path
    return gr.update(value=str(zip_filename), visible=True), status
101
 
102
  # --- Gradio Interface Definition ---
 
# NOTE(review): the nesting of Row/Column below is reconstructed; confirm the
# intended layout against the running app.
with gr.Blocks(title="Multi DOC/DOCX to PDF Converter") as demo:
    gr.Markdown(
        """
        # Multi DOC/DOCX to PDF Converter 📄➡️📜
        Upload multiple Microsoft Word files (.doc or .docx) and get them all converted to PDF in a single downloadable ZIP file.

        **Fixes**: This version uses the **`unoconv`** utility with LibreOffice for reliable conversion on Hugging Face's Linux backend, resolving the `docx2pdf` error.
        """
    )

    with gr.Row():
        # Input: accepts several Word documents at once.
        file_input = gr.File(
            file_count="multiple",
            label="Upload Word Files (.docx or .doc)",
            file_types=[".doc", ".docx"]
        )

        with gr.Column():
            convert_button = gr.Button("Convert to PDF", variant="primary")
            status_message = gr.Textbox(label="Status", value="Upload your files and click Convert.", interactive=False)
            # Starts hidden; the handler's gr.update return value reveals it.
            download_zip = gr.File(label="Download Converted PDFs (ZIP)", visible=False)

    # Connect the button click to the conversion function
    convert_button.click(
        fn=convert_docs_to_pdf,
        inputs=[file_input],
        outputs=[download_zip, status_message]
    )
131
 
132
  if __name__ == "__main__":