Spaces:
Running
Running
| import os | |
| import time | |
| import fitz # PyMuPDF | |
| from docx import Document | |
| from docx.shared import Pt, RGBColor, Inches | |
| from docx.enum.text import WD_PARAGRAPH_ALIGNMENT | |
| from PIL import Image | |
| import io | |
| import gradio as gr | |
| import subprocess | |
# Install required system dependencies
def install_dependencies():
    """Install the system packages used by the converter via apt-get.

    Runs ``apt-get update`` followed by
    ``apt-get install -y poppler-utils libreoffice`` with their output
    discarded.

    Installation is best-effort: a missing ``apt-get`` executable or a
    non-zero exit status is reported on stdout instead of raising, so a
    host without apt (or without root) can still import this module.
    """
    commands = (
        ["apt-get", "update"],
        ["apt-get", "install", "-y", "poppler-utils", "libreoffice"],
    )
    for command in commands:
        printable = " ".join(command)
        try:
            result = subprocess.run(
                command,
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL,
            )
        except OSError as e:
            # apt-get itself could not be started (e.g. not a Debian-based
            # host); the remaining commands would fail the same way.
            print(f"Could not run '{printable}': {e}")
            return
        if result.returncode != 0:
            # Keep going: an install may still succeed from a cached index
            # even when the update step failed.
            print(f"'{printable}' exited with status {result.returncode}")


# Executed at import time so the packages are requested before the UI starts.
install_dependencies()
# PyMuPDF span "flags" bits used below. Bit 2 means "serifed", not underline;
# the span flags carry no underline information at all.
_SPAN_FLAG_ITALIC = 1 << 1
_SPAN_FLAG_BOLD = 1 << 4

# Image modes Pillow can write to PNG without conversion.
_PNG_MODES = frozenset({"1", "L", "LA", "I", "I;16", "P", "RGB", "RGBA"})


def _docx_output_path(filename):
    """Return ``filename`` with its extension replaced by ``_converted.docx``."""
    # splitext only touches the final extension, so ".PDF" uploads and
    # directory names containing ".pdf" can no longer yield a path that is
    # wrong or identical to the input file.
    stem, _extension = os.path.splitext(filename)
    return f"{stem}_converted.docx"


def _convert_with_pdf2docx(pdf_path, docx_path):
    """Convert ``pdf_path`` to ``docx_path`` with pdf2docx (primary method)."""
    from pdf2docx import Converter

    cv = Converter(pdf_path)
    try:
        cv.convert(docx_path,
                   start=0,
                   end=None,
                   keep_layout=True)
    finally:
        # Release the converter even when the conversion raises.
        cv.close()


def _convert_with_libreoffice(pdf_path, docx_path):
    """Convert ``pdf_path`` to ``docx_path`` with headless LibreOffice (fallback).

    Raises:
        Exception: if LibreOffice did not produce the expected DOCX file.
    """
    # Imported locally: the file's import block does not provide these.
    import shutil
    import tempfile

    # A private output directory keeps concurrent conversions of equally
    # named uploads from overwriting each other's intermediate file.
    with tempfile.TemporaryDirectory() as out_dir:
        subprocess.run(
            ["libreoffice", "--headless",
             # PDFs open in Draw by default, which has no DOCX export;
             # force the Writer PDF import filter instead.
             "--infilter=writer_pdf_import",
             "--convert-to", "docx", "--outdir", out_dir, pdf_path],
            stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
        stem = os.path.splitext(os.path.basename(pdf_path))[0]
        temp_docx = os.path.join(out_dir, stem + ".docx")
        if not os.path.exists(temp_docx):
            raise Exception("LibreOffice conversion failed")
        # shutil.move also works when the two paths are on different
        # filesystems, where os.rename would fail.
        shutil.move(temp_docx, docx_path)


def _style_run_from_span(run, span):
    """Copy font name, size, colour, bold and italic from a PDF span to a run."""
    run.font.name = span["font"]
    run.font.size = Pt(span["size"])
    if "color" in span:
        # The colour is a packed 0xRRGGBB integer.
        color = span["color"]
        run.font.color.rgb = RGBColor(
            (color >> 16) & 0xFF, (color >> 8) & 0xFF, color & 0xFF)
    flags = span["flags"]
    run.font.bold = bool(flags & _SPAN_FLAG_BOLD)
    run.font.italic = bool(flags & _SPAN_FLAG_ITALIC)
    # Underline is deliberately left as the converter produced it.


def _apply_span_styles(doc, pdf_doc):
    """Restyle Word runs whose text contains the text of a PDF span."""
    # Snapshot paragraph text and runs once instead of re-reading the whole
    # document for every span; styling does not change any text.
    paragraphs = [(p.text, list(p.runs)) for p in doc.paragraphs]
    for page in pdf_doc:
        for block in page.get_text("dict")["blocks"]:
            # Blocks without a "lines" key are skipped.
            for line in block.get("lines", ()):
                for span in line["spans"]:
                    span_text = span["text"].strip()
                    if not span_text:
                        continue
                    # NOTE: matching is by substring, so a very short span
                    # can restyle unrelated runs that contain the same text.
                    for paragraph_text, runs in paragraphs:
                        if span_text not in paragraph_text:
                            continue
                        for run in runs:
                            if span_text in run.text:
                                _style_run_from_span(run, span)


def _append_pdf_images(doc, pdf_doc):
    """Append every image referenced by the PDF pages to the end of ``doc``."""
    for page in pdf_doc:
        for img in page.get_images(full=True):
            xref = img[0]
            image_bytes = pdf_doc.extract_image(xref)["image"]
            image = Image.open(io.BytesIO(image_bytes))
            if image.mode not in _PNG_MODES:
                # e.g. CMYK, which the PNG writer rejects.
                image = image.convert("RGB")
            # An in-memory buffer avoids a fixed /tmp file name that
            # concurrent requests could clobber.
            buffer = io.BytesIO()
            image.save(buffer, format="PNG")
            buffer.seek(0)
            # Pixel dimensions are interpreted as 1/72 inch each.
            doc.add_picture(buffer,
                            width=Inches(image.width / 72),
                            height=Inches(image.height / 72))


def _enhance_formatting(docx_path, pdf_path):
    """Re-open the converted DOCX, apply PDF span styles and images, save it."""
    doc = Document(docx_path)
    pdf_doc = fitz.open(pdf_path)
    try:
        _apply_span_styles(doc, pdf_doc)
        _append_pdf_images(doc, pdf_doc)
    finally:
        pdf_doc.close()
    # Saved only when every step succeeded, so a failure leaves the basic
    # conversion untouched.
    doc.save(docx_path)


def convert_pdf_to_word(pdf_file, filename):
    """Convert a PDF to a Word document and return the DOCX path.

    The PDF is converted with pdf2docx; if that fails for any reason,
    headless LibreOffice is tried instead. The result is then post-processed
    with PyMuPDF (font name/size/colour/bold/italic and images); a failure in
    that step is logged and the basic conversion is kept.

    Args:
        pdf_file: object whose ``name`` attribute is the path of the PDF.
        filename: path used to derive the output name; its extension is
            replaced by ``_converted.docx``.

    Returns:
        Path of the generated DOCX file.

    Raises:
        Exception: with a ``"Conversion failed: ..."`` message when neither
            conversion method produced a document.
    """
    try:
        pdf_path = pdf_file.name
        docx_path = _docx_output_path(filename)

        try:
            _convert_with_pdf2docx(pdf_path, docx_path)
        except Exception as e:
            print(f"Primary conversion method failed: {e}, trying fallback...")
            _convert_with_libreoffice(pdf_path, docx_path)

        # Enhance the output with PyMuPDF for better formatting (best-effort).
        try:
            _enhance_formatting(docx_path, pdf_path)
        except Exception as e:
            print(f"Formatting enhancement failed: {e}, using basic conversion")

        return docx_path
    except Exception as e:
        raise Exception(f"Conversion failed: {str(e)}") from e
def process_pdf(file):
    """Gradio callback: validate the upload, convert it and report timing.

    Args:
        file: the uploaded file; must be truthy and expose the file path as
            ``name``.

    Returns:
        A ``(docx_path, status_message)`` tuple.

    Raises:
        gr.Error: when no file was uploaded, the name does not end in
            ``.pdf`` (case-insensitive), or the conversion fails.
    """
    if not file:
        raise gr.Error("Please upload a PDF file first")
    if not file.name.lower().endswith('.pdf'):
        raise gr.Error("Please upload a PDF file")

    # perf_counter is monotonic, so the elapsed time cannot be distorted by
    # system clock adjustments.
    start_time = time.perf_counter()
    try:
        output_path = convert_pdf_to_word(file, file.name)
    except Exception as e:
        message = str(e)
        # The converter already prefixes its errors; avoid showing the user
        # "Conversion failed: Conversion failed: ...".
        if not message.startswith("Conversion failed"):
            message = f"Conversion failed: {message}"
        raise gr.Error(message) from e
    conversion_time = time.perf_counter() - start_time
    return output_path, f"✅ Conversion completed in {conversion_time:.1f} seconds"
# Create Gradio interface
# NOTE(review): the nesting below was reconstructed from text that had lost
# its indentation — confirm the row/column layout against the running app.
with gr.Blocks(title="PDF to Word Converter") as demo:
    # Page heading and a one-line description.
    gr.Markdown("# PDF to Word Converter")
    gr.Markdown("Upload a PDF file and convert it to an editable Word document while preserving formatting.")
    with gr.Row():
        with gr.Column():
            # Input side: PDF upload restricted to ".pdf" plus the trigger button.
            file_input = gr.File(label="Upload PDF", file_types=[".pdf"])
            convert_btn = gr.Button("Convert to Word", variant="primary")
        with gr.Column():
            # Output side: status text and the converted file.
            status = gr.Textbox(label="Status")
            file_output = gr.File(label="Download Word File")
    # process_pdf returns (docx path, status message); the outputs list is in
    # that same order.
    convert_btn.click(
        fn=process_pdf,
        inputs=file_input,
        outputs=[file_output, status]
    )
# Start the app only when the file is run as a script.
if __name__ == "__main__":
    demo.launch()