Spaces:

salman555
/

PDFtoDOCX

Running

App Files Files Community

salman555 committed on Jun 7, 2025

Commit

2147e0b

verified ·

1 Parent(s): cac96e4

Upload 5 files

Browse files

Files changed (5) hide show

.gitignore +27 -0
LICENSE +21 -0
README.md +88 -0
app.py +160 -0
requirements.txt +5 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,27 @@

+# Python cache
+__pycache__/
+*.py[cod]
+*$py.class
+# Virtual environments
+.env/
+.venv/
+# Distribution / packaging
+build/
+dist/
+*.egg-info/
+# IDE/editor
+.vscode/
+.idea/
+# OS files
+.DS_Store
+Thumbs.db
+# Generated docs / outputs
+*.zip
+*.docx
+*.pdf

LICENSE ADDED Viewed

	@@ -0,0 +1,21 @@

+MIT License
+Copyright (c) 2025 [Salman Alfarisi]
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

README.md ADDED Viewed

	@@ -0,0 +1,88 @@

+---
+title: PDFtoDOCX
+emoji: 👀
+colorFrom: pink
+colorTo: pink
+sdk: gradio
+sdk_version: 5.33.0
+app_file: app.py
+pinned: false
+license: mit
+---
+# PDF→Word Converter
+A simple Gradio‐based web app to convert one or more PDF files into DOCX,
+format all tables as **Table Grid**, extract any URLs and append them at the end,
+and—if you upload multiple PDFs—bundle all outputs into a ZIP.
+---
+## 🚀 Features
+- **Batch or single** PDF → DOCX conversion
+- Automatically style all tables as **Table Grid**
+- Extracts both annotation and inline URLs into a “Daftar Link” (Indonesian for “Link List”) section
+- If multiple PDFs are uploaded, outputs are packaged into a ZIP
+---
+## 📦 Installation
+1. **Clone this repo**
+   ```bash
+   git clone https://github.com/salmanalfarisi11/pdf-to-docx.git
+   cd pdf-to-docx
+   ```
+2. Create and activate a virtual environment:
+   ```bash
+    python -m venv .venv
+    source .venv/bin/activate   # Linux/macOS
+    .venv\Scripts\activate      # Windows
+   ```
+3. Install dependencies:
+   ```bash
+   pip install -r requirements.txt
+   ```
+## 🚀 Running Locally
+Launch the app on your machine:
+   ```bash
+   python app.py
+   ```
+By default, it will start on http://127.0.0.1:7860/. Open that URL in your browser to access the interface.
+## 🎯 Usage
+1. Open the localhost URL shown in your terminal.
+2. Drag & drop one or more PDF files into the Upload PDF(s) panel.
+3. Click Convert.
+4. When ready, click ⬇️ Download Output to download either a single DOCX or a ZIP of all DOCXs.
+## 🛠️ Dependencies
+- Python 3.10+ (required by Gradio 5)
+- Gradio ≥ 5.33.0
+- PyMuPDF
+- pdf2docx
+- python-docx
+## 📄 License
+This project is licensed under the [MIT License](LICENSE).
+---
+## 🖋️ Author & Credits
+Developed by **[Salman Alfarisi](https://github.com/salmanalfarisi11)** © 2025
+- GitHub: [salmanalfarisi11](https://github.com/salmanalfarisi11)
+- LinkedIn: [salmanalfarisi11](https://linkedin.com/in/salmanalfarisi11)
+- Instagram: [faris.salman111](https://instagram.com/faris.salman111)

app.py ADDED Viewed

	@@ -0,0 +1,160 @@

+import re
+import fitz                         # pip install PyMuPDF
+from pdf2docx import Converter     # pip install pdf2docx
+from docx import Document          # pip install python-docx
+import tempfile, os, shutil, zipfile
+import gradio as gr
+# Regex pattern for URLs: matches "http://" or "https://" followed by one or
+# more characters that are not whitespace, <, >, a quote, ')' or ']'.
+URL_PATTERN = re.compile(r'(https?://[^\s<>"\'\)\]]+)')
+def convert_pdf_to_word(pdf_file) -> str:
+    """
+    Mengonversi satu PDF ke DOCX, menambahkan daftar link di akhir jika ada,
+    lalu mengembalikan path ke file .docx.
+    """
+    workdir = tempfile.mkdtemp()
+    try:
+        # 1) Baca PDF & nama file asli
+        if hasattr(pdf_file, 'read'):
+            data = pdf_file.read()
+            orig_name = getattr(pdf_file, 'name', 'output.pdf')
+        elif isinstance(pdf_file, dict) and 'name' in pdf_file:
+            with open(pdf_file['name'], 'rb') as f:
+                data = f.read()
+            orig_name = pdf_file['name']
+        elif isinstance(pdf_file, str):
+            with open(pdf_file, 'rb') as f:
+                data = f.read()
+            orig_name = pdf_file
+        else:
+            raise ValueError("Unsupported input type")
+        base_name = os.path.splitext(os.path.basename(orig_name))[0]
+        # 2) Simpan PDF sementara
+        pdf_path = os.path.join(workdir, 'input.pdf')
+        with open(pdf_path, 'wb') as f:
+            f.write(data)
+        # 3) Konversi ke DOCX
+        temp_docx = os.path.join(workdir, 'output.docx')
+        cv = Converter(pdf_path)
+        cv.convert(temp_docx, start=0, end=None)
+        cv.close()
+        # 4) Format semua tabel
+        doc = Document(temp_docx)
+        for table in doc.tables:
+            table.style = 'Table Grid'
+        doc.save(temp_docx)
+        # 5) Ekstrak semua link (annotation + regex)
+        links = []
+        pdf_doc = fitz.open(pdf_path)
+        for page in pdf_doc:
+            for annot in page.annots() or []:
+                uri = annot.info.get('uri')
+                if uri:
+                    u = uri.rstrip('.,;:)]')
+                    if u not in links:
+                        links.append(u)
+            text = page.get_text('text')
+            for m in URL_PATTERN.findall(text):
+                u = m.rstrip('.,;:)]')
+                if u not in links:
+                    links.append(u)
+        pdf_doc.close()
+        # 6) Tambahkan daftar link di DOCX jika ada
+        if links:
+            doc = Document(temp_docx)
+            doc.add_page_break()
+            doc.add_heading('Daftar Link', level=2)
+            for u in links:
+                doc.add_paragraph(u)
+            doc.save(temp_docx)
+        # 7) Salin ke file akhir dengan nama asli
+        final_path = os.path.join(workdir, f"{base_name}.docx")
+        shutil.copy(temp_docx, final_path)
+        return final_path
+    finally:
+        # jangan dihapus agar Gradio masih bisa mengakses file
+        pass
+def convert_and_enable(pdf_files):
+    """
+    Menerima list PDF, memproses semuanya,
+    lalu mengembalikan gr.update() dengan path .docx tunggal
+    atau .zip jika lebih dari satu file.
+    """
+    out_paths = [convert_pdf_to_word(pdf) for pdf in pdf_files]
+    # Jika >1, bungkus ke ZIP
+    if len(out_paths) > 1:
+        zip_dir = tempfile.mkdtemp()
+        zip_path = os.path.join(zip_dir, "converted_docs.zip")
+        with zipfile.ZipFile(zip_path, "w") as zf:
+            for p in out_paths:
+                zf.write(p, arcname=os.path.basename(p))
+        return gr.update(value=zip_path, interactive=True)
+    # Jika hanya satu, langsung .docx
+    return gr.update(value=out_paths[0], interactive=True)
+def reset_download(_):
+    """
+    Reset DownloadButton ke state awal ketika input berubah/clear.
+    """
+    return gr.update(value=None, interactive=False)
+# CSS that makes the #convert-btn and #download-btn elements full-width
+css = """
+#convert-btn, #download-btn {
+    width: 100%;
+}
+"""
+# Build the UI: a PDF upload area, a Convert button and a Download button
+# that stays disabled until a conversion result is available.
+with gr.Blocks(css=css, title="PDF→Word Converter") as demo:
+    gr.Markdown("# PDF→Word Converter 🎉")
+    gr.Markdown("Upload satu atau lebih PDF, lalu tekan Convert untuk mendapatkan DOCX atau ZIP.")
+    # Upload one or more PDFs (only .pdf files are accepted)
+    pdf_inputs = gr.Files(
+        label="Upload PDF(s)",
+        file_types=['.pdf']
+    )
+    # Convert button
+    convert_btn = gr.Button(
+        "Convert",
+        variant="primary",
+        elem_id="convert-btn"
+    )
+    # Download button (disabled until an output is ready)
+    download_btn = gr.DownloadButton(
+        label="⬇️ Download Output",
+        value=None,
+        interactive=False,
+        variant="primary",
+        elem_id="download-btn"
+    )
+    # Reset the download button whenever the input changes or is cleared
+    pdf_inputs.change(
+        fn=reset_download,
+        inputs=pdf_inputs,
+        outputs=download_btn
+    )
+    # Run the conversion when Convert is clicked, then enable the download
+    convert_btn.click(
+        fn=convert_and_enable,
+        inputs=pdf_inputs,
+        outputs=download_btn
+    )
+    gr.Markdown("---\nBuilt with ❤️ using Gradio dan PyMuPDF.")
+if __name__ == "__main__":
+    demo.launch(share=True)

requirements.txt ADDED Viewed

	@@ -0,0 +1,5 @@

+gradio
+pdf2docx
+PyMuPDF
+python-docx