salman555 committed on
Commit
2147e0b
·
verified ·
1 Parent(s): cac96e4

Upload 5 files

Browse files
Files changed (5) hide show
  1. .gitignore +27 -0
  2. LICENSE +21 -0
  3. README.md +88 -0
  4. app.py +160 -0
  5. requirements.txt +5 -0
.gitignore ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python cache
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # Virtual environments
7
+ .env/
8
+ .venv/
9
+
10
+ # Distribution / packaging
11
+ build/
12
+ dist/
13
+ *.egg-info/
14
+
15
+ # IDE/editor
16
+ .vscode/
17
+ .idea/
18
+
19
+ # OS files
20
+ .DS_Store
21
+ Thumbs.db
22
+
23
+ # Generated docs / outputs
24
+ *.zip
25
+ *.docx
26
+ *.pdf
27
+
LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Salman Alfarisi
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
20
+ SOFTWARE.
21
+
README.md ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: PDFtoDOCX
3
+ emoji: 👀
4
+ colorFrom: pink
5
+ colorTo: pink
6
+ sdk: gradio
7
+ sdk_version: 5.33.0
8
+ app_file: app.py
9
+ pinned: false
10
+ license: mit
11
+ ---
12
+
13
+ # PDF→Word Converter
14
+
15
+ A simple Gradio‐based web app to convert one or more PDF files into DOCX,
16
+ format all tables as **Table Grid**, extract any URLs and append them at the end,
17
+ and—if you upload multiple PDFs—bundle all outputs into a ZIP.
18
+
19
+ ---
20
+
21
+ ## 🚀 Features
22
+
23
+ - **Batch or single** PDF → DOCX conversion
24
+ - Automatically style all tables as **Table Grid**
25
+ - Extracts both annotation and inline URLs into a “Daftar Link” section
26
+ - If multiple PDFs are uploaded, outputs are packaged into a ZIP
27
+
28
+ ---
29
+
30
+ ## 📦 Installation
31
+
32
+ 1. **Clone this repo**
33
+ ```bash
34
+ git clone https://github.com/salmanalfarisi11/pdf-to-docx.git
35
+ cd pdf-to-docx
36
+ ```
37
+
38
+ 2. Create and activate a virtual environment:
39
+
40
+ ```bash
41
+ python -m venv .venv
42
+ source .venv/bin/activate # Linux/macOS
43
+ .venv\Scripts\activate # Windows
44
+ ```
45
+
46
+ 3. Install dependencies:
47
+
48
+ ```bash
49
+ pip install -r requirements.txt
50
+ ```
51
+
52
+ ## 🚀 Running Locally
53
+
54
+ Launch the app on your machine:
55
+ ```bash
56
+ python app.py
57
+ ```
58
+ By default, it will start on http://127.0.0.1:7860/. Open that URL in your browser to access the interface.
59
+
60
+ ## 🎯 Usage
61
+
62
+ 1. Open the localhost URL shown in your terminal.
63
+ 2. Drag & drop one or more PDF files into the Upload PDF(s) panel.
64
+ 3. Click Convert.
65
+ 4. When ready, click ⬇️ Download Output to download either a single DOCX or a ZIP of all DOCXs.
66
+
67
+ ## 🛠️ Dependencies
68
+
69
+ - Python 3.8+
70
+ - Gradio ≥ 5.33.0
71
+ - PyMuPDF
72
+ - pdf2docx
73
+ - python-docx
74
+
75
+
76
+ ## 📄 License
77
+
78
+ This project is licensed under the [MIT License](LICENSE).
79
+
80
+ ---
81
+
82
+
83
+ ## 🖋️ Author & Credits
84
+
85
+ Developed by **[Salman Alfarisi](https://github.com/salmanalfarisi11)** © 2025
86
+ - GitHub: [salmanalfarisi11](https://github.com/salmanalfarisi11)
87
+ - LinkedIn: [salmanalfarisi11](https://linkedin.com/in/salmanalfarisi11)
88
+ - Instagram: [faris.salman111](https://instagram.com/faris.salman111)
app.py ADDED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import fitz # pip install PyMuPDF
3
+ from pdf2docx import Converter # pip install pdf2docx
4
+ from docx import Document # pip install python-docx
5
+ import tempfile, os, shutil, zipfile
6
+ import gradio as gr
7
+
8
# Matches http(s) URLs in plain text; stops at whitespace, quotes, angle
# brackets and closing parentheses/brackets.
URL_PATTERN = re.compile(r'(https?://[^\s<>"\'\)\]]+)')


def _read_pdf_input(pdf_file):
    """Return ``(raw_bytes, original_name)`` for any supported input kind.

    Supported inputs are a file-like object (anything with ``read``), a dict
    carrying the path under ``'name'``, or a filesystem path.

    Raises:
        ValueError: if ``pdf_file`` is none of the supported kinds.
        OSError: if the referenced path cannot be opened.
    """
    if hasattr(pdf_file, 'read'):
        data = pdf_file.read()
        # Nameless streams fall back to a generic name.
        return data, getattr(pdf_file, 'name', None) or 'output.pdf'

    if isinstance(pdf_file, dict) and 'name' in pdf_file:
        path = pdf_file['name']
    elif isinstance(pdf_file, (str, os.PathLike)):
        path = os.fspath(pdf_file)
    else:
        raise ValueError("Unsupported input type")

    with open(path, 'rb') as f:
        return f.read(), path


def _extract_links(pdf_path):
    """Return the unique URLs found in a PDF, in first-seen order.

    For each page, URIs of clickable link annotations come first, followed by
    URLs matched by ``URL_PATTERN`` in the page text. Trailing punctuation
    that commonly sticks to a URL in running text is stripped.
    """
    # A dict keeps insertion order and gives O(1) de-duplication.
    seen = {}
    pdf_doc = fitz.open(pdf_path)
    try:
        for page in pdf_doc:
            # Link URIs are exposed through get_links(); page.annots() does
            # not carry them, so reading annot.info['uri'] finds nothing.
            candidates = [link.get('uri') for link in page.get_links()]
            candidates.extend(URL_PATTERN.findall(page.get_text('text')))
            for raw in candidates:
                if not raw:
                    continue
                url = raw.rstrip('.,;:)]')
                if url:
                    seen.setdefault(url, None)
    finally:
        # Release the file handle even if a page fails to parse.
        pdf_doc.close()
    return list(seen)


def convert_pdf_to_word(pdf_file) -> str:
    """Convert one PDF to DOCX and return the path of the resulting file.

    Every table in the converted document is styled as 'Table Grid'. If the
    PDF contains URLs (link annotations or plain text), they are appended on
    a new page under a 'Daftar Link' heading.

    Args:
        pdf_file: a file-like object, a dict with the path under ``'name'``,
            or a filesystem path.

    Returns:
        Path to ``<original base name>.docx`` inside a fresh temporary
        directory. The directory is intentionally left in place on success
        so that Gradio can still serve the file.

    Raises:
        ValueError: if ``pdf_file`` is of an unsupported type.
        OSError: if the input path cannot be read.
    """
    # Validate and read the input before creating any temporary state.
    data, orig_name = _read_pdf_input(pdf_file)
    base_name = os.path.splitext(os.path.basename(orig_name))[0] or 'output'

    workdir = tempfile.mkdtemp()
    try:
        # Work on a private copy so the conversion never touches the upload.
        pdf_path = os.path.join(workdir, 'input.pdf')
        with open(pdf_path, 'wb') as f:
            f.write(data)

        # Convert straight to the final name. Copying from a fixed
        # 'output.docx' onto '<base_name>.docx' raised SameFileError whenever
        # the upload itself was called 'output.pdf'.
        docx_path = os.path.join(workdir, f"{base_name}.docx")
        cv = Converter(pdf_path)
        try:
            cv.convert(docx_path, start=0, end=None)
        finally:
            cv.close()

        links = _extract_links(pdf_path)

        # Load the DOCX once for both table styling and the link appendix.
        doc = Document(docx_path)
        for table in doc.tables:
            table.style = 'Table Grid'
        if links:
            doc.add_page_break()
            doc.add_heading('Daftar Link', level=2)
            for url in links:
                doc.add_paragraph(url)
        doc.save(docx_path)
        return docx_path
    except Exception:
        # Nothing will ever be served from a failed conversion: clean up.
        shutil.rmtree(workdir, ignore_errors=True)
        raise
85
+
86
def _unique_arcname(filename, used):
    """Return ``filename``, suffixed with ' (n)' if already present in ``used``.

    ``used`` is a set of case-folded names that is updated in place.
    """
    stem, ext = os.path.splitext(filename)
    candidate = filename
    counter = 1
    while candidate.casefold() in used:
        counter += 1
        candidate = f"{stem} ({counter}){ext}"
    used.add(candidate.casefold())
    return candidate


def convert_and_enable(pdf_files):
    """Convert the uploaded PDFs and return an update for the download button.

    Args:
        pdf_files: the uploaded PDFs as a list (a single path or dict is also
            accepted), or ``None``/empty when nothing is uploaded.

    Returns:
        ``gr.update(...)`` that points the download button at the single
        ``.docx`` (one input) or at ``converted_docs.zip`` (several inputs)
        and enables it. With no input, the button is reset to disabled.
    """
    # Convert can be clicked before anything is uploaded; iterating None or
    # indexing an empty list would otherwise crash the handler.
    if not pdf_files:
        return gr.update(value=None, interactive=False)
    if isinstance(pdf_files, (str, dict)):
        # A lone path must not be iterated character by character.
        pdf_files = [pdf_files]

    out_paths = [convert_pdf_to_word(pdf) for pdf in pdf_files]

    # A single result is offered directly as a .docx.
    if len(out_paths) == 1:
        return gr.update(value=out_paths[0], interactive=True)

    # Several results are bundled into one ZIP archive.
    zip_dir = tempfile.mkdtemp()
    zip_path = os.path.join(zip_dir, "converted_docs.zip")
    used_names = set()
    with zipfile.ZipFile(zip_path, "w") as zf:
        for path in out_paths:
            # Uploads from different folders may share a file name; keep
            # every document by giving later duplicates a numbered name.
            arcname = _unique_arcname(os.path.basename(path), used_names)
            zf.write(path, arcname=arcname)
    return gr.update(value=zip_path, interactive=True)
103
+
104
def reset_download(_):
    """Put the download button back into its initial, disabled state.

    Wired to the upload component's change event; the event payload is
    ignored because the reset does not depend on it.
    """
    initial_state = {"value": None, "interactive": False}
    return gr.update(**initial_state)
109
+
110
# Stretch both action buttons across the full width of the layout.
css = """
#convert-btn, #download-btn {
    width: 100%;
}
"""

with gr.Blocks(css=css, title="PDF→Word Converter") as demo:
    gr.Markdown("# PDF→Word Converter 🎉")
    gr.Markdown("Upload satu atau lebih PDF, lalu tekan Convert untuk mendapatkan DOCX atau ZIP.")

    # Multi-file upload restricted to PDFs.
    pdf_inputs = gr.Files(
        label="Upload PDF(s)",
        file_types=['.pdf'],
    )

    # Starts the conversion.
    convert_btn = gr.Button(
        "Convert",
        variant="primary",
        elem_id="convert-btn",
    )

    # Download button; stays disabled until a converted file is ready.
    download_btn = gr.DownloadButton(
        label="⬇️ Download Output",
        value=None,
        interactive=False,
        variant="primary",
        elem_id="download-btn",
    )

    # Any change to the upload (including clearing it) invalidates the
    # previous result, so the download button is reset.
    pdf_inputs.change(
        fn=reset_download,
        inputs=pdf_inputs,
        outputs=download_btn,
    )

    # Convert on click, then enable the download button with the result.
    convert_btn.click(
        fn=convert_and_enable,
        inputs=pdf_inputs,
        outputs=download_btn,
    )

    gr.Markdown("---\nBuilt with ❤️ using Gradio dan PyMuPDF.")

if __name__ == "__main__":
    # Do not force a public share tunnel: the app accepts file uploads, and
    # share links are not supported on Hugging Face Spaces. Sharing remains
    # available on demand by setting the GRADIO_SHARE=True environment
    # variable, which launch() honours.
    demo.launch()
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ gradio
2
+ pdf2docx
3
+ PyMuPDF
4
+ python-docx
5
+