PDFtoDOCX / app.py
salman555's picture
Upload 5 files
2147e0b verified
import re
import fitz # pip install PyMuPDF
from pdf2docx import Converter # pip install pdf2docx
from docx import Document # pip install python-docx
import tempfile, os, shutil, zipfile
import gradio as gr
# Regex pattern for URLs: matches "http://" or "https://" followed by a run of
# characters that are not whitespace, angle brackets, quotes, ')' or ']'.
URL_PATTERN = re.compile(r'(https?://[^\s<>"\'\)\]]+)')
def _load_pdf_bytes(pdf_file):
    """Return ``(data, original_name)`` for any supported upload representation.

    Supported inputs are a file-like object exposing ``read()``, a dict with a
    ``'name'`` key holding a filesystem path, or a plain path string.

    Raises:
        ValueError: If ``pdf_file`` is none of the supported kinds.
    """
    if hasattr(pdf_file, 'read'):
        return pdf_file.read(), getattr(pdf_file, 'name', 'output.pdf')
    if isinstance(pdf_file, dict) and 'name' in pdf_file:
        source_path = pdf_file['name']
    elif isinstance(pdf_file, str):
        source_path = pdf_file
    else:
        raise ValueError("Unsupported input type")
    with open(source_path, 'rb') as f:
        return f.read(), source_path


def _collect_links(pdf_path):
    """Return the unique URLs found in the PDF at ``pdf_path``, in first-seen order.

    For every page, hyperlink targets reported by ``page.get_links()`` are
    gathered first, followed by URLs matched by ``URL_PATTERN`` in the page
    text. Trailing punctuation (``.,;:)]``) is stripped before de-duplication.
    """
    # A dict doubles as an insertion-ordered set, giving O(1) duplicate checks.
    seen = {}
    with fitz.open(pdf_path) as pdf_doc:
        for page in pdf_doc:
            # Hyperlinks are not exposed through page.annots(); PyMuPDF reports
            # them via get_links(), where URI links carry a 'uri' entry.
            candidates = [link.get('uri') for link in page.get_links()]
            candidates.extend(URL_PATTERN.findall(page.get_text('text')))
            for candidate in candidates:
                if not candidate:
                    continue
                url = candidate.rstrip('.,;:)]')
                if url:
                    seen.setdefault(url, None)
    return list(seen)


def convert_pdf_to_word(pdf_file) -> str:
    """Convert one PDF to DOCX and return the path of the resulting file.

    Every table in the converted document gets the 'Table Grid' style. If the
    PDF contains links, they are appended on a new page under a
    'Daftar Link' heading.

    Args:
        pdf_file: A file-like object with ``read()``, a dict with a ``'name'``
            path entry, or a path string.

    Returns:
        Path to ``<original base name>.docx`` inside a fresh temporary
        directory. The directory is intentionally kept on success so that
        Gradio can still serve the file.

    Raises:
        ValueError: If ``pdf_file`` is of an unsupported type.
    """
    # Validate and read the input first so that a bad input never leaves an
    # orphaned temporary directory behind.
    data, orig_name = _load_pdf_bytes(pdf_file)
    base_name = os.path.splitext(os.path.basename(orig_name))[0]

    workdir = tempfile.mkdtemp()
    try:
        # Store the PDF under a fixed name inside the private work directory.
        pdf_path = os.path.join(workdir, 'input.pdf')
        with open(pdf_path, 'wb') as f:
            f.write(data)

        # Convert straight to the final file name. Copying from a fixed
        # 'output.docx' would fail with SameFileError for a PDF that is itself
        # named 'output.pdf'.
        docx_path = os.path.join(workdir, f"{base_name}.docx")
        cv = Converter(pdf_path)
        try:
            cv.convert(docx_path, start=0, end=None)
        finally:
            # Release the converter's handle on the PDF even if conversion fails.
            cv.close()

        links = _collect_links(pdf_path)

        # Apply table formatting and the link appendix in one load/save cycle.
        doc = Document(docx_path)
        for table in doc.tables:
            table.style = 'Table Grid'
        if links:
            doc.add_page_break()
            doc.add_heading('Daftar Link', level=2)
            for url in links:
                doc.add_paragraph(url)
        doc.save(docx_path)
        return docx_path
    except Exception:
        # On failure nothing will be served from workdir, so do not leak it.
        shutil.rmtree(workdir, ignore_errors=True)
        raise
def convert_and_enable(pdf_files):
    """Convert the uploaded PDFs and prepare the download button update.

    Args:
        pdf_files: The list of uploads to convert; may be ``None`` or empty
            when Convert is pressed without any upload.

    Returns:
        A ``gr.update()`` whose value is the single ``.docx`` path when one
        file was converted, or the path of a ZIP archive containing all
        ``.docx`` files when several were. With no input, the update clears
        the value and leaves the button disabled.
    """
    # Nothing uploaded: keep the download button in its initial state instead
    # of failing on None iteration or an empty-list index.
    if not pdf_files:
        return gr.update(value=None, interactive=False)

    out_paths = [convert_pdf_to_word(pdf) for pdf in pdf_files]

    # A single result is offered directly as a .docx.
    if len(out_paths) == 1:
        return gr.update(value=out_paths[0], interactive=True)

    # Several results are bundled into one ZIP archive.
    zip_dir = tempfile.mkdtemp()
    zip_path = os.path.join(zip_dir, "converted_docs.zip")
    used_names = set()
    with zipfile.ZipFile(zip_path, "w") as zf:
        for path in out_paths:
            stem, ext = os.path.splitext(os.path.basename(path))
            arcname = f"{stem}{ext}"
            # Uploads sharing a base name would otherwise produce duplicate
            # archive members; add a numeric suffix to keep each one distinct.
            counter = 1
            while arcname in used_names:
                counter += 1
                arcname = f"{stem} ({counter}){ext}"
            used_names.add(arcname)
            zf.write(path, arcname=arcname)
    return gr.update(value=zip_path, interactive=True)
def reset_download(_):
    """Return the update that puts the DownloadButton back in its initial state.

    Used when the upload input changes or is cleared; the argument is ignored.
    """
    cleared_state = gr.update(interactive=False, value=None)
    return cleared_state
# CSS that makes the Convert and Download buttons full-width
css = """
#convert-btn, #download-btn {
width: 100%;
}
"""
with gr.Blocks(title="PDF→Word Converter", css=css) as demo:
    gr.Markdown("# PDF→Word Converter 🎉")
    gr.Markdown("Upload satu atau lebih PDF, lalu tekan Convert untuk mendapatkan DOCX atau ZIP.")

    # Multi-file PDF upload
    pdf_inputs = gr.Files(file_types=['.pdf'], label="Upload PDF(s)")

    # Convert button
    convert_btn = gr.Button("Convert", elem_id="convert-btn", variant="primary")

    # Download button, disabled until a converted file is ready
    download_btn = gr.DownloadButton(
        label="⬇️ Download Output",
        elem_id="download-btn",
        variant="primary",
        interactive=False,
        value=None,
    )

    # Reset the download button whenever the upload changes or is cleared
    pdf_inputs.change(fn=reset_download, inputs=pdf_inputs, outputs=download_btn)

    # Run the conversion on click, then enable the download button
    convert_btn.click(fn=convert_and_enable, inputs=pdf_inputs, outputs=download_btn)

    gr.Markdown("---\nBuilt with ❤️ using Gradio dan PyMuPDF.")


if __name__ == "__main__":
    demo.launch(share=True)