|
|
import re |
|
|
import fitz |
|
|
from pdf2docx import Converter |
|
|
from docx import Document |
|
|
import tempfile, os, shutil, zipfile |
|
|
import gradio as gr |
|
|
|
|
|
|
|
|
# Matches a plain-text http:// or https:// URL. The match stops at the first
# whitespace, angle bracket, quote, closing parenthesis or closing square
# bracket; other trailing punctuation (e.g. a comma) is part of the match.
URL_PATTERN = re.compile(
    r'(https?://[^\s<>"\'\)\]]+)'
)
|
|
|
|
|
def _load_pdf_input(pdf_file):
    """Return ``(data, name)`` for one uploaded PDF.

    Accepts a file-like object (anything with ``read``), a dict carrying the
    path under ``'name'``, or a filesystem path. Raises ``ValueError`` for any
    other input type.
    """
    if hasattr(pdf_file, 'read'):
        name = getattr(pdf_file, 'name', None)
        # Nameless streams (or ones whose ``name`` is a file descriptor) get
        # the same default name the function has always used.
        if not isinstance(name, str):
            name = 'output.pdf'
        return pdf_file.read(), name

    if isinstance(pdf_file, dict) and 'name' in pdf_file:
        path = pdf_file['name']
    elif isinstance(pdf_file, (str, os.PathLike)):
        path = os.fsdecode(pdf_file)
    else:
        raise ValueError("Unsupported input type")

    with open(path, 'rb') as f:
        return f.read(), path


def _collect_links(pdf_path):
    """Return the unique URLs found in the PDF, in first-seen order.

    Per page, hyperlink targets are gathered first, then URLs that appear in
    the page text. Trailing ``.,;:)]`` characters are stripped before
    de-duplication.
    """
    links = []
    seen = set()  # O(1) membership test; ``links`` keeps the order

    def remember(raw):
        url = raw.rstrip('.,;:)]')
        if url and url not in seen:
            seen.add(url)
            links.append(url)

    pdf_doc = fitz.open(pdf_path)
    try:
        for page in pdf_doc:
            # PyMuPDF exposes hyperlinks through get_links(); Annot.info has
            # no 'uri' entry, so looking there never finds a link target.
            for link in page.get_links():
                uri = link.get('uri')
                if uri:
                    remember(uri)
            for match in URL_PATTERN.findall(page.get_text('text')):
                remember(match)
    finally:
        pdf_doc.close()
    return links


def convert_pdf_to_word(pdf_file) -> str:
    """Convert one PDF to DOCX and return the path of the ``.docx`` file.

    Every table in the converted document is given the 'Table Grid' style.
    If the PDF contains any links, they are appended on a new page under the
    heading 'Daftar Link'.

    Args:
        pdf_file: A file-like object, a dict with the path under ``'name'``,
            or a filesystem path.

    Returns:
        Path to ``<original base name>.docx`` inside a fresh temporary
        directory. The caller owns that directory.

    Raises:
        ValueError: If ``pdf_file`` is of an unsupported type.
    """
    data, orig_name = _load_pdf_input(pdf_file)
    base_name = os.path.splitext(os.path.basename(orig_name))[0] or 'output'

    workdir = tempfile.mkdtemp()
    try:
        # Work on a private copy so the converter always gets a real path.
        pdf_path = os.path.join(workdir, 'input.pdf')
        with open(pdf_path, 'wb') as f:
            f.write(data)

        temp_docx = os.path.join(workdir, 'output.docx')
        cv = Converter(pdf_path)
        try:
            cv.convert(temp_docx, start=0, end=None)
        finally:
            cv.close()

        links = _collect_links(pdf_path)

        # Open the converted document once and apply all post-processing.
        doc = Document(temp_docx)
        for table in doc.tables:
            table.style = 'Table Grid'

        if links:
            doc.add_page_break()
            doc.add_heading('Daftar Link', level=2)
            for u in links:
                doc.add_paragraph(u)

        # Saving straight to the final name (instead of copying afterwards)
        # avoids shutil.SameFileError when the upload is called 'output.pdf'.
        final_path = os.path.join(workdir, f"{base_name}.docx")
        doc.save(final_path)
        return final_path
    except BaseException:
        # On failure nothing is handed back to the caller, so do not leave
        # the work directory behind.
        shutil.rmtree(workdir, ignore_errors=True)
        raise
|
|
|
|
|
def convert_and_enable(pdf_files):
    """Convert the uploaded PDFs and enable the download button.

    Args:
        pdf_files: List of uploads, each in a form accepted by
            ``convert_pdf_to_word``. A single bare upload is also accepted.

    Returns:
        A ``gr.update()`` whose value is the single ``.docx`` path, or the
        path of ``converted_docs.zip`` when more than one file was converted.
        With no upload at all, the update keeps the button empty and
        disabled.
    """
    # Nothing uploaded (None or []): keep the button disabled instead of
    # failing on iteration or on out_paths[0].
    if not pdf_files:
        return gr.update(value=None, interactive=False)

    # A bare path or dict must not be iterated element by element.
    if isinstance(pdf_files, (str, dict)) or hasattr(pdf_files, 'read'):
        pdf_files = [pdf_files]

    out_paths = [convert_pdf_to_word(pdf) for pdf in pdf_files]

    if len(out_paths) == 1:
        return gr.update(value=out_paths[0], interactive=True)

    zip_dir = tempfile.mkdtemp()
    zip_path = os.path.join(zip_dir, "converted_docs.zip")
    used_names = set()
    with zipfile.ZipFile(zip_path, "w") as zf:
        for p in out_paths:
            # Uploads from different folders may share a base name; give
            # each archive member a distinct name so none is shadowed.
            arcname = os.path.basename(p)
            stem, ext = os.path.splitext(arcname)
            counter = 1
            while arcname in used_names:
                counter += 1
                arcname = f"{stem} ({counter}){ext}"
            used_names.add(arcname)
            zf.write(p, arcname=arcname)

    return gr.update(value=zip_path, interactive=True)
|
|
|
|
|
def reset_download(_):
    """Put the DownloadButton back into its initial state.

    Meant to run when the input changes or is cleared; the argument is
    ignored. Returns a ``gr.update()`` with no value and the button disabled.
    """
    initial_state = gr.update(value=None, interactive=False)
    return initial_state
|
|
|
|
|
|
|
|
# Custom stylesheet passed to gr.Blocks: both action buttons span the full
# width of their container.
css = """
#convert-btn, #download-btn {
    width: 100%;
}
"""
|
|
|
|
|
with gr.Blocks(css=css, title="PDF→Word Converter") as demo:
    # Page header and short usage hint.
    gr.Markdown("# PDF→Word Converter 🎉")
    gr.Markdown("Upload satu atau lebih PDF, lalu tekan Convert untuk mendapatkan DOCX atau ZIP.")

    # Upload widget restricted to .pdf files.
    pdf_inputs = gr.Files(label="Upload PDF(s)", file_types=['.pdf'])

    # Button that triggers the conversion.
    convert_btn = gr.Button("Convert", variant="primary", elem_id="convert-btn")

    # Download button starts empty and disabled until a conversion finishes.
    download_btn = gr.DownloadButton(
        label="⬇️ Download Output",
        value=None,
        interactive=False,
        variant="primary",
        elem_id="download-btn",
    )

    # Any change to the upload list resets the download button.
    pdf_inputs.change(fn=reset_download, inputs=pdf_inputs, outputs=download_btn)

    # Convert the uploads and hand the result to the download button.
    convert_btn.click(fn=convert_and_enable, inputs=pdf_inputs, outputs=download_btn)

    gr.Markdown("---\nBuilt with ❤️ using Gradio dan PyMuPDF.")
|
|
|
|
|
# Launch the app only when this file is run as a script, not on import.
if __name__ == "__main__":
    # share=True is forwarded to Gradio's launch(); per the Gradio docs it
    # requests a public share link in addition to the local server.
    demo.launch(share=True)
|
|
|