File size: 5,029 Bytes
2147e0b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
import re
import fitz                         # pip install PyMuPDF
from pdf2docx import Converter     # pip install pdf2docx
from docx import Document          # pip install python-docx
import tempfile, os, shutil, zipfile
import gradio as gr

# Matches http:// and https:// URLs in plain text. A match runs until the
# first whitespace character or any of: < > " ' ) ]
URL_PATTERN = re.compile(r'(https?://[^\s<>"\'\)\]]+)')

def convert_pdf_to_word(pdf_file) -> str:
    """Convert one PDF to DOCX and return the path of the resulting .docx.

    Every table in the converted document gets the 'Table Grid' style. If the
    PDF contains URLs (hyperlinks or URLs found in the page text), they are
    appended, de-duplicated and in order of first appearance, on a new page
    under the heading 'Daftar Link'.

    Args:
        pdf_file: One of
            * an object with a ``read()`` method (its ``name`` attribute, if
              present, supplies the original file name),
            * a dict with a ``'name'`` key holding a file path,
            * a string file path.

    Returns:
        Path to ``<original base name>.docx`` inside a fresh temporary
        directory. On success that directory is deliberately left in place so
        the caller (Gradio) can still serve the file.

    Raises:
        ValueError: If ``pdf_file`` is none of the supported input types.
        OSError: If the input file cannot be read.
    """
    # 1) Read the PDF bytes and the original file name. This happens before
    #    any temporary directory exists, so bad input leaves nothing behind.
    if hasattr(pdf_file, 'read'):
        data = pdf_file.read()
        orig_name = getattr(pdf_file, 'name', 'output.pdf')
    elif isinstance(pdf_file, dict) and 'name' in pdf_file:
        orig_name = pdf_file['name']
        with open(orig_name, 'rb') as f:
            data = f.read()
    elif isinstance(pdf_file, str):
        orig_name = pdf_file
        with open(orig_name, 'rb') as f:
            data = f.read()
    else:
        raise ValueError("Unsupported input type")
    base_name = os.path.splitext(os.path.basename(orig_name))[0] or 'output'

    workdir = tempfile.mkdtemp()
    try:
        # 2) Store the PDF under a fixed name inside the working directory.
        pdf_path = os.path.join(workdir, 'input.pdf')
        with open(pdf_path, 'wb') as f:
            f.write(data)

        # 3) Convert straight to the final file name. Writing to a fixed
        #    'output.docx' and copying afterwards fails with SameFileError
        #    whenever the upload itself is called 'output'.
        final_path = os.path.join(workdir, f"{base_name}.docx")
        cv = Converter(pdf_path)
        try:
            cv.convert(final_path, start=0, end=None)
        finally:
            cv.close()

        # 4) Collect URLs page by page: hyperlinks first, then URLs that
        #    appear in the page text. URI hyperlinks are exposed through
        #    Page.get_links(); the annotation info dict has no 'uri' entry.
        links = []
        seen = set()
        pdf_doc = fitz.open(pdf_path)
        try:
            for page in pdf_doc:
                candidates = [link.get('uri') for link in page.get_links()]
                candidates.extend(URL_PATTERN.findall(page.get_text('text')))
                for raw in candidates:
                    if not raw:
                        continue
                    # Drop punctuation that trails a URL inside a sentence.
                    url = raw.rstrip('.,;:)]')
                    if url and url not in seen:
                        seen.add(url)
                        links.append(url)
        finally:
            pdf_doc.close()

        # 5) Post-process the DOCX in a single load/save cycle.
        doc = Document(final_path)
        for table in doc.tables:
            table.style = 'Table Grid'
        if links:
            doc.add_page_break()
            doc.add_heading('Daftar Link', level=2)
            for url in links:
                doc.add_paragraph(url)
        doc.save(final_path)
        return final_path
    except BaseException:
        # Nothing usable was produced, so do not leave the directory behind.
        shutil.rmtree(workdir, ignore_errors=True)
        raise

def convert_and_enable(pdf_files):
    """Convert the uploaded PDFs and return an update for the download button.

    Args:
        pdf_files: The uploaded PDFs as a list. A single path, dict or file
            object is also accepted and treated as a one-element list.
            ``None`` or an empty list means nothing was uploaded.

    Returns:
        A ``gr.update()`` whose value is the path of the single converted
        .docx, or of a ``converted_docs.zip`` holding all of them when more
        than one PDF was given; the button is made interactive. When nothing
        was uploaded the button is returned to its disabled, empty state.
    """
    if not pdf_files:
        # Convert was pressed without any upload: keep the button disabled
        # rather than failing on None / an empty list.
        return gr.update(value=None, interactive=False)
    if isinstance(pdf_files, (str, dict)) or hasattr(pdf_files, 'read'):
        # A bare path would otherwise be iterated character by character.
        pdf_files = [pdf_files]

    out_paths = [convert_pdf_to_word(pdf) for pdf in pdf_files]

    # A single result is offered directly as .docx.
    if len(out_paths) == 1:
        return gr.update(value=out_paths[0], interactive=True)

    # Several results are bundled into one ZIP archive.
    zip_dir = tempfile.mkdtemp()
    zip_path = os.path.join(zip_dir, "converted_docs.zip")
    used_names = set()
    with zipfile.ZipFile(zip_path, "w") as zf:
        for path in out_paths:
            arcname = os.path.basename(path)
            stem, ext = os.path.splitext(arcname)
            # Uploads from different folders may share a file name; give
            # later ones a numeric suffix so no archive entry is duplicated.
            counter = 1
            while arcname in used_names:
                counter += 1
                arcname = f"{stem} ({counter}){ext}"
            used_names.add(arcname)
            zf.write(path, arcname=arcname)
    return gr.update(value=zip_path, interactive=True)

def reset_download(_):
    """Put the DownloadButton back into its initial state.

    Used when the upload input changes or is cleared; the incoming value is
    ignored. The returned update clears the button's file and disables it.
    """
    initial_state = {"value": None, "interactive": False}
    return gr.update(**initial_state)

# Stylesheet: stretch the Convert and Download buttons to full width.
css = """
#convert-btn, #download-btn {
    width: 100%;
}
"""

with gr.Blocks(css=css, title="PDF→Word Converter") as demo:
    # Page header and short usage hint.
    gr.Markdown("# PDF→Word Converter 🎉")
    gr.Markdown("Upload satu atau lebih PDF, lalu tekan Convert untuk mendapatkan DOCX atau ZIP.")

    # Upload area accepting several PDF files at once.
    pdf_inputs = gr.Files(label="Upload PDF(s)", file_types=['.pdf'])

    # Button that starts the conversion.
    convert_btn = gr.Button("Convert", variant="primary", elem_id="convert-btn")

    # Download button; stays disabled until a converted file is ready.
    download_btn = gr.DownloadButton(
        label="⬇️ Download Output", value=None, interactive=False,
        variant="primary", elem_id="download-btn",
    )

    # Any change to the upload (including clearing it) resets the download button.
    pdf_inputs.change(fn=reset_download, inputs=pdf_inputs, outputs=download_btn)

    # Clicking Convert runs the conversion and then enables the download button.
    convert_btn.click(fn=convert_and_enable, inputs=pdf_inputs, outputs=download_btn)

    gr.Markdown("---\nBuilt with ❤️ using Gradio dan PyMuPDF.")

# Start the app only when this file is executed as a script, not when it is
# imported. share=True is passed through to Gradio's launch().
if __name__ == "__main__":
    demo.launch(share=True)