File size: 4,403 Bytes
15a335d
 
 
 
 
 
 
035f4fd
15a335d
 
 
 
68c7f3b
15a335d
 
035f4fd
15a335d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
035f4fd
15a335d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
035f4fd
15a335d
 
 
 
68c7f3b
15a335d
 
035f4fd
15a335d
 
 
 
 
 
 
 
 
 
 
 
 
68c7f3b
15a335d
68c7f3b
15a335d
 
 
68c7f3b
15a335d
 
 
 
68c7f3b
15a335d
 
 
 
68c7f3b
15a335d
 
 
 
68c7f3b
15a335d
 
 
 
 
 
68c7f3b
 
 
 
 
15a335d
68c7f3b
 
15a335d
68c7f3b
 
15a335d
68c7f3b
 
 
 
15a335d
 
 
68c7f3b
15a335d
68c7f3b
15a335d
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
import gradio as gr
import fitz  # PyMuPDF
import os
import tempfile
import zipfile

def extract_text(pdf_file):
    """Extract the plain text of every page of a PDF.

    Args:
        pdf_file: Either a binary file-like object (anything exposing
            ``seek`` and ``read``) or a filesystem path to the PDF. Both are
            accepted because the upload widget may hand over either form
            depending on how it is configured.

    Returns:
        A ``(text, None)`` tuple: the concatenated text of all pages, and
        ``None`` in the slot the other tools use for a downloadable file.
    """
    if hasattr(pdf_file, "read"):
        # File-like upload: rewind first in case it was already consumed.
        pdf_file.seek(0)
        doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
    else:
        doc = fitz.open(os.fspath(pdf_file))

    # The context manager closes the document even if a page fails to parse.
    with doc:
        text = "".join(page.get_text() for page in doc)
    return text, None

def extract_images(pdf_file):
    """Extract every embedded image of a PDF into a zip archive.

    Each image is written to a fresh temporary directory as
    ``image_<page>_<index>_<xref>.<ext>`` (page and index are 1-based) and
    then bundled into ``images.zip`` inside that directory.

    Args:
        pdf_file: Either a binary file-like object (anything exposing
            ``seek`` and ``read``) or a filesystem path to the PDF.

    Returns:
        A ``(message, zip_path)`` tuple. ``zip_path`` is ``None`` when no
        image could be extracted.
    """
    import shutil  # local import: only needed to clean up the empty case

    if hasattr(pdf_file, "read"):
        # File-like upload: rewind first in case it was already consumed.
        pdf_file.seek(0)
        doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
    else:
        doc = fitz.open(os.fspath(pdf_file))

    output_dir = tempfile.mkdtemp()
    saved_names = []
    skipped = 0

    with doc:
        for page_num, page in enumerate(doc, start=1):
            for img_index, img in enumerate(page.get_images(full=True), start=1):
                xref = img[0]
                try:
                    base_image = doc.extract_image(xref)
                    image_ext = base_image["ext"]
                    image_name = f"image_{page_num}_{img_index}_{xref}.{image_ext}"
                    with open(os.path.join(output_dir, image_name), "wb") as image_file:
                        image_file.write(base_image["image"])
                except Exception:
                    # Best effort: one unreadable image must not abort the
                    # whole extraction, but the failure is reported below.
                    skipped += 1
                    continue
                saved_names.append(image_name)

    if not saved_names:
        # Nothing to hand back, so do not leave the temp directory behind.
        shutil.rmtree(output_dir, ignore_errors=True)
        return "No images found in the PDF.", None

    # Zip exactly the files that were written. Filtering the directory by a
    # fixed extension list would drop images saved with any other extension
    # while still counting them in the message.
    zip_path = os.path.join(output_dir, "images.zip")
    with zipfile.ZipFile(zip_path, "w") as zipf:
        for image_name in saved_names:
            zipf.write(os.path.join(output_dir, image_name), image_name)

    message = f"{len(saved_names)} images extracted."
    if skipped:
        message += f" {skipped} image(s) skipped due to extraction errors."
    return message, zip_path

def merge_pdfs(pdf_files):
    """Concatenate several PDFs, in the given order, into one new PDF.

    Args:
        pdf_files: Iterable of PDFs to merge. Each item is either a binary
            file-like object (anything exposing ``seek`` and ``read``) or a
            filesystem path.

    Returns:
        A ``(message, path)`` tuple where ``path`` points at the merged PDF,
        saved as ``merged.pdf`` inside a fresh temporary directory.
    """
    with fitz.open() as merged_pdf:
        for pdf_file in pdf_files:
            if hasattr(pdf_file, "read"):
                # File-like upload: rewind in case it was already consumed.
                pdf_file.seek(0)
                source = fitz.open(stream=pdf_file.read(), filetype="pdf")
            else:
                source = fitz.open(os.fspath(pdf_file))
            with source:
                merged_pdf.insert_pdf(source)

        # mkdtemp creates a private directory atomically; tempfile.mktemp is
        # deprecated because the name it returns can be claimed by another
        # process before the file is written.
        output_path = os.path.join(tempfile.mkdtemp(), "merged.pdf")
        merged_pdf.save(output_path)
    return "PDFs merged successfully.", output_path

def split_pdf(pdf_file):
    """Split a PDF into one single-page PDF per page, bundled as a zip.

    Pages are written to a fresh temporary directory as ``page_<n>.pdf``
    (``n`` is 1-based) and archived into ``split_pages.zip`` in that same
    directory.

    Args:
        pdf_file: Either a binary file-like object (anything exposing
            ``seek`` and ``read``) or a filesystem path to the PDF.

    Returns:
        A ``(message, zip_path)`` tuple.
    """
    if hasattr(pdf_file, "read"):
        # File-like upload: rewind first in case it was already consumed.
        pdf_file.seek(0)
        doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
    else:
        doc = fitz.open(os.fspath(pdf_file))

    output_dir = tempfile.mkdtemp()
    page_names = []

    with doc:
        for page_num in range(len(doc)):
            page_name = f"page_{page_num + 1}.pdf"
            # Close each single-page document as soon as it is saved so a
            # long PDF does not accumulate one open document per page.
            with fitz.open() as single_page:
                single_page.insert_pdf(doc, from_page=page_num, to_page=page_num)
                single_page.save(os.path.join(output_dir, page_name))
            page_names.append(page_name)

    # Archive the recorded names rather than a directory listing so the zip
    # members appear in page order.
    zip_path = os.path.join(output_dir, "split_pages.zip")
    with zipfile.ZipFile(zip_path, "w") as zipf:
        for page_name in page_names:
            zipf.write(os.path.join(output_dir, page_name), page_name)
    return "PDF split into separate pages.", zip_path

def pdf_tool(task, pdf_input1, pdf_input2):
    """Validate the uploads for *task* and run the matching PDF operation.

    Args:
        task: Name of the selected operation ("Extract Text",
            "Extract Images", "Merge PDFs" or "Split PDF").
        pdf_input1: Primary uploaded PDF; required by every task.
        pdf_input2: Second uploaded PDF; only consulted for "Merge PDFs".

    Returns:
        A ``(message, file_path_or_None)`` tuple, either from the selected
        operation or describing why it could not be run.
    """
    # Merging is the only task that needs both uploads, so handle it first.
    if task == "Merge PDFs":
        if not (pdf_input1 and pdf_input2):
            return "Please upload two PDF files to merge.", None
        return merge_pdfs([pdf_input1, pdf_input2])

    # Anything that is not a known single-file task is rejected outright,
    # regardless of what was uploaded.
    if task not in ("Extract Text", "Extract Images", "Split PDF"):
        return "Invalid task selected.", None

    if not pdf_input1:
        return "Please upload a PDF file.", None

    if task == "Extract Text":
        return extract_text(pdf_input1)
    if task == "Extract Images":
        return extract_images(pdf_input1)
    return split_pdf(pdf_input1)

# Build the Gradio interface: a task selector, up to two PDF uploads, a text
# result box, a downloadable result file and a "Run" button.
with gr.Blocks() as demo:
    gr.Markdown("## 🛠️ PDF Utility Tool")
    # The choices must match the task names that pdf_tool() compares against.
    task = gr.Radio(
        choices=["Extract Text", "Extract Images", "Merge PDFs", "Split PDF"], 
        label="Select a Task", 
        value="Extract Text"
    )

    pdf_input1 = gr.File(label="PDF File 1", file_types=[".pdf"])
    # Starts hidden; revealed by update_file2_visibility when merging.
    pdf_input2 = gr.File(label="PDF File 2 (only for Merge)", file_types=[".pdf"], visible=False)

    output_text = gr.Textbox(label="Result / Output", lines=5)
    # Starts hidden; process() shows it only when a result file exists.
    output_file = gr.File(label="Download File", visible=False)

    def update_file2_visibility(t):
        """Show the second upload only while the "Merge PDFs" task is selected."""
        return gr.update(visible=(t == "Merge PDFs"))

    task.change(update_file2_visibility, inputs=task, outputs=pdf_input2)

    def process(task, pdf_input1, pdf_input2):
        """Run the selected task and map its result onto the two output widgets."""
        result_text, result_file = pdf_tool(task, pdf_input1, pdf_input2)
        # Text-only results (result_file is None) keep the download widget hidden.
        return result_text, gr.update(value=result_file, visible=result_file is not None)

    run_button = gr.Button("Run")
    run_button.click(process, inputs=[task, pdf_input1, pdf_input2], outputs=[output_text, output_file])

# NOTE(review): launches unconditionally, so importing this module also starts
# the app — confirm that is intended before reusing these functions elsewhere.
demo.launch()