"""Gradio front-end for a few PyMuPDF-based PDF utilities.

Supported tasks: extract text, extract embedded images, merge two PDFs and
split a PDF into single-page files. Every task function returns a
``(message, file_path_or_None)`` tuple that the UI maps onto a textbox and a
download component.
"""

import logging
import os
import shutil
import tempfile
import zipfile

import fitz  # PyMuPDF
import gradio as gr

logger = logging.getLogger(__name__)


def _open_pdf(pdf_file):
    """Open an uploaded PDF as a PyMuPDF document.

    Accepts raw bytes, a filesystem path, or a file-like object.

    NOTE(review): depending on the Gradio version and the File component's
    ``type``, the callback presumably receives either a path string or a
    temp-file wrapper whose ``.name`` is the uploaded file's path -- confirm
    against the deployed Gradio version. Both shapes are handled here.
    """
    if isinstance(pdf_file, (bytes, bytearray)):
        data = bytes(pdf_file)
    elif isinstance(pdf_file, (str, os.PathLike)):
        with open(pdf_file, "rb") as handle:
            data = handle.read()
    else:
        data = b""
        if hasattr(pdf_file, "read"):
            if hasattr(pdf_file, "seek"):
                pdf_file.seek(0)
            data = pdf_file.read()
        # Fall back to the path in ``.name`` when the handle itself yields
        # nothing (e.g. a wrapper object that only carries the upload path).
        name = getattr(pdf_file, "name", None)
        if not data and isinstance(name, str) and os.path.isfile(name):
            with open(name, "rb") as handle:
                data = handle.read()
    return fitz.open(stream=data, filetype="pdf")


def extract_text(pdf_file):
    """Return ``(text, None)`` with the text of every page concatenated."""
    with _open_pdf(pdf_file) as doc:
        text = "".join(page.get_text() for page in doc)
    return text, None


def extract_images(pdf_file):
    """Extract embedded images into a zip archive.

    Returns ``(message, zip_path)``, or ``(message, None)`` when the PDF
    contains no extractable image. Images that fail to extract are skipped
    (best effort) and logged.
    """
    output_dir = tempfile.mkdtemp()
    saved_paths = []

    with _open_pdf(pdf_file) as doc:
        for page_num, page in enumerate(doc, start=1):
            images = page.get_images(full=True)
            for img_index, img in enumerate(images, start=1):
                xref = img[0]
                try:
                    base_image = doc.extract_image(xref)
                    image_bytes = base_image["image"]
                    image_ext = base_image["ext"]
                    image_path = os.path.join(
                        output_dir,
                        f"image_{page_num}_{img_index}_{xref}.{image_ext}",
                    )
                    with open(image_path, "wb") as image_file:
                        image_file.write(image_bytes)
                except Exception:
                    # Deliberate best effort: one bad image must not abort
                    # the whole extraction, but leave a trace of it.
                    logger.warning(
                        "Skipping image xref=%s on page %s",
                        xref,
                        page_num,
                        exc_info=True,
                    )
                    continue
                saved_paths.append(image_path)

    if not saved_paths:
        # Nothing to offer for download; do not leave the directory behind.
        shutil.rmtree(output_dir, ignore_errors=True)
        return "No images found in the PDF.", None

    # Zip exactly the files that were written. Filtering by extension would
    # drop formats such as "jpx" or "tiff" and make the reported count
    # disagree with the archive contents.
    zip_path = os.path.join(output_dir, "images.zip")
    with zipfile.ZipFile(zip_path, "w") as zipf:
        for image_path in saved_paths:
            zipf.write(image_path, os.path.basename(image_path))

    return f"{len(saved_paths)} images extracted.", zip_path


def merge_pdfs(pdf_files):
    """Concatenate *pdf_files* in order; return ``(message, merged_path)``."""
    with fitz.open() as merged_pdf:
        for pdf_file in pdf_files:
            with _open_pdf(pdf_file) as doc:
                merged_pdf.insert_pdf(doc)

        # mkstemp creates the file atomically; the deprecated mktemp only
        # returns a name that another process could claim first.
        fd, temp_path = tempfile.mkstemp(suffix=".pdf")
        os.close(fd)
        merged_pdf.save(temp_path)

    return "PDFs merged successfully.", temp_path


def split_pdf(pdf_file):
    """Write each page as its own PDF and zip them.

    Returns ``(message, zip_path)``. Pages are added to the archive in page
    order as ``page_<n>.pdf``.
    """
    output_dir = tempfile.mkdtemp()
    page_paths = []

    with _open_pdf(pdf_file) as doc:
        for page_num in range(len(doc)):
            page_path = os.path.join(output_dir, f"page_{page_num+1}.pdf")
            with fitz.open() as single_page:
                single_page.insert_pdf(
                    doc, from_page=page_num, to_page=page_num
                )
                single_page.save(page_path)
            page_paths.append(page_path)

    zip_path = os.path.join(output_dir, "split_pages.zip")
    with zipfile.ZipFile(zip_path, "w") as zipf:
        for page_path in page_paths:
            zipf.write(page_path, os.path.basename(page_path))

    return "PDF split into separate pages.", zip_path


def pdf_tool(task, pdf_input1, pdf_input2):
    """Validate the inputs for *task* and dispatch to the matching helper.

    Returns ``(message, file_path_or_None)``. Only "Merge PDFs" uses
    *pdf_input2*.
    """
    if task == "Extract Text":
        if not pdf_input1:
            return "Please upload a PDF file.", None
        return extract_text(pdf_input1)
    elif task == "Extract Images":
        if not pdf_input1:
            return "Please upload a PDF file.", None
        return extract_images(pdf_input1)
    elif task == "Merge PDFs":
        if not pdf_input1 or not pdf_input2:
            return "Please upload two PDF files to merge.", None
        return merge_pdfs([pdf_input1, pdf_input2])
    elif task == "Split PDF":
        if not pdf_input1:
            return "Please upload a PDF file.", None
        return split_pdf(pdf_input1)
    else:
        return "Invalid task selected.", None


with gr.Blocks() as demo:
    gr.Markdown("## 🛠️ PDF Utility Tool")

    task = gr.Radio(
        choices=["Extract Text", "Extract Images", "Merge PDFs", "Split PDF"],
        label="Select a Task",
        value="Extract Text",
    )

    pdf_input1 = gr.File(label="PDF File 1", file_types=[".pdf"])
    # The second upload slot is only shown while "Merge PDFs" is selected.
    pdf_input2 = gr.File(
        label="PDF File 2 (only for Merge)",
        file_types=[".pdf"],
        visible=False,
    )

    output_text = gr.Textbox(label="Result / Output", lines=5)
    output_file = gr.File(label="Download File", visible=False)

    def update_file2_visibility(t):
        """Show the second upload slot only for the merge task."""
        return gr.update(visible=(t == "Merge PDFs"))

    task.change(update_file2_visibility, inputs=task, outputs=pdf_input2)

    def process(task, pdf_input1, pdf_input2):
        """Run the selected task; reveal the download only when a file exists."""
        result_text, result_file = pdf_tool(task, pdf_input1, pdf_input2)
        return result_text, gr.update(
            value=result_file, visible=result_file is not None
        )

    run_button = gr.Button("Run")
    run_button.click(
        process,
        inputs=[task, pdf_input1, pdf_input2],
        outputs=[output_text, output_file],
    )

demo.launch()