# Hugging Face Space page header (status shown at capture time: Sleeping)
import logging
import os
import tempfile
import zipfile

import fitz  # PyMuPDF
import gradio as gr
def extract_text(pdf_file):
    """Extract the plain text of every page of an uploaded PDF.

    Args:
        pdf_file: Binary file-like object holding the PDF. It is rewound
            with ``seek(0)`` and read in full.

    Returns:
        A ``(text, None)`` tuple. The second slot is always ``None``
        because this task produces no downloadable file.
    """
    pdf_file.seek(0)
    # Open from memory and close the document deterministically instead of
    # leaving it to the garbage collector.
    with fitz.open(stream=pdf_file.read(), filetype="pdf") as doc:
        # join() avoids repeated string reallocation on large documents.
        text = "".join(page.get_text() for page in doc)
    return text, None
def extract_images(pdf_file):
    """Extract every embedded image of a PDF and bundle them into a zip.

    Each image is written to a fresh temporary directory as
    ``image_<page>_<index>_<xref>.<ext>`` (page and index are 1-based) and
    then added to ``images.zip`` in that same directory.

    Args:
        pdf_file: Binary file-like object holding the PDF. It is rewound
            with ``seek(0)`` and read in full.

    Returns:
        ``("<n> images extracted.", zip_path)`` when at least one image was
        saved, otherwise ``("No images found in the PDF.", None)``.
    """
    pdf_file.seek(0)
    saved_paths = []
    with fitz.open(stream=pdf_file.read(), filetype="pdf") as doc:
        output_dir = tempfile.mkdtemp()
        for page_num, page in enumerate(doc, start=1):
            images = page.get_images(full=True)
            for img_index, img in enumerate(images, start=1):
                xref = img[0]
                try:
                    base_image = doc.extract_image(xref)
                    image_ext = base_image["ext"]
                    image_filename = os.path.join(
                        output_dir,
                        f"image_{page_num}_{img_index}_{xref}.{image_ext}",
                    )
                    with open(image_filename, "wb") as image_file:
                        image_file.write(base_image["image"])
                except Exception:
                    # Best effort: one unreadable image must not abort the
                    # whole extraction, but leave a trace of what was skipped.
                    logging.getLogger(__name__).warning(
                        "Skipping image xref %s on page %s",
                        xref,
                        page_num,
                        exc_info=True,
                    )
                    continue
                saved_paths.append(image_filename)

    if not saved_paths:
        return "No images found in the PDF.", None

    zip_path = os.path.join(output_dir, "images.zip")
    with zipfile.ZipFile(zip_path, "w") as zipf:
        # Zip exactly the files that were written, whatever their extension,
        # so the reported count always matches the archive contents.
        for path in saved_paths:
            zipf.write(path, os.path.basename(path))
    return f"{len(saved_paths)} images extracted.", zip_path
def merge_pdfs(pdf_files):
    """Concatenate several PDFs, in the given order, into one new PDF.

    Args:
        pdf_files: Sequence of binary file-like objects, each holding a
            PDF. Every object is rewound with ``seek(0)`` and read in full.

    Returns:
        ``("PDFs merged successfully.", path)`` where ``path`` is the merged
        file inside a fresh temporary directory, or
        ``("No PDF files provided.", None)`` when ``pdf_files`` is empty.
    """
    if not pdf_files:
        return "No PDF files provided.", None

    with fitz.open() as merged_pdf:
        for pdf_file in pdf_files:
            pdf_file.seek(0)
            with fitz.open(stream=pdf_file.read(), filetype="pdf") as doc:
                merged_pdf.insert_pdf(doc)
        # mkdtemp() creates a private directory atomically; the deprecated
        # mktemp() only returns a name that another process could claim
        # before the file is written.
        temp_path = os.path.join(tempfile.mkdtemp(), "merged.pdf")
        merged_pdf.save(temp_path)
    return "PDFs merged successfully.", temp_path
def split_pdf(pdf_file):
    """Split a PDF into one single-page PDF per page and zip the results.

    Pages are saved as ``page_<n>.pdf`` (1-based) in a fresh temporary
    directory and then added, in page order, to ``split_pages.zip`` in that
    same directory.

    Args:
        pdf_file: Binary file-like object holding the PDF. It is rewound
            with ``seek(0)`` and read in full.

    Returns:
        ``("PDF split into separate pages.", zip_path)``.
    """
    pdf_file.seek(0)
    page_paths = []
    with fitz.open(stream=pdf_file.read(), filetype="pdf") as doc:
        output_dir = tempfile.mkdtemp()
        for page_num in range(len(doc)):
            page_path = os.path.join(output_dir, f"page_{page_num + 1}.pdf")
            # Close each single-page document as soon as it is saved so the
            # number of open documents does not grow with the page count.
            with fitz.open() as new_doc:
                new_doc.insert_pdf(doc, from_page=page_num, to_page=page_num)
                new_doc.save(page_path)
            page_paths.append(page_path)

    zip_path = os.path.join(output_dir, "split_pages.zip")
    with zipfile.ZipFile(zip_path, "w") as zipf:
        # Use the recorded paths rather than os.listdir() so entries appear
        # in page order instead of arbitrary directory order.
        for path in page_paths:
            zipf.write(path, os.path.basename(path))
    return "PDF split into separate pages.", zip_path
def pdf_tool(task, pdf_input1, pdf_input2):
    """Validate the uploads for ``task`` and run the matching PDF operation.

    Args:
        task: One of ``"Extract Text"``, ``"Extract Images"``,
            ``"Merge PDFs"`` or ``"Split PDF"``.
        pdf_input1: First uploaded PDF; required by every task.
        pdf_input2: Second uploaded PDF; only used by ``"Merge PDFs"``.

    Returns:
        A ``(message, file_path_or_None)`` tuple. Validation failures and
        unknown tasks return an explanatory message with ``None``.
    """
    if task == "Merge PDFs":
        if not pdf_input1 or not pdf_input2:
            return "Please upload two PDF files to merge.", None
        return merge_pdfs([pdf_input1, pdf_input2])

    if task not in ("Extract Text", "Extract Images", "Split PDF"):
        return "Invalid task selected.", None

    # All remaining tasks work on a single file, so they share one guard.
    if not pdf_input1:
        return "Please upload a PDF file.", None

    if task == "Extract Text":
        return extract_text(pdf_input1)
    if task == "Extract Images":
        return extract_images(pdf_input1)
    return split_pdf(pdf_input1)
# Gradio UI: pick a task, upload one or two PDFs, and run it.
with gr.Blocks() as demo:
    gr.Markdown("## 🛠️ PDF Utility Tool")
    task = gr.Radio(
        choices=["Extract Text", "Extract Images", "Merge PDFs", "Split PDF"],
        label="Select a Task",
        value="Extract Text",
    )
    pdf_input1 = gr.File(label="PDF File 1", file_types=[".pdf"])
    # The second upload is hidden until the merge task is selected.
    pdf_input2 = gr.File(
        label="PDF File 2 (only for Merge)",
        file_types=[".pdf"],
        visible=False,
    )
    output_text = gr.Textbox(label="Result / Output", lines=5)
    output_file = gr.File(label="Download File", visible=False)

    def update_file2_visibility(t):
        """Show the second upload only while "Merge PDFs" is selected."""
        return gr.update(visible=(t == "Merge PDFs"))

    task.change(update_file2_visibility, inputs=task, outputs=pdf_input2)

    def process(selected_task, first_pdf, second_pdf):
        """Run the selected task and reveal the download only if a file exists.

        The parameters are named differently from the components above so
        that they do not shadow them inside this callback.
        """
        result_text, result_file = pdf_tool(selected_task, first_pdf, second_pdf)
        return result_text, gr.update(
            value=result_file,
            visible=result_file is not None,
        )

    run_button = gr.Button("Run")
    run_button.click(
        process,
        inputs=[task, pdf_input1, pdf_input2],
        outputs=[output_text, output_file],
    )

# Start the web server whenever this module is executed.
demo.launch()