Spaces:
Sleeping
Sleeping
File size: 4,403 Bytes
15a335d 035f4fd 15a335d 68c7f3b 15a335d 035f4fd 15a335d 035f4fd 15a335d 035f4fd 15a335d 68c7f3b 15a335d 035f4fd 15a335d 68c7f3b 15a335d 68c7f3b 15a335d 68c7f3b 15a335d 68c7f3b 15a335d 68c7f3b 15a335d 68c7f3b 15a335d 68c7f3b 15a335d 68c7f3b 15a335d 68c7f3b 15a335d 68c7f3b 15a335d 68c7f3b 15a335d 68c7f3b 15a335d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 |
import gradio as gr
import fitz # PyMuPDF
import os
import tempfile
import zipfile
def extract_text(pdf_file):
    """Extract the plain text of every page of a PDF.

    Args:
        pdf_file: Binary file-like object (must support ``seek`` and
            ``read``) whose content is a PDF document.

    Returns:
        A ``(text, None)`` tuple: the text of all pages concatenated in page
        order; the second element is always ``None``.
    """
    # Rewind so the whole document is read even if the handle was used before.
    pdf_file.seek(0)
    # The context manager closes the document even when a page fails to parse.
    with fitz.open(stream=pdf_file.read(), filetype="pdf") as doc:
        # join() avoids re-copying the accumulated text for every page.
        text = "".join(page.get_text() for page in doc)
    return text, None
def extract_images(pdf_file):
    """Extract the images embedded in a PDF and bundle them into a ZIP file.

    Each image is written to a fresh temporary directory as
    ``image_<page>_<index>_<xref>.<ext>`` (page and index are 1-based) and
    then added to ``images.zip`` in that same directory.

    Args:
        pdf_file: Binary file-like object (must support ``seek`` and
            ``read``) whose content is a PDF document.

    Returns:
        A ``(message, zip_path)`` tuple. ``zip_path`` is ``None`` when no
        image could be extracted.
    """
    import shutil  # local import: only needed to discard an unused temp dir

    pdf_file.seek(0)
    output_dir = tempfile.mkdtemp()
    saved_paths = []

    # The context manager closes the document on every exit path.
    with fitz.open(stream=pdf_file.read(), filetype="pdf") as doc:
        for page_number, page in enumerate(doc, start=1):
            images = page.get_images(full=True)
            for image_number, img in enumerate(images, start=1):
                xref = img[0]
                try:
                    base_image = doc.extract_image(xref)
                    image_bytes = base_image["image"]
                    image_ext = base_image["ext"]
                    image_path = os.path.join(
                        output_dir,
                        f"image_{page_number}_{image_number}_{xref}.{image_ext}",
                    )
                    with open(image_path, "wb") as image_file:
                        image_file.write(image_bytes)
                except Exception:
                    # Deliberate best effort: one unreadable image must not
                    # abort the extraction of the remaining ones.
                    continue
                saved_paths.append(image_path)

    if not saved_paths:
        # Nothing to hand back, so do not leave the directory behind.
        shutil.rmtree(output_dir, ignore_errors=True)
        return "No images found in the PDF.", None

    zip_path = os.path.join(output_dir, "images.zip")
    with zipfile.ZipFile(zip_path, "w") as zipf:
        # Archive exactly the files written above so the count in the message
        # always matches the ZIP contents, whatever the image extension is.
        for image_path in saved_paths:
            zipf.write(image_path, os.path.basename(image_path))
    return f"{len(saved_paths)} images extracted.", zip_path
def merge_pdfs(pdf_files):
    """Concatenate several PDFs into a single document.

    Args:
        pdf_files: Iterable of binary file-like objects (each must support
            ``seek`` and ``read``) containing PDF documents. Pages are
            appended in iteration order.

    Returns:
        A ``(message, path)`` tuple where ``path`` points to the merged PDF,
        saved as ``merged.pdf`` inside a fresh temporary directory.
    """
    # The context managers close every document, including on failure.
    with fitz.open() as merged_pdf:
        for pdf_file in pdf_files:
            pdf_file.seek(0)
            with fitz.open(stream=pdf_file.read(), filetype="pdf") as doc:
                merged_pdf.insert_pdf(doc)
        # mkdtemp creates a private directory atomically; tempfile.mktemp is
        # deprecated because the name it returns can be claimed by another
        # process before the file is actually created.
        output_dir = tempfile.mkdtemp()
        merged_path = os.path.join(output_dir, "merged.pdf")
        merged_pdf.save(merged_path)
    return "PDFs merged successfully.", merged_path
def split_pdf(pdf_file):
    """Split a PDF into one single-page PDF per page, bundled into a ZIP.

    Pages are saved as ``page_<n>.pdf`` (1-based) in a fresh temporary
    directory and then added to ``split_pages.zip`` in that same directory.

    Args:
        pdf_file: Binary file-like object (must support ``seek`` and
            ``read``) whose content is a PDF document.

    Returns:
        A ``(message, zip_path)`` tuple where ``zip_path`` points to the
        archive of single-page PDFs.
    """
    pdf_file.seek(0)
    output_dir = tempfile.mkdtemp()
    page_paths = []

    # Both the source and each per-page document are closed deterministically.
    with fitz.open(stream=pdf_file.read(), filetype="pdf") as doc:
        for page_num in range(len(doc)):
            page_path = os.path.join(output_dir, f"page_{page_num+1}.pdf")
            with fitz.open() as new_doc:
                new_doc.insert_pdf(doc, from_page=page_num, to_page=page_num)
                new_doc.save(page_path)
            page_paths.append(page_path)

    zip_path = os.path.join(output_dir, "split_pages.zip")
    with zipfile.ZipFile(zip_path, "w") as zipf:
        # Use the tracked paths so archive entries follow page order rather
        # than the arbitrary order os.listdir would return.
        for page_path in page_paths:
            zipf.write(page_path, os.path.basename(page_path))
    return "PDF split into separate pages.", zip_path
def pdf_tool(task, pdf_input1, pdf_input2):
    """Validate the uploads for ``task`` and run the matching PDF operation.

    Args:
        task: Name of the selected operation ("Extract Text",
            "Extract Images", "Merge PDFs" or "Split PDF").
        pdf_input1: First uploaded PDF; required by every task.
        pdf_input2: Second uploaded PDF; only required for "Merge PDFs".

    Returns:
        The ``(message, file_path_or_None)`` tuple produced by the selected
        operation, or an explanatory message paired with ``None`` when a
        required upload is missing or the task name is unknown.
    """
    # Tasks that only need the first upload share one missing-file check.
    if task in ("Extract Text", "Extract Images", "Split PDF"):
        if not pdf_input1:
            return "Please upload a PDF file.", None
        if task == "Extract Text":
            return extract_text(pdf_input1)
        if task == "Extract Images":
            return extract_images(pdf_input1)
        return split_pdf(pdf_input1)

    if task == "Merge PDFs":
        if not (pdf_input1 and pdf_input2):
            return "Please upload two PDF files to merge.", None
        return merge_pdfs([pdf_input1, pdf_input2])

    return "Invalid task selected.", None
# Gradio UI: a task selector, up to two PDF uploads, a text result and an
# optional downloadable file.
with gr.Blocks() as demo:
    gr.Markdown("## 🛠️ PDF Utility Tool")
    task = gr.Radio(
        choices=["Extract Text", "Extract Images", "Merge PDFs", "Split PDF"],
        label="Select a Task",
        value="Extract Text"
    )
    pdf_input1 = gr.File(label="PDF File 1", file_types=[".pdf"])
    # The second upload is only shown while "Merge PDFs" is selected.
    pdf_input2 = gr.File(label="PDF File 2 (only for Merge)", file_types=[".pdf"], visible=False)
    output_text = gr.Textbox(label="Result / Output", lines=5)
    # Hidden until a task actually produces a file to download.
    output_file = gr.File(label="Download File", visible=False)

    def update_file2_visibility(t):
        """Show the second upload only for the merge task."""
        return gr.update(visible=(t == "Merge PDFs"))

    task.change(update_file2_visibility, inputs=task, outputs=pdf_input2)

    def process(task, pdf_input1, pdf_input2):
        """Run the selected task and reveal the download only if a file exists."""
        result_text, result_file = pdf_tool(task, pdf_input1, pdf_input2)
        return result_text, gr.update(value=result_file, visible=result_file is not None)

    run_button = gr.Button("Run")
    run_button.click(process, inputs=[task, pdf_input1, pdf_input2], outputs=[output_text, output_file])

demo.launch()