datatoeditable / PyMuPDF
lokesh341's picture
Rename PyMuPDF (fitz) to PyMuPDF
d6d9719 verified
import fitz # PyMuPDF
from reportlab.pdfgen import canvas
from reportlab.lib.pagesizes import letter
import io
def _image_to_png(image_bytes):
    """Return the encoded image in *image_bytes* re-encoded as PNG bytes."""
    pix = fitz.Pixmap(image_bytes)
    # pix.n counts the alpha channel as well, so subtract it before testing
    # for CMYK; a plain `n >= 4` check would also match RGBA images.
    if pix.n - pix.alpha >= 4:
        pix = fitz.Pixmap(fitz.csRGB, pix)
    return pix.tobytes("png")


def _extract_pages(input_pdf):
    """Return one ``(text, [png_bytes, ...])`` tuple per page of *input_pdf*."""
    pages = []
    # The context manager closes the document even if extraction fails.
    with fitz.open(input_pdf) as doc:
        for page in doc:
            text = page.get_text("text")
            # A fresh list per page: sharing one list across pages would put
            # every image of the document on every output page.
            page_images = []
            for img in page.get_images(full=True):
                xref = img[0]
                base_image = doc.extract_image(xref)
                # NOTE(review): some PyMuPDF versions appear to return an empty
                # dict for xrefs that are not images -- skip those defensively.
                if not base_image:
                    continue
                page_images.append(_image_to_png(base_image["image"]))
            pages.append((text, page_images))
    return pages


def extract_and_recreate_pdf(input_pdf: str, output_pdf: str) -> None:
    """Rebuild *input_pdf* as a simple text-plus-images PDF at *output_pdf*.

    For every page of the input, the plain text and the embedded images are
    extracted with PyMuPDF and re-drawn with ReportLab on a letter-sized
    page: a "Page N" label, the text line by line, then each image scaled to
    200x200 points. Content that does not fit continues on a following
    output page, so one input page may produce several output pages.

    Args:
        input_pdf: Path of the PDF to read.
        output_pdf: Path of the PDF to write.
    """
    # Imported here so the block does not depend on a change to the
    # file-level import list.
    from reportlab.lib.utils import ImageReader

    margin = 50
    line_height = 14
    image_size = 200

    pages = _extract_pages(input_pdf)

    c = canvas.Canvas(output_pdf, pagesize=letter)
    _, height = letter

    for page_number, (text, page_images) in enumerate(pages, start=1):
        c.drawString(margin, height - 50, f"Page {page_number}")
        y = height - 100

        # drawString does not break lines, so draw the text one line at a time
        # and continue on a new page when the bottom margin is reached.
        for line in text.splitlines():
            if y < margin:
                c.showPage()
                y = height - margin
            c.drawString(margin, y, line)
            y -= line_height

        for png_bytes in page_images:
            bottom = y - image_size
            if bottom < margin:
                c.showPage()
                bottom = height - margin - image_size
            # Drawing from memory avoids temp files on disk and, unlike a
            # reused file name, cannot be confused with an earlier image.
            c.drawImage(
                ImageReader(io.BytesIO(png_bytes)),
                margin,
                bottom,
                width=image_size,
                height=image_size,
            )
            y = bottom

        c.showPage()  # Finish this input page's output.

    c.save()
# Input and output file paths; replace "input.pdf" with your uploaded PDF file.
input_pdf_path = "input.pdf"
output_pdf_path = "output_editable.pdf"

# Run the conversion only when executed as a script (or in a notebook cell),
# so that importing this module does not read or write any files.
if __name__ == "__main__":
    extract_and_recreate_pdf(input_pdf_path, output_pdf_path)
    print("Editable PDF created successfully.")