Spaces:
Sleeping
Sleeping
| import fitz # PyMuPDF | |
| from reportlab.pdfgen import canvas | |
| from reportlab.lib.pagesizes import letter | |
| import io | |
def extract_and_recreate_pdf(input_pdf, output_pdf):
    """Rebuild a PDF from the plain text and embedded images of another PDF.

    For every page of ``input_pdf`` the plain text and the embedded images
    are extracted with PyMuPDF.  A new letter-sized PDF is then written to
    ``output_pdf`` with ReportLab: each source page starts on a fresh output
    page with a "Page N" heading, followed by its text (one drawn line per
    extracted line, not re-wrapped) and then its images, each scaled to
    200x200 points.  Content that does not fit continues on an additional
    output page instead of being drawn outside the page area.

    Args:
        input_pdf: Path of the PDF to read.
        output_pdf: Path of the PDF to create.
    """
    # Local import keeps this function self-contained; reportlab is already
    # a dependency of this file.
    from reportlab.lib.utils import ImageReader

    # --- Extraction -------------------------------------------------------
    page_contents = []  # one (text, [image bytes, ...]) tuple per source page
    doc = fitz.open(input_pdf)
    try:
        for page in doc:
            text = page.get_text("text")
            # A fresh list per page: sharing one list across pages would
            # attach every image in the document to every output page.
            page_images = []
            for img in page.get_images(full=True):
                xref = img[0]
                page_images.append(doc.extract_image(xref)["image"])
            page_contents.append((text, page_images))
    finally:
        # Release the input file even if extraction fails part-way.
        doc.close()

    # --- Generation -------------------------------------------------------
    c = canvas.Canvas(output_pdf, pagesize=letter)
    _, page_height = letter
    margin = 50
    line_height = 14
    image_size = 200
    image_gap = 10

    for page_num, (text, images) in enumerate(page_contents, start=1):
        c.drawString(margin, page_height - margin, f"Page {page_num}")
        y = page_height - 100

        # drawString does not interpret newlines, so draw line by line and
        # continue on a new output page when the bottom margin is reached.
        for line in text.splitlines():
            if y < margin:
                c.showPage()
                y = page_height - margin
            c.drawString(margin, y, line)
            y -= line_height

        for image_bytes in images:
            pix = fitz.Pixmap(image_bytes)
            # More than three colour components (alpha excluded) means CMYK,
            # which must be converted to RGB before PNG encoding.
            if pix.n - pix.alpha > 3:
                pix = fitz.Pixmap(fitz.csRGB, pix)
            # Encode in memory: no temp files are left on disk, and the
            # image is identified by its data rather than a reused file name.
            png_stream = io.BytesIO(pix.tobytes("png"))

            if y - image_size < margin:
                c.showPage()
                y = page_height - margin
            y -= image_size
            c.drawImage(
                ImageReader(png_stream),
                margin,
                y,
                width=image_size,
                height=image_size,
            )
            y -= image_gap

        c.showPage()  # finish this source page's output

    c.save()
# File locations: the generated PDF's name, and the source PDF to read
# (replace the latter with your uploaded PDF file).
output_pdf_path = "output_editable.pdf"
input_pdf_path = "input.pdf"

# Run the conversion, then report completion.
extract_and_recreate_pdf(input_pdf=input_pdf_path, output_pdf=output_pdf_path)
print("Editable PDF created successfully.")