Spaces:

lokesh341
/

datatoeditable

Sleeping

App Files Files Community

lokesh341 committed on Jan 7, 2025

Commit

d6d9719

verified ·

1 Parent(s): e1267a8

Rename PyMuPDF (fitz) to PyMuPDF

Browse files

Files changed (2) hide show

PyMuPDF +56 -0
PyMuPDF (fitz) +0 -55

PyMuPDF ADDED Viewed

	@@ -0,0 +1,56 @@

+import fitz  # PyMuPDF
+from reportlab.pdfgen import canvas
+from reportlab.lib.pagesizes import letter
+import io
+def extract_and_recreate_pdf(input_pdf, output_pdf):
+    # Open the input PDF
+    doc = fitz.open(input_pdf)
+    # List to store extracted images
+    extracted_images = []
+    output_streams = []
+    # Extract text and images from each page
+    for page_num in range(len(doc)):
+        page = doc[page_num]
+        text = page.get_text("text")  # Extract text
+        images = page.get_images(full=True)
+        # Extract images
+        for img_index, img in enumerate(images):
+            xref = img[0]
+            base_image = doc.extract_image(xref)
+            image_bytes = base_image["image"]
+            extracted_images.append(io.BytesIO(image_bytes))
+        # Store text and image streams for later
+        output_streams.append((text, extracted_images))
+    # Generate the new PDF
+    c = canvas.Canvas(output_pdf, pagesize=letter)
+    width, height = letter
+    # Add extracted content to the new PDF
+    for page_num, (text, images) in enumerate(output_streams):
+        c.drawString(50, height - 50, f"Page {page_num + 1}")  # Page number
+        c.drawString(50, height - 100, text)  # Add text
+        # Add images
+        for idx, img_stream in enumerate(images):
+            img_stream.seek(0)
+            img = fitz.Pixmap(img_stream)
+            if img.n >= 4:  # Convert CMYK to RGB
+                img = fitz.Pixmap(fitz.csRGB, img)
+            img.save(f"temp_image_{idx}.png")  # Save as temp PNG
+            c.drawImage(f"temp_image_{idx}.png", 50, height - 300 - (idx * 200), width=200, height=200)
+        c.showPage()  # Add new page
+    c.save()
+# Specify input and output file paths
+input_pdf_path = "input.pdf"  # Replace with your uploaded PDF file
+output_pdf_path = "output_editable.pdf"
+extract_and_recreate_pdf(input_pdf_path, output_pdf_path)
+print("Editable PDF created successfully.")

PyMuPDF (fitz) DELETED Viewed

@@ -1,55 +0,0 @@
-import fitz  # PyMuPDF
-from fpdf import FPDF
-import os
-# Function to extract text and images from the PDF
-def extract_content_from_pdf(pdf_path, output_dir):
-    doc = fitz.open(pdf_path)
-    if not os.path.exists(output_dir):
-        os.makedirs(output_dir)
-    for page_num in range(len(doc)):
-        page = doc.load_page(page_num)
-        # Save text
-        with open(f"{output_dir}/page_{page_num + 1}.txt", "w", encoding="utf-8") as text_file:
-            text_file.write(page.get_text())
-        # Extract images
-        images = page.get_images(full=True)
-        for img_index, img in enumerate(images):
-            xref = img[0]
-            base_image = doc.extract_image(xref)
-            img_bytes = base_image["image"]
-            img_ext = base_image["ext"]
-            with open(f"{output_dir}/page_{page_num + 1}_img_{img_index + 1}.{img_ext}", "wb") as img_file:
-                img_file.write(img_bytes)
-    doc.close()
-# Function to recreate the editable PDF
-def recreate_pdf(output_dir, output_pdf_path):
-    pdf = FPDF()
-    for root, _, files in os.walk(output_dir):
-        for file in sorted(files):
-            if file.endswith(".txt"):
-                # Add text
-                page_path = os.path.join(root, file)
-                pdf.add_page()
-                with open(page_path, "r", encoding="utf-8") as text_file:
-                    pdf.set_font("Arial", size=12)
-                    lines = text_file.readlines()
-                    for line in lines:
-                        pdf.cell(0, 10, txt=line.strip(), ln=True)
-            elif file.endswith((".png", ".jpg", ".jpeg")):
-                # Add images
-                img_path = os.path.join(root, file)
-                pdf.image(img_path, x=10, y=pdf.get_y(), w=pdf.w - 20)
-    pdf.output(output_pdf_path)
-# Paths
-input_pdf_path = "Editable_Output (1).pdf"
-output_directory = "output_content"
-final_pdf_path = "Recreated_Editable.pdf"
-# Process
-extract_content_from_pdf(input_pdf_path, output_directory)
-recreate_pdf(output_directory, final_pdf_path)
-print(f"Recreated editable PDF saved at: {final_pdf_path}")