Spaces:

lokesh341
/

datatoeditable

Sleeping

App Files Files Community

lokesh341 committed on Jan 7, 2025

Commit

e1267a8

verified ·

1 Parent(s): e619894

Update PyMuPDF (fitz)

Browse files

Files changed (1) hide show

PyMuPDF (fitz) +46 -20

PyMuPDF (fitz) CHANGED Viewed

@@ -1,29 +1,55 @@
 import fitz  # PyMuPDF
def extract_and_recreate_pdf(input_path, output_path):
    """Copy the text and images of every page of a PDF into a new PDF.

    For each page of ``input_path`` a page of the same size is created in
    a fresh document; the page's plain text is inserted at (10, 10) in
    12pt and every image reported by ``page.get_images(full=True)`` is
    inserted into the same fixed example rectangle.  The result is saved
    to ``output_path``.

    Args:
        input_path: Path of the PDF to read.
        output_path: Path the recreated PDF is written to.
    """
    # All images share this placeholder rectangle (the original marks it
    # as an "example position"), so several images on one page overlap.
    img_rect = fitz.Rect(50, 50, 300, 300)

    doc = fitz.open(input_path)
    try:
        new_doc = fitz.open()
        try:
            for page in doc:
                # Extract text
                text = page.get_text("text")
                # Extract images
                images = page.get_images(full=True)
                page_rect = page.rect
                # Create new page
                new_page = new_doc.new_page(
                    width=page_rect.width, height=page_rect.height
                )
                # Add text to the new page
                # NOTE(review): the insertion point is presumably the text
                # origin, so y=10 with fontsize 12 may clip the first line
                # at the top edge -- confirm against PyMuPDF docs.
                new_page.insert_text((10, 10), text, fontsize=12)
                # Add images to the new page
                for img in images:
                    xref = img[0]
                    base_image = doc.extract_image(xref)
                    img_bytes = base_image["image"]
                    pix = fitz.Pixmap(img_bytes)
                    new_page.insert_image(img_rect, pixmap=pix)
            new_doc.save(output_path)
        finally:
            # Release the output document even if a page or save failed.
            new_doc.close()
    finally:
        # Release the input document on every exit path.
        doc.close()

import os
import re

import fitz  # PyMuPDF
from fpdf import FPDF
# Function to extract text and images from the PDF
def extract_content_from_pdf(pdf_path, output_dir):
    """Dump each page's text and embedded images of a PDF into a directory.

    For page N (1-based) this writes ``page_N.txt`` (UTF-8) containing
    ``page.get_text()`` and, for the M-th image (1-based) reported by
    ``page.get_images(full=True)``, ``page_N_img_M.<ext>`` where ``<ext>``
    is the extension reported by ``doc.extract_image``.

    Args:
        pdf_path: Path of the PDF to read.
        output_dir: Directory to write into; created if it is missing.
    """
    doc = fitz.open(pdf_path)
    try:
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(output_dir, exist_ok=True)

        for page_num, page in enumerate(doc, start=1):
            # Save text
            text_path = os.path.join(output_dir, f"page_{page_num}.txt")
            with open(text_path, "w", encoding="utf-8") as text_file:
                text_file.write(page.get_text())

            # Extract images
            images = page.get_images(full=True)
            for img_index, img in enumerate(images, start=1):
                xref = img[0]  # first field of the image entry is its xref
                base_image = doc.extract_image(xref)
                img_bytes = base_image["image"]
                img_ext = base_image["ext"]
                img_path = os.path.join(
                    output_dir, f"page_{page_num}_img_{img_index}.{img_ext}"
                )
                with open(img_path, "wb") as img_file:
                    img_file.write(img_bytes)
    finally:
        # Close the document even if a write or extraction step raised.
        doc.close()
# Function to recreate the editable PDF
_IMAGE_EXTENSIONS = (".png", ".jpg", ".jpeg")


def _natural_sort_key(filename):
    """Return a key that orders digit runs numerically (page_2 before page_10)."""
    parts = re.split(r"(\d+)", filename)
    # With a single capturing group, re.split alternates text (even index)
    # and digit runs (odd index), so keys always compare str-to-str and
    # int-to-int position by position.
    return [int(part) if index % 2 else part for index, part in enumerate(parts)]


def _latin1_safe(text):
    """Replace characters outside Latin-1, which FPDF's core fonts cannot encode."""
    return text.encode("latin-1", "replace").decode("latin-1")


def recreate_pdf(output_dir, output_pdf_path):
    """Build a PDF from the text and image files found under ``output_dir``.

    Files are visited per directory (``os.walk``) in natural order, so
    ``page_2.txt`` precedes ``page_10.txt`` and each ``page_N_img_M.*``
    follows its own ``page_N.txt``.  Every ``.txt`` file starts a new page
    and is written line by line in 12pt Arial; every ``.png``/``.jpg``/
    ``.jpeg`` file is placed below the current content at x=10, scaled to
    the page width minus 20.  Files with any other extension are ignored.

    Args:
        output_dir: Directory tree holding the extracted content.
        output_pdf_path: Path the generated PDF is written to.
    """
    pdf = FPDF()
    for root, _, files in os.walk(output_dir):
        # Plain lexicographic sorting would yield page_1, page_10, ...,
        # page_1_img_1, page_2 and attach images to the wrong page.
        for file in sorted(files, key=_natural_sort_key):
            file_path = os.path.join(root, file)
            lowered = file.lower()
            if lowered.endswith(".txt"):
                # Add text
                pdf.add_page()
                pdf.set_font("Arial", size=12)
                with open(file_path, "r", encoding="utf-8") as text_file:
                    for line in text_file:
                        # Core fonts are Latin-1 only; substitute anything
                        # else instead of aborting the whole document.
                        pdf.cell(0, 10, txt=_latin1_safe(line.strip()), ln=True)
            elif lowered.endswith(_IMAGE_EXTENSIONS):
                # Add images
                if pdf.page_no() == 0:
                    # An image cannot be drawn before a page is open.
                    pdf.add_page()
                # Leaving y unset lets FPDF place the image at the cursor,
                # break the page if needed, and advance the cursor so
                # consecutive images do not overlap.
                pdf.image(file_path, x=10, w=pdf.w - 20)
    pdf.output(output_pdf_path)
# Paths
input_pdf_path = "Editable_Output (1).pdf"
output_directory = "output_content"
final_pdf_path = "Recreated_Editable.pdf"

# Process
# Guarded so that importing this module does not trigger the conversion;
# running it as a script behaves exactly as before.
if __name__ == "__main__":
    # NOTE(review): output_directory is not cleared first, so files left by
    # an earlier run would presumably be picked up by recreate_pdf -- confirm.
    extract_content_from_pdf(input_pdf_path, output_directory)
    recreate_pdf(output_directory, final_pdf_path)
    print(f"Recreated editable PDF saved at: {final_pdf_path}")