Spaces:
Sleeping
Sleeping
Rename PyMuPDF (fitz) to PyMuPDF
Browse files- PyMuPDF +56 -0
- PyMuPDF (fitz) +0 -55
PyMuPDF
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import fitz # PyMuPDF
|
| 2 |
+
from reportlab.pdfgen import canvas
|
| 3 |
+
from reportlab.lib.pagesizes import letter
|
| 4 |
+
import io
|
| 5 |
+
|
| 6 |
+
def extract_and_recreate_pdf(input_pdf, output_pdf):
    """Rebuild ``input_pdf`` as a new PDF holding its extracted text and images.

    For every source page the plain text and the embedded images are
    extracted with PyMuPDF; the content is then laid out top-to-bottom on
    US-letter pages with ReportLab. A source page whose content does not
    fit on one output page continues on following output pages.

    Args:
        input_pdf: Path of the PDF to read.
        output_pdf: Path of the PDF to write.
    """
    # Imported here so this function is self-contained; ImageReader lets
    # ReportLab draw an image from memory instead of from a temp file.
    from reportlab.lib.utils import ImageReader

    margin = 50
    line_height = 14
    image_size = 200
    image_gap = 10

    # One (text, [raw image bytes, ...]) entry per source page.
    page_contents = []
    # The context manager closes the document even if extraction fails.
    with fitz.open(input_pdf) as doc:
        for page in doc:
            text = page.get_text("text")
            # A fresh list per page: sharing a single list across pages
            # would attach every image in the document to every page.
            page_images = [
                doc.extract_image(img[0])["image"]
                for img in page.get_images(full=True)
            ]
            page_contents.append((text, page_images))

    c = canvas.Canvas(output_pdf, pagesize=letter)
    _, height = letter

    for page_num, (text, images) in enumerate(page_contents):
        c.drawString(margin, height - 50, f"Page {page_num + 1}")
        y = height - 100

        # drawString renders exactly one line, so the page text is emitted
        # line by line, breaking to a new output page at the bottom margin.
        # NOTE: lines wider than the page are not wrapped.
        for line in text.splitlines():
            if y < margin:
                c.showPage()
                y = height - margin
            c.drawString(margin, y, line)
            y -= line_height

        for image_bytes in images:
            pix = fitz.Pixmap(image_bytes)
            # n counts the alpha channel too; only convert when there are
            # four or more *colour* components (e.g. CMYK).
            if pix.n - pix.alpha >= 4:
                pix = fitz.Pixmap(fitz.csRGB, pix)
            # Keep the PNG in memory: a reused temp file name makes
            # ReportLab treat different images as the same one, and the
            # temp files were never cleaned up.
            png_stream = io.BytesIO(pix.tobytes("png"))

            if y - image_size < margin:
                c.showPage()
                y = height - margin
            c.drawImage(
                ImageReader(png_stream),
                margin,
                y - image_size,
                width=image_size,
                height=image_size,
            )
            y -= image_size + image_gap

        c.showPage()  # Finish the output page(s) for this source page

    c.save()
|
| 51 |
+
|
| 52 |
+
# Script driver: convert the uploaded PDF and report completion.
# Replace "input.pdf" with the path of your uploaded PDF file.
input_pdf_path, output_pdf_path = "input.pdf", "output_editable.pdf"

extract_and_recreate_pdf(input_pdf_path, output_pdf_path)
print("Editable PDF created successfully.")
|
PyMuPDF (fitz)
DELETED
|
@@ -1,55 +0,0 @@
|
|
| 1 |
-
import fitz # PyMuPDF
|
| 2 |
-
from fpdf import FPDF
|
| 3 |
-
import os
|
| 4 |
-
|
| 5 |
-
# Function to extract text and images from the PDF
|
| 6 |
-
def extract_content_from_pdf(pdf_path, output_dir):
    """Dump the text and embedded images of every page of a PDF to disk.

    Files are written into ``output_dir`` (created if missing) as
    ``page_<n>.txt`` for the page text and ``page_<n>_img_<k>.<ext>`` for
    each image, with ``n`` and ``k`` starting at 1.

    Args:
        pdf_path: Path of the PDF to read.
        output_dir: Directory that receives the extracted files.
    """
    # The context manager closes the document even when extraction or a
    # file write raises part-way through.
    with fitz.open(pdf_path) as doc:
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(output_dir, exist_ok=True)

        for page_num, page in enumerate(doc, start=1):
            text_path = os.path.join(output_dir, f"page_{page_num}.txt")
            with open(text_path, "w", encoding="utf-8") as text_file:
                text_file.write(page.get_text())

            images = page.get_images(full=True)
            for img_index, img in enumerate(images, start=1):
                xref = img[0]  # first item of the image entry is its xref
                base_image = doc.extract_image(xref)
                img_ext = base_image["ext"]
                img_path = os.path.join(
                    output_dir, f"page_{page_num}_img_{img_index}.{img_ext}"
                )
                with open(img_path, "wb") as img_file:
                    img_file.write(base_image["image"])
|
| 26 |
-
|
| 27 |
-
# Function to recreate the editable PDF
|
| 28 |
-
def _natural_sort_key(name):
    """Return a sort key that orders digit runs in ``name`` numerically."""
    import re

    # re.split with a capture group alternates text, digits, text, ... so
    # keys always compare str with str and int with int position by position.
    return [int(part) if part.isdigit() else part for part in re.split(r"(\d+)", name)]


def recreate_pdf(output_dir, output_pdf_path):
    """Build a PDF from the ``.txt`` and image files found under ``output_dir``.

    Each ``.txt`` file starts a new page and is written one line per cell;
    each ``.png``/``.jpg``/``.jpeg`` file is placed on the current page at
    the current vertical position, scaled to the page width minus margins.

    Args:
        output_dir: Directory tree to scan for extracted content.
        output_pdf_path: Path of the PDF to write.
    """
    pdf = FPDF()
    for root, _, files in os.walk(output_dir):
        # Natural ordering keeps page_2 before page_10 and keeps a page's
        # images directly after its text; plain lexicographic sorting puts
        # page_10.txt between page_1.txt and page_1_img_1.png.
        for file in sorted(files, key=_natural_sort_key):
            file_path = os.path.join(root, file)
            if file.endswith(".txt"):
                pdf.add_page()
                pdf.set_font("Arial", size=12)
                with open(file_path, "r", encoding="utf-8") as text_file:
                    for line in text_file:
                        pdf.cell(0, 10, txt=line.strip(), ln=True)
            elif file.endswith((".png", ".jpg", ".jpeg")):
                # An image can only be placed on an open page.
                if pdf.page_no() == 0:
                    pdf.add_page()
                # NOTE(review): the y cursor is not advanced here, so several
                # images on one page may overlap — confirm against the FPDF
                # version in use.
                pdf.image(file_path, x=10, y=pdf.get_y(), w=pdf.w - 20)
    pdf.output(output_pdf_path)
|
| 46 |
-
|
| 47 |
-
# Locations of the source PDF, the scratch directory and the result.
input_pdf_path, output_directory, final_pdf_path = (
    "Editable_Output (1).pdf",
    "output_content",
    "Recreated_Editable.pdf",
)

# Run both stages: dump the content to disk, then rebuild the PDF from it.
extract_content_from_pdf(input_pdf_path, output_directory)
recreate_pdf(output_directory, final_pdf_path)
print(f"Recreated editable PDF saved at: {final_pdf_path}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|