Spaces:
Sleeping
Sleeping
Update PyMuPDF (fitz)
Browse files- PyMuPDF (fitz) +46 -20
PyMuPDF (fitz)
CHANGED
|
@@ -1,29 +1,55 @@
|
|
| 1 |
import fitz # PyMuPDF
|
|
|
|
|
|
|
| 2 |
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
|
|
|
|
|
|
| 6 |
|
| 7 |
-
for
|
| 8 |
-
|
| 9 |
-
|
|
|
|
|
|
|
| 10 |
# Extract images
|
| 11 |
images = page.get_images(full=True)
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
# Create new page
|
| 15 |
-
new_page = new_doc.new_page(width=page_rect.width, height=page_rect.height)
|
| 16 |
-
|
| 17 |
-
# Add text to the new page
|
| 18 |
-
new_page.insert_text((10, 10), text, fontsize=12)
|
| 19 |
-
|
| 20 |
-
# Add images to the new page
|
| 21 |
-
for img in images:
|
| 22 |
xref = img[0]
|
| 23 |
base_image = doc.extract_image(xref)
|
| 24 |
img_bytes = base_image["image"]
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
|
| 29 |
-
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import fitz # PyMuPDF
|
| 2 |
+
from fpdf import FPDF
|
| 3 |
+
import os
|
| 4 |
|
| 5 |
+
# Function to extract text and images from the PDF
|
| 6 |
+
def extract_content_from_pdf(pdf_path, output_dir):
    """Dump each page's text and embedded images from a PDF into a directory.

    For page N (1-based) the text is written to ``page_N.txt`` (UTF-8) and
    the M-th image listed for that page to ``page_N_img_M.<ext>``, where
    ``<ext>`` is the extension PyMuPDF reports for the extracted image.

    Args:
        pdf_path: Path of the PDF file to read.
        output_dir: Directory receiving the extracted files; created if it
            does not exist yet.
    """
    doc = fitz.open(pdf_path)
    try:
        # exist_ok avoids the check-then-create race of os.path.exists().
        os.makedirs(output_dir, exist_ok=True)

        for page_num, page in enumerate(doc, start=1):
            # Save text
            text_path = os.path.join(output_dir, f"page_{page_num}.txt")
            with open(text_path, "w", encoding="utf-8") as text_file:
                text_file.write(page.get_text())

            # Extract images
            images = page.get_images(full=True)
            for img_num, img in enumerate(images, start=1):
                xref = img[0]
                base_image = doc.extract_image(xref)
                img_ext = base_image["ext"]
                img_path = os.path.join(
                    output_dir, f"page_{page_num}_img_{img_num}.{img_ext}"
                )
                with open(img_path, "wb") as img_file:
                    img_file.write(base_image["image"])
    finally:
        # Release the document even when extraction of a page or image fails.
        doc.close()
|
| 26 |
+
|
| 27 |
+
# Function to recreate the editable PDF
|
| 28 |
+
def _page_file_sort_key(filename):
    """Return a sort key that orders ``page_N[_img_M]`` names numerically."""
    stem = os.path.splitext(filename)[0]
    parts = [
        (0, int(part), "") if part.isdecimal() else (1, 0, part)
        for part in stem.split("_")
    ]
    # The raw name breaks ties between files that share a stem.
    return parts, filename


def recreate_pdf(output_dir, output_pdf_path):
    """Build a PDF from the ``.txt`` and image files found under a directory.

    Every ``.txt`` file starts a new page and is written line by line in
    12pt Arial. Every ``.png``/``.jpg``/``.jpeg`` file is placed on the
    current page below what has been written so far, scaled to the page
    width minus a 10-unit margin on each side.

    Args:
        output_dir: Directory tree holding the extracted text/image files.
        output_pdf_path: Destination path of the generated PDF.
    """
    pdf = FPDF()
    for root, _, files in os.walk(output_dir):
        # Plain sorted() is lexicographic and would put "page_10.txt" before
        # "page_2.txt" (and page 1's images after page 10); sort numerically.
        for file in sorted(files, key=_page_file_sort_key):
            file_path = os.path.join(root, file)
            if file.endswith(".txt"):
                # Add text
                pdf.add_page()
                pdf.set_font("Arial", size=12)
                with open(file_path, "r", encoding="utf-8") as text_file:
                    for line in text_file:
                        # The core Arial font only covers Latin-1; replace
                        # anything else with "?" instead of letting FPDF
                        # raise on characters such as bullets or curly quotes.
                        printable = (
                            line.strip()
                            .encode("latin-1", "replace")
                            .decode("latin-1")
                        )
                        pdf.cell(0, 10, txt=printable, ln=True)
            elif file.endswith((".png", ".jpg", ".jpeg")):
                # Add images
                if pdf.page_no() == 0:
                    # An image sorted ahead of any text file still needs a page.
                    pdf.add_page()
                # Leaving y unset places the image at the current cursor and
                # moves the cursor below it, so consecutive images no longer
                # overlap each other.
                pdf.image(file_path, x=10, w=pdf.w - 20)
    pdf.output(output_pdf_path)
|
| 46 |
+
|
| 47 |
+
# Locations used by the extract-then-rebuild round trip.
final_pdf_path = "Recreated_Editable.pdf"
output_directory = "output_content"
input_pdf_path = "Editable_Output (1).pdf"

# Dump the source PDF's content to disk, then assemble the new PDF from it.
extract_content_from_pdf(pdf_path=input_pdf_path, output_dir=output_directory)
recreate_pdf(output_dir=output_directory, output_pdf_path=final_pdf_path)
print(f"Recreated editable PDF saved at: {final_pdf_path}")
|