lokesh341 committed on
Commit
d6d9719
·
verified ·
1 Parent(s): e1267a8

Rename PyMuPDF (fitz) to PyMuPDF

Browse files
Files changed (2) hide show
  1. PyMuPDF +56 -0
  2. PyMuPDF (fitz) +0 -55
PyMuPDF ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import fitz # PyMuPDF
2
+ from reportlab.pdfgen import canvas
3
+ from reportlab.lib.pagesizes import letter
4
+ import io
5
+
6
def extract_and_recreate_pdf(input_pdf, output_pdf):
    """Rebuild ``input_pdf`` as a new PDF from its extracted text and images.

    Each page of the input is read with PyMuPDF: its plain text and the raw
    bytes of every image it references are collected.  A new letter-sized
    PDF is then written with reportlab, one output page per input page,
    carrying a "Page N" heading, the page text, and the page's images
    stacked downwards at 200x200 points each.

    Args:
        input_pdf: Path of the PDF to read.
        output_pdf: Path of the PDF to create.
    """
    # Local imports keep the block self-contained; the file's top-level
    # import list does not provide these names.
    import os
    import tempfile

    # Pass 1: gather (text, [image bytes, ...]) for every page.
    page_contents = []
    doc = fitz.open(input_pdf)
    try:
        for page in doc:
            text = page.get_text("text")
            # A fresh list per page: sharing one list across pages would make
            # every output page carry every image of the whole document.
            page_images = [
                doc.extract_image(img[0])["image"]  # img[0] is the xref
                for img in page.get_images(full=True)
            ]
            page_contents.append((text, page_images))
    finally:
        # Release the input document even if extraction fails part-way.
        doc.close()

    # Pass 2: write the new PDF.
    c = canvas.Canvas(output_pdf, pagesize=letter)
    _, height = letter

    # Intermediate PNGs live in a private directory that is removed when the
    # block exits, instead of being left behind in the working directory.
    with tempfile.TemporaryDirectory() as tmp_dir:
        for page_num, (text, images) in enumerate(page_contents):
            c.drawString(50, height - 50, f"Page {page_num + 1}")  # Heading

            # drawString renders a single line only; a text object lays the
            # extracted text out line by line.  No wrapping or pagination is
            # attempted, so very long pages still run off the bottom.
            text_obj = c.beginText(50, height - 100)
            text_obj.textLines(text)
            c.drawText(text_obj)

            for idx, image_bytes in enumerate(images):
                pix = fitz.Pixmap(image_bytes)
                if pix.n - pix.alpha >= 4:  # CMYK (alpha excluded): to RGB
                    pix = fitz.Pixmap(fitz.csRGB, pix)
                # One distinct file per (page, image) so that a path is never
                # reused for different image data within the same document.
                png_path = os.path.join(
                    tmp_dir, f"page{page_num}_img{idx}.png"
                )
                pix.save(png_path)
                c.drawImage(
                    png_path,
                    50,
                    height - 300 - (idx * 200),
                    width=200,
                    height=200,
                )

            c.showPage()  # Finish this page and start the next one

        # Save while the temporary PNGs still exist.
        c.save()
51
+
52
# Source PDF to read (replace with your uploaded PDF file) and the
# destination path for the rebuilt document.
input_pdf_path, output_pdf_path = "input.pdf", "output_editable.pdf"
extract_and_recreate_pdf(input_pdf=input_pdf_path, output_pdf=output_pdf_path)
print("Editable PDF created successfully.")
PyMuPDF (fitz) DELETED
@@ -1,55 +0,0 @@
1
- import fitz # PyMuPDF
2
- from fpdf import FPDF
3
- import os
4
-
5
- # Function to extract text and images from the PDF
6
def extract_content_from_pdf(pdf_path, output_dir):
    """Dump the text and images of every page of a PDF into ``output_dir``.

    For page N (1-based) the page text is written to ``page_N.txt`` (UTF-8)
    and its M-th image to ``page_N_img_M.<ext>``, where ``<ext>`` is the
    extension reported by PyMuPDF for that image.

    Args:
        pdf_path: Path of the PDF to read.
        output_dir: Directory to write into; created if it does not exist.
    """
    doc = fitz.open(pdf_path)
    try:
        # exist_ok avoids the check-then-create race of a separate
        # os.path.exists() test.
        os.makedirs(output_dir, exist_ok=True)

        for page_num in range(len(doc)):
            page = doc.load_page(page_num)

            # Save text
            text_path = f"{output_dir}/page_{page_num + 1}.txt"
            with open(text_path, "w", encoding="utf-8") as text_file:
                text_file.write(page.get_text())

            # Extract images
            for img_index, img in enumerate(page.get_images(full=True)):
                base_image = doc.extract_image(img[0])  # img[0] is the xref
                img_path = (
                    f"{output_dir}/page_{page_num + 1}"
                    f"_img_{img_index + 1}.{base_image['ext']}"
                )
                with open(img_path, "wb") as img_file:
                    img_file.write(base_image["image"])
    finally:
        # Close the document even when a write or extraction step raises.
        doc.close()
26
-
27
- # Function to recreate the editable PDF
28
def _natural_sort_key(filename):
    """Return a sort key that compares digit runs in ``filename`` by value."""
    import re

    # re.split with a capturing group alternates text, digits, text, ...
    # so every odd index holds a run of digits.
    chunks = re.split(r"(\d+)", filename)
    return [int(chunk) if i % 2 else chunk for i, chunk in enumerate(chunks)]


def recreate_pdf(output_dir, output_pdf_path):
    """Assemble a PDF from the text and image files found under ``output_dir``.

    Every ``.txt`` file starts a new page and is written line by line in
    12pt Arial; every ``.png``/``.jpg``/``.jpeg`` file is placed on the
    current page at the current vertical position, spanning the page width
    minus a 10-unit margin on each side.

    Files are processed in natural order, so ``page_2.txt`` comes before
    ``page_10.txt`` and ``page_1_img_1.png`` stays right after
    ``page_1.txt``.  Plain lexicographic sorting would put ``page_10.txt``
    ahead of both.

    Args:
        output_dir: Directory tree to scan for page text and image files.
        output_pdf_path: Path of the PDF to write.
    """
    pdf = FPDF()
    for root, _, files in os.walk(output_dir):
        for file in sorted(files, key=_natural_sort_key):
            file_path = os.path.join(root, file)
            if file.endswith(".txt"):
                # Add text
                pdf.add_page()
                pdf.set_font("Arial", size=12)
                with open(file_path, "r", encoding="utf-8") as text_file:
                    for line in text_file:
                        pdf.cell(0, 10, txt=line.strip(), ln=True)
            elif file.endswith((".png", ".jpg", ".jpeg")):
                # Add images; make sure a page is open if an image is the
                # first file encountered.
                if pdf.page_no() == 0:
                    pdf.add_page()
                pdf.image(file_path, x=10, y=pdf.get_y(), w=pdf.w - 20)
    pdf.output(output_pdf_path)
46
-
47
# Input document, scratch directory for the extracted pieces, and the
# destination of the rebuilt PDF.
input_pdf_path, output_directory, final_pdf_path = (
    "Editable_Output (1).pdf",
    "output_content",
    "Recreated_Editable.pdf",
)

# Extract first, then rebuild from the extracted files.
extract_content_from_pdf(pdf_path=input_pdf_path, output_dir=output_directory)
recreate_pdf(output_dir=output_directory, output_pdf_path=final_pdf_path)
print(f"Recreated editable PDF saved at: {final_pdf_path}")