lokesh341 committed on
Commit
e1267a8
·
verified ·
1 Parent(s): e619894

Update PyMuPDF (fitz)

Browse files
Files changed (1) hide show
  1. PyMuPDF (fitz) +46 -20
PyMuPDF (fitz) CHANGED
@@ -1,29 +1,55 @@
1
  import fitz # PyMuPDF
 
 
2
 
3
def extract_and_recreate_pdf(input_path, output_path):
    """Rebuild a PDF from the plain text and embedded images of another.

    For every page of the input, a page of the same width and height is
    created in a new document. The page's plain text is inserted as a single
    block starting at point (10, 10) with font size 12, and every image listed
    for the page is drawn into the same fixed rectangle (50, 50, 300, 300).
    The original layout is therefore not preserved.

    Args:
        input_path: Path of the PDF to read.
        output_path: Path the rebuilt PDF is saved to.
    """
    doc = fitz.open(input_path)
    try:
        new_doc = fitz.open()
        try:
            for page in doc:
                # Extract text
                text = page.get_text("text")

                # Extract images
                images = page.get_images(full=True)
                page_rect = page.rect

                # Create new page with the source page's dimensions
                new_page = new_doc.new_page(
                    width=page_rect.width, height=page_rect.height
                )

                # Add text to the new page
                new_page.insert_text((10, 10), text, fontsize=12)

                # Add images to the new page
                for img in images:
                    xref = img[0]
                    base_image = doc.extract_image(xref)
                    img_bytes = base_image["image"]
                    # Placeholder position: every image uses this rectangle,
                    # so several images on one page are drawn on top of each
                    # other.
                    img_rect = fitz.Rect(50, 50, 300, 300)
                    pix = fitz.Pixmap(img_bytes)
                    new_page.insert_image(img_rect, pixmap=pix)

            new_doc.save(output_path)
        finally:
            # Release the output document even if a page failed to convert.
            new_doc.close()
    finally:
        doc.close()
 
 
 
 
1
  import fitz # PyMuPDF
2
+ from fpdf import FPDF
3
+ import os
4
 
5
+ # Function to extract text and images from the PDF
6
def extract_content_from_pdf(pdf_path, output_dir):
    """Dump each page's text and embedded images from a PDF into a directory.

    For page N (1-based) this writes ``page_N.txt`` containing the page text
    (UTF-8) and, for the M-th image listed for that page (1-based),
    ``page_N_img_M.<ext>`` where ``<ext>`` is the extension reported by the
    image extraction call.

    Args:
        pdf_path: Path of the PDF to read.
        output_dir: Directory that receives the extracted files; it is
            created if it does not exist.
    """
    doc = fitz.open(pdf_path)
    try:
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(output_dir, exist_ok=True)

        for page_num, page in enumerate(doc, start=1):
            # Save text
            text_path = os.path.join(output_dir, f"page_{page_num}.txt")
            with open(text_path, "w", encoding="utf-8") as text_file:
                text_file.write(page.get_text())

            # Extract images
            images = page.get_images(full=True)
            for img_index, img in enumerate(images, start=1):
                xref = img[0]
                base_image = doc.extract_image(xref)
                img_bytes = base_image["image"]
                img_ext = base_image["ext"]
                img_path = os.path.join(
                    output_dir, f"page_{page_num}_img_{img_index}.{img_ext}"
                )
                with open(img_path, "wb") as img_file:
                    img_file.write(img_bytes)
    finally:
        # Close the document even when a page or a file write fails.
        doc.close()
26
+
27
+ # Function to recreate the editable PDF
28
def _natural_sort_key(filename):
    """Return a key that orders ``_``-separated numeric parts numerically.

    ``page_2.txt`` sorts before ``page_10.txt``, and ``page_1.txt`` sorts
    before ``page_1_img_1.png``, which in turn sorts before ``page_2.txt``.
    """
    stem = os.path.splitext(filename)[0]
    # Tag every part so integers and strings are never compared directly.
    parts = [
        (0, int(part), "") if part.isdecimal() else (1, 0, part)
        for part in stem.split("_")
    ]
    # The full name breaks ties between files that differ only by extension.
    return parts, filename


def recreate_pdf(output_dir, output_pdf_path):
    """Build a PDF from the ``.txt`` and image files found under a directory.

    The directory tree is walked with ``os.walk``. Within each directory the
    files are processed in natural (numeric-aware) name order: every ``.txt``
    file starts a new page and is written line by line, and every ``.png``,
    ``.jpg`` or ``.jpeg`` file is placed on the current page below the content
    written so far. Files with other extensions are ignored.

    Args:
        output_dir: Directory containing the extracted text and image files.
        output_pdf_path: Path the generated PDF is written to.
    """
    pdf = FPDF()
    for root, _, files in os.walk(output_dir):
        # Plain sorted() is lexicographic and would put page_10 before page_2,
        # attaching images to the wrong page.
        for file in sorted(files, key=_natural_sort_key):
            file_path = os.path.join(root, file)
            if file.endswith(".txt"):
                # Add text
                pdf.add_page()
                pdf.set_font("Arial", size=12)
                with open(file_path, "r", encoding="utf-8") as text_file:
                    for line in text_file:
                        # The core font only covers Latin-1; replace anything
                        # else instead of failing on the whole document.
                        printable = (
                            line.strip()
                            .encode("latin-1", "replace")
                            .decode("latin-1")
                        )
                        pdf.cell(0, 10, txt=printable, ln=True)
            elif file.endswith((".png", ".jpg", ".jpeg")):
                # Add images
                if pdf.page_no() == 0:
                    # An image cannot be drawn before a page exists.
                    pdf.add_page()
                # Leaving y unset makes the image flow from the current
                # position and advance the cursor, so consecutive images do
                # not overlap.
                pdf.image(file_path, x=10, w=pdf.w - 20)
    pdf.output(output_pdf_path)
46
+
47
+ # Paths
48
# Locations used by the round trip: source PDF, scratch directory for the
# extracted text/images, and the regenerated PDF.
final_pdf_path = "Recreated_Editable.pdf"
output_directory = "output_content"
input_pdf_path = "Editable_Output (1).pdf"

# Step 1: dump text and images from the source PDF into the scratch directory.
extract_content_from_pdf(input_pdf_path, output_directory)

# Step 2: assemble a new PDF from the dumped files and report where it went.
recreate_pdf(output_directory, final_pdf_path)
print(f"Recreated editable PDF saved at: {final_pdf_path}")