datasciencesage committed on
Commit
5b2476c
·
verified ·
1 Parent(s): 10acd92

Create step1_get_images.py

Browse files
Files changed (1) hide show
  1. step1_get_images.py +139 -0
step1_get_images.py ADDED
@@ -0,0 +1,139 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import subprocess
3
+ from pathlib import Path
4
+ from pdf2image import convert_from_path
5
+ from tqdm import tqdm
6
+ from PIL import Image
7
+
8
+
9
+ # Create directories
10
+
11
+
12
+
13
def docx_to_pdf(docx_path, output_pdf_path, temp_pdf_dir, timeout=60):
    """Convert a .docx file to PDF by running headless LibreOffice (``soffice``).

    Args:
        docx_path: Path of the source document.
        output_pdf_path: Path where the PDF is expected to appear once the
            conversion is done. It is only checked (and cleared beforehand);
            it is not passed to ``soffice``.
        temp_pdf_dir: Directory handed to ``soffice --outdir``.
        timeout: Seconds to wait for ``soffice`` before giving up.

    Returns:
        True when ``soffice`` exits with code 0 and ``output_pdf_path`` exists
        afterwards, False otherwise. Errors are printed, not raised.
    """
    command = [
        "soffice",
        "--headless",
        "--convert-to",
        "pdf",
        "--outdir",
        str(temp_pdf_dir),
        str(docx_path),
    ]
    try:
        # Success is judged by the presence of the output file, so a leftover
        # from an earlier run must not be mistaken for fresh output.
        if os.path.exists(output_pdf_path):
            os.remove(output_pdf_path)

        result = subprocess.run(
            command,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            timeout=timeout,
        )

        if result.returncode == 0 and os.path.exists(output_pdf_path):
            print(f"✅ Converted to PDF: {output_pdf_path}")
            return True

        # stderr can be empty even though no PDF was produced; fall back to
        # stdout or the exit code so the message is never blank.
        detail = (
            result.stderr
            or result.stdout
            or f"exit code {result.returncode}, no PDF at {output_pdf_path}"
        )
        print(f"❌ Error converting {docx_path}: {detail}")
        return False
    except FileNotFoundError:
        print("❌ Error: 'soffice' not found. Ensure LibreOffice is installed.")
        return False
    except subprocess.TimeoutExpired:
        print(f"❌ Error converting {docx_path}: timed out after {timeout}s")
        return False
    except Exception as e:
        # Best-effort boundary: one bad document must not abort a batch run.
        print(f"❌ Error converting {docx_path}: {str(e)}")
        return False
43
+
44
def pdf_to_images(pdf_path, output_base_path, dpi=300, thread_count=4):
    """Render every page of a PDF to a PNG file next to ``output_base_path``.

    Page ``n`` (1-based) is written to ``<output_base_path>_page<n>.png`` in
    the parent directory of ``output_base_path``.

    Args:
        pdf_path: PDF file to render.
        output_base_path: Extension-less base path (``str`` or ``Path``) that
            the per-page file names are derived from.
        dpi: Resolution passed to ``convert_from_path``.
        thread_count: Thread count passed to ``convert_from_path``.

    Returns:
        Number of pages saved. 0 when the PDF yields no pages or when any
        error occurs; errors are printed, not raised.
    """
    try:
        output_base_path = Path(output_base_path)

        images = convert_from_path(
            pdf_path,
            dpi=dpi,
            fmt='png',
            thread_count=thread_count,
        )

        if not images:
            print(f"⚠️ No pages found in {pdf_path}")
            return 0

        saved_count = 0
        for page_num, image in enumerate(tqdm(images, desc="Converting pages"), 1):
            width, height = image.size
            if width <= 0 or height <= 0:
                print(f"⚠️ Skipping page {page_num}: Invalid dimensions ({width}x{height})")
                continue

            # Size sanity check only: pages under 50px on a side are skipped.
            # The pixel content is not inspected.
            if width < 50 or height < 50:
                print(f"⚠️ Skipping page {page_num}: Too small ({width}x{height})")
                continue

            # Use `.name`, not `.stem`: the base path has no extension, so
            # `.stem` would cut a dotted name such as "report.v2" down to
            # "report" and make different documents overwrite each other.
            output_image_path = output_base_path.with_name(
                f"{output_base_path.name}_page{page_num}.png"
            )
            image.save(output_image_path, "PNG", optimize=True)
            saved_count += 1

        print(f"✅ Saved {saved_count}/{len(images)} pages")
        return saved_count

    except Exception as e:
        # Best-effort boundary: a broken PDF is reported and counted as 0 pages.
        print(f"❌ Error processing {pdf_path}: {str(e)}")
        return 0
86
+
87
+ # Process all .docx and .pdf files
88
def get_images(INPUT_PATH_OF_DOCS="all_documents/", TEMP_PDF_PATH="temp_pdfs/", OUTPUT_PATH_OF_SCREENSHOTS="images/"):
    """Convert every .docx and .pdf file in a directory to per-page PNG images.

    .docx files are first converted to a temporary PDF with ``docx_to_pdf``;
    PDFs are rendered with ``pdf_to_images``. A summary is printed at the end
    and the temporary PDFs targeted by this run are deleted.

    Args:
        INPUT_PATH_OF_DOCS: Directory scanned (non-recursively) for documents.
        TEMP_PDF_PATH: Directory for intermediate PDFs; created if missing.
        OUTPUT_PATH_OF_SCREENSHOTS: Directory the PNG files are written to;
            created if missing.

    Returns:
        None. Progress and results are reported on stdout.
    """
    temp_pdf_dir = Path(TEMP_PDF_PATH)
    temp_pdf_dir.mkdir(parents=True, exist_ok=True)
    output_dir = Path(OUTPUT_PATH_OF_SCREENSHOTS)
    output_dir.mkdir(parents=True, exist_ok=True)

    total_processed = 0
    total_images = 0
    # Remember which temporary PDFs this run targets so cleanup never touches
    # unrelated PDFs that already live in the temp directory.
    created_temp_pdfs = []

    # Sorted so the processing order (and the "Document #" shown) is
    # reproducible; os.listdir gives no ordering guarantee.
    for idx, paths in enumerate(sorted(os.listdir(INPUT_PATH_OF_DOCS)), start=1):
        whole_path = os.path.join(INPUT_PATH_OF_DOCS, paths)
        if not os.path.isfile(whole_path):
            continue

        output_base_path = output_dir / Path(paths).stem
        lowered = paths.lower()

        if lowered.endswith('.docx'):
            print(f"\n📄 Processing .docx: {paths} (Document #{idx})")
            temp_pdf_path = temp_pdf_dir / f"{Path(paths).stem}.pdf"
            created_temp_pdfs.append(temp_pdf_path)

            if docx_to_pdf(whole_path, temp_pdf_path, temp_pdf_dir):
                print("📸 Converting to images...")
                count = pdf_to_images(temp_pdf_path, output_base_path)
                total_images += count
                total_processed += 1

        elif lowered.endswith('.pdf'):
            print(f"\n📄 Processing .pdf: {paths} (Document #{idx})")
            count = pdf_to_images(whole_path, output_base_path)
            total_images += count
            total_processed += 1

    print(f"\n{'='*50}")
    print(f"📊 CONVERSION SUMMARY")
    print(f"{'='*50}")
    print(f"Documents processed: {total_processed}")
    print(f"Total images saved: {total_images}")
    print(f"{'='*50}")

    print("\n🧹 Cleaning up temporary files...")
    for temp_pdf in created_temp_pdfs:
        # A failed conversion, or two inputs with the same stem, can leave
        # nothing to delete for an entry.
        if not temp_pdf.exists():
            continue
        try:
            temp_pdf.unlink()
            print(f"✅ Deleted: {temp_pdf.name}")
        except OSError as e:
            print(f"❌ Error deleting {temp_pdf}: {str(e)}")
137
+
138
# Script entry point: run the conversion with the default directories.
if __name__ == "__main__":
    get_images()