datasciencesage committed on
Commit
533248a
·
verified ·
1 Parent(s): 3375af0

Update step1_get_images.py

Browse files
Files changed (1) hide show
  1. step1_get_images.py +27 -38
step1_get_images.py CHANGED
@@ -5,12 +5,8 @@ from pdf2image import convert_from_path
5
  from tqdm import tqdm
6
  from PIL import Image
7
 
8
-
9
- # Create directories
10
-
11
-
12
-
13
- def docx_to_pdf(docx_path, output_pdf_path,temp_pdf_dir):
14
  try:
15
  command = [
16
  "soffice",
@@ -21,19 +17,22 @@ def docx_to_pdf(docx_path, output_pdf_path,temp_pdf_dir):
21
  str(temp_pdf_dir),
22
  str(docx_path)
23
  ]
 
24
  result = subprocess.run(
25
  command,
26
  stdout=subprocess.PIPE,
27
  stderr=subprocess.PIPE,
28
  text=True,
29
- timeout=60 # Add timeout
30
  )
 
31
  if result.returncode == 0 and os.path.exists(output_pdf_path):
32
  print(f"✅ Converted to PDF: {output_pdf_path}")
33
  return True
34
  else:
35
  print(f"❌ Error converting {docx_path}: {result.stderr}")
36
  return False
 
37
  except FileNotFoundError:
38
  print("❌ Error: 'soffice' not found. Ensure LibreOffice is installed.")
39
  return False
@@ -42,61 +41,52 @@ def docx_to_pdf(docx_path, output_pdf_path,temp_pdf_dir):
42
  return False
43
 
44
  def pdf_to_images(pdf_path, output_base_path):
45
- """Convert PDF to images with validation"""
46
  try:
47
- # Convert all pages with higher DPI for better quality
48
  images = convert_from_path(
49
  pdf_path,
50
- dpi=300, # High DPI for math clarity
51
  fmt='png',
52
- thread_count=4 # Parallel processing
53
  )
54
-
55
  if not images:
56
- print(f"⚠️ No pages found in {pdf_path}")
57
  return 0
58
-
59
  saved_count = 0
60
  for page_num, image in enumerate(tqdm(images, desc="Converting pages"), 1):
61
  output_image_path = output_base_path.with_name(
62
  f"{output_base_path.stem}_page{page_num}.png"
63
  )
64
-
65
- # Validate image dimensions
66
  width, height = image.size
67
- if width <= 0 or height <= 0:
68
- print(f"⚠️ Skipping page {page_num}: Invalid dimensions ({width}x{height})")
69
- continue
70
-
71
- # Additional validation: check if image is blank
72
- if width < 50 or height < 50:
73
- print(f"⚠️ Skipping page {page_num}: Too small ({width}x{height})")
74
  continue
75
-
76
- # Save with optimization
77
  image.save(output_image_path, "PNG", optimize=True)
78
  saved_count += 1
79
-
80
  print(f"✅ Saved {saved_count}/{len(images)} pages")
81
  return saved_count
82
-
83
  except Exception as e:
84
  print(f"❌ Error processing {pdf_path}: {str(e)}")
85
  return 0
86
 
87
- # Process all .docx and .pdf files
88
- def get_images(INPUT_PATH_OF_DOCS = "all_documents/",TEMP_PDF_PATH = "temp_pdfs/",OUTPUT_PATH_OF_SCREENSHOTS = "images/"):
 
 
89
  total_processed = 0
90
  total_images = 0
91
- INPUT_PATH_OF_DOCS = INPUT_PATH_OF_DOCS
92
- TEMP_PDF_PATH = TEMP_PDF_PATH
93
- OUTPUT_PATH_OF_SCREENSHOTS = OUTPUT_PATH_OF_SCREENSHOTS
94
  temp_pdf_dir = Path(TEMP_PDF_PATH)
95
  temp_pdf_dir.mkdir(parents=True, exist_ok=True)
 
96
  output_dir = Path(OUTPUT_PATH_OF_SCREENSHOTS)
97
  output_dir.mkdir(parents=True, exist_ok=True)
98
-
99
-
100
 
101
  for idx, paths in enumerate(os.listdir(INPUT_PATH_OF_DOCS), start=1):
102
  whole_path = os.path.join(INPUT_PATH_OF_DOCS, paths)
@@ -108,7 +98,7 @@ def get_images(INPUT_PATH_OF_DOCS = "all_documents/",TEMP_PDF_PATH = "temp_pdfs/
108
  print(f"\n📄 Processing .docx: {paths} (Document #{idx})")
109
  temp_pdf_path = temp_pdf_dir / f"{Path(paths).stem}.pdf"
110
 
111
- if docx_to_pdf(whole_path, temp_pdf_path,temp_pdf_dir):
112
  print("📸 Converting to images...")
113
  count = pdf_to_images(temp_pdf_path, output_base_path)
114
  total_images += count
@@ -127,6 +117,7 @@ def get_images(INPUT_PATH_OF_DOCS = "all_documents/",TEMP_PDF_PATH = "temp_pdfs/
127
  print(f"Total images saved: {total_images}")
128
  print(f"{'='*50}")
129
 
 
130
  print("\n🧹 Cleaning up temporary files...")
131
  for temp_pdf in temp_pdf_dir.glob("*.pdf"):
132
  try:
@@ -134,6 +125,4 @@ def get_images(INPUT_PATH_OF_DOCS = "all_documents/",TEMP_PDF_PATH = "temp_pdfs/
134
  print(f"✅ Deleted: {temp_pdf.name}")
135
  except Exception as e:
136
  print(f"❌ Error deleting {temp_pdf}: {str(e)}")
137
-
138
- if __name__=="__main__":
139
- get_images()
 
5
  from tqdm import tqdm
6
  from PIL import Image
7
 
8
+ def docx_to_pdf(docx_path, output_pdf_path, temp_pdf_dir):
9
+ """Convert DOCX to PDF using LibreOffice"""
 
 
 
 
10
  try:
11
  command = [
12
  "soffice",
 
17
  str(temp_pdf_dir),
18
  str(docx_path)
19
  ]
20
+
21
  result = subprocess.run(
22
  command,
23
  stdout=subprocess.PIPE,
24
  stderr=subprocess.PIPE,
25
  text=True,
26
+ timeout=60
27
  )
28
+
29
  if result.returncode == 0 and os.path.exists(output_pdf_path):
30
  print(f"✅ Converted to PDF: {output_pdf_path}")
31
  return True
32
  else:
33
  print(f"❌ Error converting {docx_path}: {result.stderr}")
34
  return False
35
+
36
  except FileNotFoundError:
37
  print("❌ Error: 'soffice' not found. Ensure LibreOffice is installed.")
38
  return False
 
41
  return False
42
 
43
  def pdf_to_images(pdf_path, output_base_path):
44
+ """Convert PDF pages to high-quality PNG images"""
45
  try:
 
46
  images = convert_from_path(
47
  pdf_path,
48
+ dpi=300, # High quality for math equations
49
  fmt='png',
50
+ thread_count=4
51
  )
52
+
53
  if not images:
54
+ print(f"⚠️ No pages found in {pdf_path}")
55
  return 0
56
+
57
  saved_count = 0
58
  for page_num, image in enumerate(tqdm(images, desc="Converting pages"), 1):
59
  output_image_path = output_base_path.with_name(
60
  f"{output_base_path.stem}_page{page_num}.png"
61
  )
62
+
 
63
  width, height = image.size
64
+ if width <= 0 or height <= 0 or width < 50 or height < 50:
65
+ print(f"⚠️ Skipping page {page_num}: Invalid dimensions ({width}x{height})")
 
 
 
 
 
66
  continue
67
+
 
68
  image.save(output_image_path, "PNG", optimize=True)
69
  saved_count += 1
70
+
71
  print(f"✅ Saved {saved_count}/{len(images)} pages")
72
  return saved_count
73
+
74
  except Exception as e:
75
  print(f"❌ Error processing {pdf_path}: {str(e)}")
76
  return 0
77
 
78
+ def get_images(INPUT_PATH_OF_DOCS="all_documents/",
79
+ TEMP_PDF_PATH="temp_pdfs/",
80
+ OUTPUT_PATH_OF_SCREENSHOTS="images/"):
81
+ """Main function to convert all documents to images"""
82
  total_processed = 0
83
  total_images = 0
84
+
 
 
85
  temp_pdf_dir = Path(TEMP_PDF_PATH)
86
  temp_pdf_dir.mkdir(parents=True, exist_ok=True)
87
+
88
  output_dir = Path(OUTPUT_PATH_OF_SCREENSHOTS)
89
  output_dir.mkdir(parents=True, exist_ok=True)
 
 
90
 
91
  for idx, paths in enumerate(os.listdir(INPUT_PATH_OF_DOCS), start=1):
92
  whole_path = os.path.join(INPUT_PATH_OF_DOCS, paths)
 
98
  print(f"\n📄 Processing .docx: {paths} (Document #{idx})")
99
  temp_pdf_path = temp_pdf_dir / f"{Path(paths).stem}.pdf"
100
 
101
+ if docx_to_pdf(whole_path, temp_pdf_path, temp_pdf_dir):
102
  print("📸 Converting to images...")
103
  count = pdf_to_images(temp_pdf_path, output_base_path)
104
  total_images += count
 
117
  print(f"Total images saved: {total_images}")
118
  print(f"{'='*50}")
119
 
120
+ # Cleanup temp PDFs
121
  print("\n🧹 Cleaning up temporary files...")
122
  for temp_pdf in temp_pdf_dir.glob("*.pdf"):
123
  try:
 
125
  print(f"✅ Deleted: {temp_pdf.name}")
126
  except Exception as e:
127
  print(f"❌ Error deleting {temp_pdf}: {str(e)}")
128
+