mbuckle committed on
Commit
04fbc97
·
1 Parent(s): 7fa5932

Version 9

Browse files
Files changed (1) hide show
  1. paddle_ocr_standalone.py +209 -126
paddle_ocr_standalone.py CHANGED
@@ -1,185 +1,262 @@
1
  #!/usr/bin/env python3
2
- # paddle_ocr_standalone.py - Fixed version with PDF to image conversion
3
 
4
  import sys
5
  import os
6
  import json
7
  import tempfile
 
8
 
9
- # Apply monkey patch for PyMuPDF compatibility BEFORE importing anything
10
- import fitz # PyMuPDF for PDF page counting
11
-
12
- if not hasattr(fitz.Document, 'pageCount'):
13
- def pageCount_property(self):
14
- return self.page_count
15
- fitz.Document.pageCount = property(pageCount_property)
16
-
17
- if not hasattr(fitz.Page, 'getPixmap'):
18
- def getPixmap(self, matrix=None, alpha=True):
19
- return self.get_pixmap(matrix=matrix, alpha=alpha)
20
- fitz.Page.getPixmap = getPixmap
21
-
22
- if not hasattr(fitz.Page, 'getText'):
23
- def getText(self, option="text"):
24
- return self.get_text(option)
25
- fitz.Page.getText = getText
26
-
27
- # NOW import PaddleOCR after applying the patches
28
- from paddleocr import PaddleOCR
29
 
30
- def pdf_to_images(pdf_path, dpi=200):
31
- """Convert PDF pages to images since PaddleOCR can't read PDFs directly"""
32
  try:
33
- doc = fitz.open(pdf_path)
34
- image_paths = []
35
-
36
- print(f"PDF has {len(doc)} pages", file=sys.stderr)
37
-
38
- for page_num in range(len(doc)):
39
- page = doc[page_num]
40
-
41
- # Create a transformation matrix for higher DPI
42
- mat = fitz.Matrix(dpi/72, dpi/72) # 200 DPI for better OCR accuracy
43
-
44
- # Render page to pixmap
45
- if hasattr(page, 'getPixmap'):
46
- pix = page.getPixmap(matrix=mat)
47
- else:
48
- pix = page.get_pixmap(matrix=mat)
49
-
50
- # Save to temporary file
51
- temp_img_path = f"/tmp/ocr_page_{page_num}_{os.getpid()}.png"
52
- pix.save(temp_img_path)
53
-
54
- # Check if file was created and get its size
55
- if os.path.exists(temp_img_path):
56
- file_size = os.path.getsize(temp_img_path)
57
- print(f"Converted page {page_num + 1} to: {temp_img_path} (size: {file_size} bytes, dimensions: {pix.width}x{pix.height})", file=sys.stderr)
58
- else:
59
- print(f"Failed to create image file: {temp_img_path}", file=sys.stderr)
60
- continue
61
-
62
- image_paths.append(temp_img_path)
63
-
64
- doc.close()
65
- print(f"Successfully converted {len(image_paths)} pages to images", file=sys.stderr)
66
- return image_paths
67
-
68
  except Exception as e:
69
- print(f"Error converting PDF to images: {e}", file=sys.stderr)
70
- import traceback
71
- traceback.print_exc(file=sys.stderr)
72
- return []
73
-
74
- def cleanup_temp_files(file_paths):
75
- """Clean up temporary image files"""
76
- for file_path in file_paths:
77
- try:
78
- if os.path.exists(file_path):
79
- os.unlink(file_path)
80
- print(f"Cleaned up: {file_path}", file=sys.stderr)
81
- except Exception as e:
82
- print(f"Warning: Could not clean up {file_path}: {e}", file=sys.stderr)
83
 
84
  # Check if file path was provided
85
  if len(sys.argv) < 2:
86
- result = {"success": False, "error": "Usage: python paddle_ocr_standalone.py <file_path>"}
87
- print(json.dumps(result))
88
  sys.exit(1)
89
 
90
  file_path = sys.argv[1]
91
  temp_files = []
92
 
93
  try:
94
- # Print progress to stderr (like your local implementation)
95
- print(f"Starting OCR processing for: {os.path.basename(file_path)}", file=sys.stderr)
96
-
97
- # Initialize PaddleOCR - try different settings for better text detection
98
- # Redirect PaddleOCR's stdout to stderr to avoid JSON pollution
99
- ocr = PaddleOCR(
100
- use_angle_cls=True,
101
- lang='en',
102
- show_log=False,
103
- det_model_dir=None, # Use default detection model
104
- rec_model_dir=None, # Use default recognition model
105
- use_gpu=False # Ensure CPU usage in serverless environment
106
- )
107
- print("PaddleOCR initialized successfully", file=sys.stderr)
108
-
109
- # Check if it's a PDF or image
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
110
  is_pdf = file_path.lower().endswith('.pdf')
111
 
112
  if is_pdf:
113
- print("Converting PDF to images for OCR processing...", file=sys.stderr)
114
- # Try lower DPI first to see if it helps
115
- image_paths = pdf_to_images(file_path, dpi=150) # Reduced from 200
116
  temp_files = image_paths
117
 
118
  if not image_paths:
119
- raise Exception("Failed to convert PDF to images")
120
 
121
  total_pages = len(image_paths)
 
122
  else:
123
- # For image files, use directly
124
  image_paths = [file_path]
125
  total_pages = 1
126
 
127
- print(f"TOTAL_PAGES:{total_pages}", file=sys.stderr)
128
 
129
  # Process each image with OCR
 
130
  extracted_text = ""
131
  pages_processed = 0
132
 
133
  for i, img_path in enumerate(image_paths):
134
  try:
135
  current_page = i + 1
136
- print(f"CURRENT_PAGE:{current_page}", file=sys.stderr)
137
- print(f"Processing image: {img_path}", file=sys.stderr)
 
 
 
 
 
 
 
 
138
 
139
  # Run OCR on the image
 
140
  result = ocr.ocr(img_path, cls=True)
141
 
142
- # Debug: print what OCR returns
143
- print(f"OCR result for page {current_page}: {type(result)}, length: {len(result) if result else 'None'}", file=sys.stderr)
 
 
 
 
 
 
 
 
144
 
145
- if result and result[0]: # result is a list of pages, we have one page per image
146
- print(f"Page {current_page} has {len(result[0])} text lines detected", file=sys.stderr)
147
-
148
  pages_processed += 1
149
  page_text = ""
150
 
151
  for line_idx, line in enumerate(result[0]):
152
- if len(line) >= 2:
153
- text_content = line[1][0] if isinstance(line[1], (list, tuple)) else str(line[1])
154
- confidence = line[1][1] if isinstance(line[1], (list, tuple)) and len(line[1]) > 1 else 1.0
155
-
156
- print(f"Line {line_idx}: '{text_content}' (confidence: {confidence})", file=sys.stderr)
157
-
158
- if confidence > 0.3: # Lower confidence threshold for debugging
159
- page_text += text_content + "\n"
 
 
 
 
160
 
161
  if page_text.strip():
162
  extracted_text += f"\n--- Page {current_page} ---\n"
163
  extracted_text += page_text
164
- print(f"Page {current_page} text added: {len(page_text)} characters", file=sys.stderr)
165
  else:
166
- print(f"Page {current_page}: No text above confidence threshold", file=sys.stderr)
167
 
168
- print(f"Page {current_page} processed successfully", file=sys.stderr)
169
  else:
170
- print(f"No OCR results returned for page {current_page}", file=sys.stderr)
171
- if result:
172
- print(f"Result structure: {result}", file=sys.stderr)
173
 
174
  except Exception as page_error:
175
- print(f"Error processing page {current_page}: {page_error}", file=sys.stderr)
 
176
  continue
177
 
178
  # Clean up temporary files
179
  if temp_files:
 
180
  cleanup_temp_files(temp_files)
181
 
182
- # Output the final result as JSON to stdout
183
  result_data = {
184
  "success": True,
185
  "text": extracted_text,
@@ -188,21 +265,27 @@ try:
188
  "method": "pdf_to_images" if is_pdf else "direct_image"
189
  }
190
 
191
- print(json.dumps(result_data))
192
- print(f"Successfully processed {pages_processed}/{total_pages} pages", file=sys.stderr)
 
 
 
193
 
194
  except Exception as e:
195
  # Clean up on error
196
  if temp_files:
197
- cleanup_temp_files(temp_files)
198
-
199
- print(f"Error during OCR processing: {e}", file=sys.stderr)
200
- import traceback
 
 
 
201
  traceback.print_exc(file=sys.stderr)
202
 
203
  error_data = {
204
  "success": False,
205
  "error": str(e)
206
  }
207
- print(json.dumps(error_data))
208
  sys.exit(1)
 
1
  #!/usr/bin/env python3
2
+ # paddle_ocr_standalone.py - Robust version with comprehensive error handling
3
 
4
  import sys
5
  import os
6
  import json
7
  import tempfile
8
+ import traceback
9
 
10
def safe_print_stderr(message):
    """Write a diagnostic message to stderr without raising on I/O failure.

    Progress and debug output goes through this helper so that stdout can
    carry only the JSON result.

    Args:
        message: Text (or any printable object) to emit on stderr.
    """
    try:
        print(message, file=sys.stderr, flush=True)
    except Exception:
        # Logging is best-effort: a closed or broken stderr must not abort
        # the OCR run. Catching Exception (rather than a bare ``except:``)
        # still lets KeyboardInterrupt and SystemExit propagate.
        pass
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
 
17
def safe_print_json(data):
    """Emit ``data`` as one JSON line on stdout without raising.

    If serialising or writing ``data`` fails, the error is logged to stderr
    and a fixed failure payload is printed instead, so the consumer of
    stdout still receives a JSON object.

    Args:
        data: JSON-serialisable object (the script passes result dicts).
    """
    try:
        print(json.dumps(data), flush=True)
    except Exception as e:
        safe_print_stderr(f"Error printing JSON: {e}")
        try:
            # Flush the fallback as well so it is not lost if the process
            # exits immediately afterwards.
            print('{"success": false, "error": "JSON serialization failed"}', flush=True)
        except Exception:
            # stdout itself is unusable; there is nowhere left to report to.
            pass
 
 
 
 
 
 
 
 
 
 
 
 
24
 
25
  # Check if file path was provided
26
  if len(sys.argv) < 2:
27
+ safe_print_json({"success": False, "error": "Usage: python paddle_ocr_standalone.py <file_path>"})
 
28
  sys.exit(1)
29
 
30
  file_path = sys.argv[1]
31
  temp_files = []
32
 
33
  try:
34
+ safe_print_stderr(f"=== Starting OCR processing for: {os.path.basename(file_path)} ===")
35
+
36
+ # Check if file exists and is readable
37
+ if not os.path.exists(file_path):
38
+ raise Exception(f"File does not exist: {file_path}")
39
+
40
+ if not os.access(file_path, os.R_OK):
41
+ raise Exception(f"File is not readable: {file_path}")
42
+
43
+ file_size = os.path.getsize(file_path)
44
+ safe_print_stderr(f"File size: {file_size} bytes")
45
+
46
+ # Import dependencies one by one with error handling
47
+ safe_print_stderr("Importing PyMuPDF...")
48
+ try:
49
+ import fitz
50
+ safe_print_stderr("βœ“ PyMuPDF imported successfully")
51
+ except Exception as e:
52
+ raise Exception(f"Failed to import PyMuPDF: {e}")
53
+
54
+ # Apply monkey patch for PyMuPDF compatibility
55
+ safe_print_stderr("Applying PyMuPDF compatibility patches...")
56
+ try:
57
+ if not hasattr(fitz.Document, 'pageCount'):
58
+ def pageCount_property(self):
59
+ return self.page_count
60
+ fitz.Document.pageCount = property(pageCount_property)
61
+ safe_print_stderr("βœ“ Added pageCount property")
62
+
63
+ if not hasattr(fitz.Page, 'getPixmap'):
64
+ def getPixmap(self, matrix=None, alpha=True):
65
+ return self.get_pixmap(matrix=matrix, alpha=alpha)
66
+ fitz.Page.getPixmap = getPixmap
67
+ safe_print_stderr("βœ“ Added getPixmap method")
68
+
69
+ if not hasattr(fitz.Page, 'getText'):
70
+ def getText(self, option="text"):
71
+ return self.get_text(option)
72
+ fitz.Page.getText = getText
73
+ safe_print_stderr("βœ“ Added getText method")
74
+
75
+ except Exception as e:
76
+ safe_print_stderr(f"Warning: Monkey patch failed: {e}")
77
+
78
+ # Test PDF opening
79
+ safe_print_stderr("Testing PDF opening...")
80
+ try:
81
+ test_doc = fitz.open(file_path)
82
+ page_count = len(test_doc)
83
+ safe_print_stderr(f"βœ“ PDF opened successfully, {page_count} pages detected")
84
+ test_doc.close()
85
+ except Exception as e:
86
+ raise Exception(f"Failed to open PDF: {e}")
87
+
88
+ # Import PaddleOCR
89
+ safe_print_stderr("Importing PaddleOCR...")
90
+ try:
91
+ from paddleocr import PaddleOCR
92
+ safe_print_stderr("βœ“ PaddleOCR imported successfully")
93
+ except Exception as e:
94
+ raise Exception(f"Failed to import PaddleOCR: {e}")
95
+
96
+ # Initialize PaddleOCR
97
+ safe_print_stderr("Initializing PaddleOCR...")
98
+ try:
99
+ ocr = PaddleOCR(
100
+ use_angle_cls=True,
101
+ lang='en',
102
+ show_log=False,
103
+ use_gpu=False
104
+ )
105
+ safe_print_stderr("βœ“ PaddleOCR initialized successfully")
106
+ except Exception as e:
107
+ raise Exception(f"Failed to initialize PaddleOCR: {e}")
108
+
109
def pdf_to_images(pdf_path, dpi=150):
    """Render each page of a PDF to a PNG file so it can be OCR'd.

    Args:
        pdf_path: Path of the PDF document to rasterise.
        dpi: Render resolution; pages are scaled by ``dpi / 72``.

    Returns:
        List of paths of the PNG files written, in page order. Pages that
        fail to render are skipped. An empty list is returned when the
        conversion as a whole fails (for example the document cannot be
        opened).
    """
    try:
        safe_print_stderr(f"Converting PDF to images (DPI: {dpi})...")
        doc = fitz.open(pdf_path)
        image_paths = []

        try:
            # Cache the page count while the document is open: asking a
            # closed PyMuPDF document for its length raises, which previously
            # turned every successful conversion into a failure.
            page_total = len(doc)
            safe_print_stderr(f"PDF has {page_total} pages")

            # The scaling matrix is the same for every page.
            mat = fitz.Matrix(dpi/72, dpi/72)

            for page_num in range(page_total):
                try:
                    safe_print_stderr(f"Converting page {page_num + 1}...")
                    page = doc[page_num]

                    # Render page to pixmap (legacy or current API name)
                    if hasattr(page, 'getPixmap'):
                        pix = page.getPixmap(matrix=mat)
                    else:
                        pix = page.get_pixmap(matrix=mat)

                    # Save to temporary file; the PID keeps concurrent runs apart
                    temp_img_path = f"/tmp/ocr_page_{page_num}_{os.getpid()}.png"
                    pix.save(temp_img_path)

                    # Verify file creation
                    if os.path.exists(temp_img_path):
                        file_size = os.path.getsize(temp_img_path)
                        safe_print_stderr(f"βœ“ Page {page_num + 1} converted: {temp_img_path} (size: {file_size} bytes, {pix.width}x{pix.height})")
                        image_paths.append(temp_img_path)
                    else:
                        safe_print_stderr(f"βœ— Failed to create image: {temp_img_path}")

                except Exception as page_error:
                    # One bad page must not discard the pages already rendered.
                    safe_print_stderr(f"βœ— Error converting page {page_num + 1}: {page_error}")
                    continue
        finally:
            # Always release the document, even if an error escapes the loop.
            doc.close()

        safe_print_stderr(f"βœ“ Successfully converted {len(image_paths)}/{page_total} pages")
        return image_paths

    except Exception as e:
        safe_print_stderr(f"βœ— PDF conversion failed: {e}")
        traceback.print_exc(file=sys.stderr)
        return []
156
+
157
def cleanup_temp_files(file_paths):
    """Delete every temporary file listed in ``file_paths``.

    Paths that do not exist are skipped silently. A failure to remove one
    file is reported on stderr as a warning and does not stop the remaining
    deletions.
    """
    for temp_path in file_paths:
        try:
            if not os.path.exists(temp_path):
                continue
            os.unlink(temp_path)
            safe_print_stderr(f"βœ“ Cleaned up: {temp_path}")
        except Exception as cleanup_error:
            safe_print_stderr(f"Warning: Could not clean up {temp_path}: {cleanup_error}")
166
+
167
+ # Determine file type and convert if needed
168
  is_pdf = file_path.lower().endswith('.pdf')
169
 
170
  if is_pdf:
171
+ safe_print_stderr("Processing PDF file...")
172
+ image_paths = pdf_to_images(file_path)
 
173
  temp_files = image_paths
174
 
175
  if not image_paths:
176
+ raise Exception("PDF conversion produced no images")
177
 
178
  total_pages = len(image_paths)
179
+ safe_print_stderr(f"Will process {total_pages} images")
180
  else:
181
+ safe_print_stderr("Processing image file...")
182
  image_paths = [file_path]
183
  total_pages = 1
184
 
185
+ safe_print_stderr(f"TOTAL_PAGES:{total_pages}")
186
 
187
  # Process each image with OCR
188
+ safe_print_stderr("Starting OCR processing...")
189
  extracted_text = ""
190
  pages_processed = 0
191
 
192
  for i, img_path in enumerate(image_paths):
193
  try:
194
  current_page = i + 1
195
+ safe_print_stderr(f"CURRENT_PAGE:{current_page}")
196
+ safe_print_stderr(f"Processing image: {img_path}")
197
+
198
+ # Verify image exists and is readable
199
+ if not os.path.exists(img_path):
200
+ safe_print_stderr(f"βœ— Image file does not exist: {img_path}")
201
+ continue
202
+
203
+ img_size = os.path.getsize(img_path)
204
+ safe_print_stderr(f"Image size: {img_size} bytes")
205
 
206
  # Run OCR on the image
207
+ safe_print_stderr(f"Running OCR on page {current_page}...")
208
  result = ocr.ocr(img_path, cls=True)
209
 
210
+ safe_print_stderr(f"OCR result type: {type(result)}")
211
+ if result:
212
+ safe_print_stderr(f"OCR result length: {len(result)}")
213
+ if result[0]:
214
+ safe_print_stderr(f"Page {current_page} has {len(result[0])} text regions detected")
215
+ else:
216
+ safe_print_stderr(f"Page {current_page}: OCR returned empty result")
217
+ else:
218
+ safe_print_stderr(f"Page {current_page}: OCR returned None")
219
+ continue
220
 
221
+ if result and result[0]:
 
 
222
  pages_processed += 1
223
  page_text = ""
224
 
225
  for line_idx, line in enumerate(result[0]):
226
+ try:
227
+ if len(line) >= 2:
228
+ text_content = line[1][0] if isinstance(line[1], (list, tuple)) else str(line[1])
229
+ confidence = line[1][1] if isinstance(line[1], (list, tuple)) and len(line[1]) > 1 else 1.0
230
+
231
+ safe_print_stderr(f"Line {line_idx}: '{text_content}' (confidence: {confidence:.2f})")
232
+
233
+ if confidence > 0.3:
234
+ page_text += text_content + "\n"
235
+ except Exception as line_error:
236
+ safe_print_stderr(f"Error processing line {line_idx}: {line_error}")
237
+ continue
238
 
239
  if page_text.strip():
240
  extracted_text += f"\n--- Page {current_page} ---\n"
241
  extracted_text += page_text
242
+ safe_print_stderr(f"βœ“ Page {current_page}: Added {len(page_text)} characters of text")
243
  else:
244
+ safe_print_stderr(f"Page {current_page}: No text above confidence threshold")
245
 
 
246
  else:
247
+ safe_print_stderr(f"Page {current_page}: No OCR results")
 
 
248
 
249
  except Exception as page_error:
250
+ safe_print_stderr(f"βœ— Error processing page {current_page}: {page_error}")
251
+ traceback.print_exc(file=sys.stderr)
252
  continue
253
 
254
  # Clean up temporary files
255
  if temp_files:
256
+ safe_print_stderr("Cleaning up temporary files...")
257
  cleanup_temp_files(temp_files)
258
 
259
+ # Prepare final result
260
  result_data = {
261
  "success": True,
262
  "text": extracted_text,
 
265
  "method": "pdf_to_images" if is_pdf else "direct_image"
266
  }
267
 
268
+ safe_print_stderr(f"=== OCR Complete: {pages_processed}/{total_pages} pages processed ===")
269
+ safe_print_stderr(f"Total text length: {len(extracted_text)} characters")
270
+
271
+ # Output final JSON result
272
+ safe_print_json(result_data)
273
 
274
  except Exception as e:
275
  # Clean up on error
276
  if temp_files:
277
+ try:
278
+ cleanup_temp_files(temp_files)
279
+ except:
280
+ pass
281
+
282
+ safe_print_stderr(f"=== FATAL ERROR ===")
283
+ safe_print_stderr(f"Error: {e}")
284
  traceback.print_exc(file=sys.stderr)
285
 
286
  error_data = {
287
  "success": False,
288
  "error": str(e)
289
  }
290
+ safe_print_json(error_data)
291
  sys.exit(1)