mbuckle committed on
Commit
5745364
·
1 Parent(s): 6dc8e9d

PDF to image conversion fix

Browse files
Files changed (1) hide show
  1. paddle_ocr_standalone.py +92 -30
paddle_ocr_standalone.py CHANGED
@@ -1,9 +1,10 @@
1
  #!/usr/bin/env python3
2
- # paddle_ocr_standalone.py - Standalone script that mirrors your local implementation
3
 
4
  import sys
5
  import os
6
  import json
 
7
 
8
  # Apply monkey patch for PyMuPDF compatibility BEFORE importing anything
9
  import fitz # PyMuPDF for PDF page counting
@@ -26,6 +27,48 @@ if not hasattr(fitz.Page, 'getText'):
26
  # NOW import PaddleOCR after applying the patches
27
  from paddleocr import PaddleOCR
28
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
  # Check if file path was provided
30
  if len(sys.argv) < 2:
31
  result = {"success": False, "error": "Usage: python paddle_ocr_standalone.py <file_path>"}
@@ -33,71 +76,90 @@ if len(sys.argv) < 2:
33
  sys.exit(1)
34
 
35
  file_path = sys.argv[1]
 
36
 
37
  try:
38
  # Print progress to stderr (like your local implementation)
39
  print(f"Starting OCR processing for: {os.path.basename(file_path)}", file=sys.stderr)
40
 
41
  # Initialize PaddleOCR - exactly like your local implementation
 
42
  ocr = PaddleOCR(use_angle_cls=True, lang='en', show_log=False)
43
  print("PaddleOCR initialized successfully", file=sys.stderr)
44
 
45
- # Count total pages if it's a PDF
46
- def count_pdf_pages(file_path):
47
- try:
48
- if file_path.lower().endswith('.pdf'):
49
- doc = fitz.open(file_path)
50
- page_count = len(doc)
51
- doc.close()
52
- return page_count
53
- else:
54
- return 1 # Images are considered as 1 page
55
- except Exception as e:
56
- print(f"Error counting pages: {e}", file=sys.stderr)
57
- return 1 # Default to 1 if we can't determine
58
 
59
- # Get total pages
60
- total_pages = count_pdf_pages(file_path)
61
- print(f"TOTAL_PAGES:{total_pages}", file=sys.stderr)
 
 
 
 
 
 
 
 
 
 
62
 
63
- # Process the file - exactly like your local implementation
64
- print(f"Running OCR on file: {file_path}", file=sys.stderr)
65
- result = ocr.ocr(file_path, cls=True)
66
- print("OCR processing completed", file=sys.stderr)
67
 
68
- # Extract text and output results
69
  extracted_text = ""
70
  pages_processed = 0
71
 
72
- if result:
73
- # Print recognized text with page information
74
- for page_idx, page_result in enumerate(result):
75
- current_page = page_idx + 1
76
  print(f"CURRENT_PAGE:{current_page}", file=sys.stderr)
 
 
 
 
77
 
78
- if page_result:
79
  pages_processed += 1
80
  page_text = ""
81
- for line in page_result:
82
- if len(line) >= 2:
 
83
  page_text += line[1][0] + "\n"
84
 
85
  if page_text.strip():
86
  extracted_text += f"\n--- Page {current_page} ---\n"
87
  extracted_text += page_text
 
 
 
 
 
 
 
 
 
 
 
 
88
 
89
  # Output the final result as JSON to stdout
90
  result_data = {
91
  "success": True,
92
  "text": extracted_text,
93
  "total_pages": total_pages,
94
- "pages_processed": pages_processed
 
95
  }
96
 
97
  print(json.dumps(result_data))
98
  print(f"Successfully processed {pages_processed}/{total_pages} pages", file=sys.stderr)
99
 
100
  except Exception as e:
 
 
 
 
101
  print(f"Error during OCR processing: {e}", file=sys.stderr)
102
  import traceback
103
  traceback.print_exc(file=sys.stderr)
 
1
  #!/usr/bin/env python3
2
+ # paddle_ocr_standalone.py - Fixed version with PDF to image conversion
3
 
4
  import sys
5
  import os
6
  import json
7
+ import tempfile
8
 
9
  # Apply monkey patch for PyMuPDF compatibility BEFORE importing anything
10
  import fitz # PyMuPDF for PDF page counting
 
27
  # NOW import PaddleOCR after applying the patches
28
  from paddleocr import PaddleOCR
29
 
30
def pdf_to_images(pdf_path, dpi=200):
    """Convert PDF pages to PNG images since PaddleOCR can't read PDFs directly.

    Each page is rendered with PyMuPDF and written to its own uniquely named
    temporary file. The caller owns the returned files and is responsible
    for deleting them once OCR is finished.

    Args:
        pdf_path: Path of the PDF file to render.
        dpi: Target rendering resolution (default 200 for better OCR accuracy).

    Returns:
        A list of temporary PNG paths, one per page, in page order. An empty
        list is returned if anything fails; images written before the
        failure are removed so nothing is leaked.
    """
    image_paths = []
    doc = None
    try:
        doc = fitz.open(pdf_path)

        # The transformation matrix is the same for every page, so build it
        # once: it scales the 72 DPI baseline up to the requested resolution.
        zoom = dpi / 72
        mat = fitz.Matrix(zoom, zoom)

        for page_num in range(len(doc)):
            page = doc[page_num]

            # Render page to pixmap; prefer the current method name and fall
            # back to the legacy camelCase one on old PyMuPDF releases.
            if hasattr(page, 'get_pixmap'):
                pix = page.get_pixmap(matrix=mat)
            else:
                pix = page.getPixmap(matrix=mat)

            # mkstemp picks the platform temp directory and guarantees a
            # unique, freshly created file (no hard-coded /tmp, no clashes).
            fd, temp_img_path = tempfile.mkstemp(
                prefix=f"ocr_page_{page_num}_{os.getpid()}_", suffix=".png"
            )
            os.close(fd)
            # Record the path before saving so a failed save is still cleaned up.
            image_paths.append(temp_img_path)
            pix.save(temp_img_path)

            print(f"Converted page {page_num + 1} to: {temp_img_path}", file=sys.stderr)

        return image_paths

    except Exception as e:
        print(f"Error converting PDF to images: {e}", file=sys.stderr)
        # The caller only sees [] on failure, so remove partial output here.
        for leftover in image_paths:
            try:
                os.unlink(leftover)
            except OSError:
                # Best effort: an undeletable temp file must not mask the error.
                pass
        return []

    finally:
        # Always release the document, even when rendering failed midway.
        if doc is not None:
            try:
                doc.close()
            except Exception as close_error:
                print(f"Warning: Could not close PDF: {close_error}", file=sys.stderr)
61
+
62
def cleanup_temp_files(file_paths):
    """Clean up temporary image files on a best-effort basis.

    Args:
        file_paths: Iterable of file paths to delete. ``None`` or an empty
            iterable is accepted and does nothing.

    Returns:
        None. Paths that no longer exist are skipped silently; any other
        failure is reported on stderr and does not stop the remaining
        deletions.
    """
    for path in file_paths or ():
        try:
            # Attempt the delete directly instead of checking existence
            # first, which would leave a window for the file to vanish.
            os.unlink(path)
        except FileNotFoundError:
            # Already gone: nothing to clean up and nothing to report.
            continue
        except Exception as e:
            print(f"Warning: Could not clean up {path}: {e}", file=sys.stderr)
        else:
            print(f"Cleaned up: {path}", file=sys.stderr)
71
+
72
  # Check if file path was provided
73
  if len(sys.argv) < 2:
74
  result = {"success": False, "error": "Usage: python paddle_ocr_standalone.py <file_path>"}
 
76
  sys.exit(1)
77
 
78
  file_path = sys.argv[1]
79
+ temp_files = []
80
 
81
  try:
82
  # Print progress to stderr (like your local implementation)
83
  print(f"Starting OCR processing for: {os.path.basename(file_path)}", file=sys.stderr)
84
 
85
  # Initialize PaddleOCR - exactly like your local implementation
86
+ # Redirect PaddleOCR's stdout to stderr to avoid JSON pollution
87
  ocr = PaddleOCR(use_angle_cls=True, lang='en', show_log=False)
88
  print("PaddleOCR initialized successfully", file=sys.stderr)
89
 
90
+ # Check if it's a PDF or image
91
+ is_pdf = file_path.lower().endswith('.pdf')
 
 
 
 
 
 
 
 
 
 
 
92
 
93
+ if is_pdf:
94
+ print("Converting PDF to images for OCR processing...", file=sys.stderr)
95
+ image_paths = pdf_to_images(file_path)
96
+ temp_files = image_paths
97
+
98
+ if not image_paths:
99
+ raise Exception("Failed to convert PDF to images")
100
+
101
+ total_pages = len(image_paths)
102
+ else:
103
+ # For image files, use directly
104
+ image_paths = [file_path]
105
+ total_pages = 1
106
 
107
+ print(f"TOTAL_PAGES:{total_pages}", file=sys.stderr)
 
 
 
108
 
109
+ # Process each image with OCR
110
  extracted_text = ""
111
  pages_processed = 0
112
 
113
+ for i, img_path in enumerate(image_paths):
114
+ try:
115
+ current_page = i + 1
 
116
  print(f"CURRENT_PAGE:{current_page}", file=sys.stderr)
117
+ print(f"Processing image: {img_path}", file=sys.stderr)
118
+
119
+ # Run OCR on the image
120
+ result = ocr.ocr(img_path, cls=True)
121
 
122
+ if result and result[0]: # result is a list of pages, we have one page per image
123
  pages_processed += 1
124
  page_text = ""
125
+
126
+ for line in result[0]:
127
+ if len(line) >= 2 and line[1][1] > 0.5: # confidence threshold
128
  page_text += line[1][0] + "\n"
129
 
130
  if page_text.strip():
131
  extracted_text += f"\n--- Page {current_page} ---\n"
132
  extracted_text += page_text
133
+
134
+ print(f"Page {current_page} processed successfully", file=sys.stderr)
135
+ else:
136
+ print(f"No text found on page {current_page}", file=sys.stderr)
137
+
138
+ except Exception as page_error:
139
+ print(f"Error processing page {current_page}: {page_error}", file=sys.stderr)
140
+ continue
141
+
142
+ # Clean up temporary files
143
+ if temp_files:
144
+ cleanup_temp_files(temp_files)
145
 
146
  # Output the final result as JSON to stdout
147
  result_data = {
148
  "success": True,
149
  "text": extracted_text,
150
  "total_pages": total_pages,
151
+ "pages_processed": pages_processed,
152
+ "method": "pdf_to_images" if is_pdf else "direct_image"
153
  }
154
 
155
  print(json.dumps(result_data))
156
  print(f"Successfully processed {pages_processed}/{total_pages} pages", file=sys.stderr)
157
 
158
  except Exception as e:
159
+ # Clean up on error
160
+ if temp_files:
161
+ cleanup_temp_files(temp_files)
162
+
163
  print(f"Error during OCR processing: {e}", file=sys.stderr)
164
  import traceback
165
  traceback.print_exc(file=sys.stderr)