mbuckle committed on
Commit
7fa5932
·
1 Parent(s): 9ae0d8b

Version 8

Browse files
Files changed (1) hide show
  1. paddle_ocr_standalone.py +11 -3
paddle_ocr_standalone.py CHANGED
@@ -94,9 +94,16 @@ try:
94
  # Print progress to stderr (like your local implementation)
95
  print(f"Starting OCR processing for: {os.path.basename(file_path)}", file=sys.stderr)
96
 
97
- # Initialize PaddleOCR - exactly like your local implementation
98
  # Redirect PaddleOCR's stdout to stderr to avoid JSON pollution
99
- ocr = PaddleOCR(use_angle_cls=True, lang='en', show_log=False)
 
 
 
 
 
 
 
100
  print("PaddleOCR initialized successfully", file=sys.stderr)
101
 
102
  # Check if it's a PDF or image
@@ -104,7 +111,8 @@ try:
104
 
105
  if is_pdf:
106
  print("Converting PDF to images for OCR processing...", file=sys.stderr)
107
- image_paths = pdf_to_images(file_path)
 
108
  temp_files = image_paths
109
 
110
  if not image_paths:
 
94
  # Print progress to stderr (like your local implementation)
95
  print(f"Starting OCR processing for: {os.path.basename(file_path)}", file=sys.stderr)
96
 
97
+ # Initialize PaddleOCR - try different settings for better text detection
98
  # Redirect PaddleOCR's stdout to stderr to avoid JSON pollution
99
+ ocr = PaddleOCR(
100
+ use_angle_cls=True,
101
+ lang='en',
102
+ show_log=False,
103
+ det_model_dir=None, # Use default detection model
104
+ rec_model_dir=None, # Use default recognition model
105
+ use_gpu=False # Ensure CPU usage in serverless environment
106
+ )
107
  print("PaddleOCR initialized successfully", file=sys.stderr)
108
 
109
  # Check if it's a PDF or image
 
111
 
112
  if is_pdf:
113
  print("Converting PDF to images for OCR processing...", file=sys.stderr)
114
+ # Try lower DPI first to see if it helps
115
+ image_paths = pdf_to_images(file_path, dpi=150) # Reduced from 200
116
  temp_files = image_paths
117
 
118
  if not image_paths: