Spaces:

mbuck17
/

paddleocr-processor

Sleeping

mbuckle committed on Jun 3, 2025

Commit

7fa5932

1 Parent(s): 9ae0d8b

Version 8

Files changed (1) hide show

paddle_ocr_standalone.py CHANGED Viewed

@@ -94,9 +94,16 @@ try:
     # Print progress to stderr (like your local implementation)
     print(f"Starting OCR processing for: {os.path.basename(file_path)}", file=sys.stderr)
-    # Initialize PaddleOCR - exactly like your local implementation
     # Redirect PaddleOCR's stdout to stderr to avoid JSON pollution
-    ocr = PaddleOCR(use_angle_cls=True, lang='en', show_log=False)
     print("PaddleOCR initialized successfully", file=sys.stderr)
     # Check if it's a PDF or image
@@ -104,7 +111,8 @@ try:
     if is_pdf:
         print("Converting PDF to images for OCR processing...", file=sys.stderr)
-        image_paths = pdf_to_images(file_path)
         temp_files = image_paths
         if not image_paths:

     # Print progress to stderr (like your local implementation)
     print(f"Starting OCR processing for: {os.path.basename(file_path)}", file=sys.stderr)
+    # Initialize PaddleOCR - try different settings for better text detection
     # Redirect PaddleOCR's stdout to stderr to avoid JSON pollution
+    ocr = PaddleOCR(
+        use_angle_cls=True,
+        lang='en',
+        show_log=False,
+        det_model_dir=None,  # Use default detection model
+        rec_model_dir=None,  # Use default recognition model
+        use_gpu=False        # Ensure CPU usage in serverless environment
+    )
     print("PaddleOCR initialized successfully", file=sys.stderr)
     # Check if it's a PDF or image
     if is_pdf:
         print("Converting PDF to images for OCR processing...", file=sys.stderr)
+        # Try lower DPI first to see if it helps
+        image_paths = pdf_to_images(file_path, dpi=150)  # Reduced from 200
         temp_files = image_paths
         if not image_paths: