Spaces:
Sleeping
Sleeping
Version 8
Browse files
- paddle_ocr_standalone.py +11 -3
paddle_ocr_standalone.py
CHANGED
|
@@ -94,9 +94,16 @@ try:
|
|
| 94 |
# Print progress to stderr (like your local implementation)
|
| 95 |
print(f"Starting OCR processing for: {os.path.basename(file_path)}", file=sys.stderr)
|
| 96 |
|
| 97 |
-
# Initialize PaddleOCR -
|
| 98 |
# Redirect PaddleOCR's stdout to stderr to avoid JSON pollution
|
| 99 |
-
ocr = PaddleOCR(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 100 |
print("PaddleOCR initialized successfully", file=sys.stderr)
|
| 101 |
|
| 102 |
# Check if it's a PDF or image
|
|
@@ -104,7 +111,8 @@ try:
|
|
| 104 |
|
| 105 |
if is_pdf:
|
| 106 |
print("Converting PDF to images for OCR processing...", file=sys.stderr)
|
| 107 |
-
|
|
|
|
| 108 |
temp_files = image_paths
|
| 109 |
|
| 110 |
if not image_paths:
|
|
|
|
| 94 |
# Print progress to stderr (like your local implementation)
|
| 95 |
print(f"Starting OCR processing for: {os.path.basename(file_path)}", file=sys.stderr)
|
| 96 |
|
| 97 |
+
# Initialize PaddleOCR - try different settings for better text detection
|
| 98 |
# Redirect PaddleOCR's stdout to stderr to avoid JSON pollution
|
| 99 |
+
ocr = PaddleOCR(
|
| 100 |
+
use_angle_cls=True,
|
| 101 |
+
lang='en',
|
| 102 |
+
show_log=False,
|
| 103 |
+
det_model_dir=None, # Use default detection model
|
| 104 |
+
rec_model_dir=None, # Use default recognition model
|
| 105 |
+
use_gpu=False # Ensure CPU usage in serverless environment
|
| 106 |
+
)
|
| 107 |
print("PaddleOCR initialized successfully", file=sys.stderr)
|
| 108 |
|
| 109 |
# Check if it's a PDF or image
|
|
|
|
| 111 |
|
| 112 |
if is_pdf:
|
| 113 |
print("Converting PDF to images for OCR processing...", file=sys.stderr)
|
| 114 |
+
# Try lower DPI first to see if it helps
|
| 115 |
+
image_paths = pdf_to_images(file_path, dpi=150) # Reduced from 200
|
| 116 |
temp_files = image_paths
|
| 117 |
|
| 118 |
if not image_paths:
|