Spaces:

mbuck17
/

paddleocr-processor

Sleeping

File size: 3,500 Bytes

#!/usr/bin/env python3
# minimal_test_paddle.py - Minimal test to isolate the OCR issue

import sys
import os
import json
import fitz
from paddleocr import PaddleOCR

def _detection_text(detection):
    """Return ``(text, confidence)`` for one OCR detection, or ``None``.

    The second element of *detection* is treated as a ``(text, confidence)``
    pair. If it is not a list/tuple it is stringified and given confidence
    1.0; if the confidence slot is missing, 1.0 is used as well. Detections
    with fewer than two elements, or with an empty pair, yield ``None`` so
    the caller can skip them instead of crashing on an index error.
    """
    if len(detection) < 2:
        return None
    payload = detection[1]
    if not isinstance(payload, (list, tuple)):
        return str(payload), 1.0
    if not payload:
        return None
    confidence = payload[1] if len(payload) > 1 else 1.0
    # str() because the text slot is not guaranteed to hold a string.
    return str(payload[0]), confidence


def test_ocr():
    """Run a step-by-step OCR smoke test on the PDF named in ``sys.argv[1]``.

    Steps: open the PDF, render its first page to a PNG at 150 DPI, initialize
    PaddleOCR, and run OCR on the rendered image. Progress and diagnostics go
    to stderr; exactly one JSON object is written to stdout on every path:

    * success: ``{"success": true, "text": ..., "detections": ...}`` where
      ``text`` is each detected string followed by a newline and
      ``detections`` is the number of entries in the first page's result;
    * failure: ``{"success": false, "error": ...}``.

    Returns:
        None. All exceptions are caught and reported as a failure JSON.
    """
    # Function-scope imports keep this block self-contained.
    import tempfile
    import traceback

    if len(sys.argv) < 2:
        print(json.dumps({"success": False, "error": "No file path provided"}))
        return

    file_path = sys.argv[1]

    try:
        print(f"Testing OCR on: {file_path}", file=sys.stderr)

        # A private per-run directory avoids collisions between concurrent
        # runs and guarantees the image is removed even when a step raises.
        with tempfile.TemporaryDirectory(prefix="paddle_test_") as work_dir:
            temp_img = os.path.join(work_dir, "test_page.png")

            # Test 1: Can we open the PDF?
            print("Opening PDF...", file=sys.stderr)
            doc = fitz.open(file_path)
            try:
                print(f"PDF has {len(doc)} pages", file=sys.stderr)
                if len(doc) == 0:
                    raise ValueError("PDF has no pages")

                # Test 2: Convert first page to image
                print("Converting first page to image...", file=sys.stderr)
                mat = fitz.Matrix(150 / 72, 150 / 72)  # 72 pt/inch -> 150 DPI
                pix = doc[0].get_pixmap(matrix=mat)
                pix.save(temp_img)
                width, height = pix.width, pix.height
            finally:
                # Close the document even if rendering failed.
                doc.close()

            if not os.path.exists(temp_img):
                # Raise so the caller still receives a JSON error on stdout.
                raise RuntimeError("Failed to create image")

            img_size = os.path.getsize(temp_img)
            print(
                f"Image created: {temp_img} "
                f"(size: {img_size} bytes, {width}x{height})",
                file=sys.stderr,
            )

            # Test 3: Initialize OCR
            print("Initializing OCR...", file=sys.stderr)
            ocr = PaddleOCR(use_angle_cls=True, lang='en', show_log=False)
            print("OCR initialized", file=sys.stderr)

            # Test 4: Run OCR on the image
            print("Running OCR...", file=sys.stderr)
            result = ocr.ocr(temp_img, cls=True)

        print(f"OCR result type: {type(result)}", file=sys.stderr)
        if result:
            print(f"Result length: {len(result)}", file=sys.stderr)
            if result[0]:
                print(
                    f"First page has {len(result[0])} detections",
                    file=sys.stderr,
                )
            else:
                print("First page result is empty", file=sys.stderr)
        else:
            print("OCR returned None", file=sys.stderr)

        # Only the first entry of the result is inspected.
        detections = result[0] if result and result[0] else []

        texts = []
        for index, detection in enumerate(detections):
            parsed = _detection_text(detection)
            if parsed is None:
                continue
            text, conf = parsed
            print(
                f"Detection {index}: '{text}' (confidence: {conf})",
                file=sys.stderr,
            )
            texts.append(text)

        print(json.dumps({
            "success": True,
            "text": "".join(f"{text}\n" for text in texts),
            "detections": len(detections),
        }))

    except Exception as e:
        # Top-level boundary of a diagnostic script: report, don't crash.
        print(f"Error: {e}", file=sys.stderr)
        traceback.print_exc(file=sys.stderr)
        print(json.dumps({"success": False, "error": str(e)}))

# Script entry point: run the OCR smoke test only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    test_ocr()