# Spaces: AI-DrivenTesting / CU1-X (Sleeping)
# File size: 4,481 Bytes
# 77da9e2

"""
OCR Handler - OCR-only Processing

This module provides OCR-only functionality that bypasses the full detection pipeline.
Useful for cases where you only need text extraction without RF-DETR/CLIP analysis.
"""

import torch
import cv2
import numpy as np
from PIL import Image
from typing import Union, List, Dict, Tuple
from pathlib import Path
import easyocr

from detection.image_utils import load_image


# Cache of initialized EasyOCR readers keyed by the ``gpu`` setting.
# Constructing a Reader loads the detection/recognition models, so it is done
# once per setting and reused instead of being repeated on every call.
_OCR_READER_CACHE: Dict[object, object] = {}


def _get_ocr_reader(gpu):
    """Return the cached EasyOCR reader (English + French) for ``gpu``."""
    reader = _OCR_READER_CACHE.get(gpu)
    if reader is None:
        reader = easyocr.Reader(['en', 'fr'], gpu=gpu)
        _OCR_READER_CACHE[gpu] = reader
    return reader


def _quad_to_box(quad):
    """Return the axis-aligned box enclosing ``quad``, or None if unusable."""
    try:
        xs = [p[0] for p in quad]
        ys = [p[1] for p in quad]
        if not xs:
            # No points at all: min()/max() would raise on an empty list.
            return None
        # Coordinates are truncated to whole pixels but stored as floats.
        return {
            "x1": float(int(min(xs))),
            "y1": float(int(min(ys))),
            "x2": float(int(max(xs))),
            "y2": float(int(max(ys))),
        }
    except (TypeError, IndexError, KeyError, ValueError, OverflowError):
        # Malformed point data (non-iterable quad, points with fewer than two
        # values, non-numeric or non-finite coordinates).
        return None


def process_ocr_only(
    image: Union[str, Path, np.ndarray, Image.Image],
    gpu: Union[bool, None] = None
) -> List[Dict]:
    """
    Run OCR across the full image and return detections

    This bypasses RF-DETR/CLIP and runs EasyOCR directly on the image.
    The EasyOCR reader is created on first use and then reused for later
    calls with the same ``gpu`` setting.

    Args:
        image: Input image (path, PIL Image, or numpy array); it is passed
            to ``load_image`` before OCR.
        gpu: Whether to use GPU. If None, auto-detects CUDA availability.

    Returns:
        List of detections with keys:
            - box: Dict with x1, y1, x2, y2 coordinates (whole-pixel floats)
            - confidence: OCR confidence score (float); 1.0 when EasyOCR
              reports no confidence
            - class_id: None (no classification)
            - class_name: "" (no classification)
            - text: Extracted text string, stripped of surrounding whitespace
            - description: "" (no description)

        Entries that are malformed, have empty/whitespace-only text, or have
        an unusable quadrilateral are skipped.
    """
    # Load image
    img_array = load_image(image)

    # Resolve the device, then fetch (or lazily create) the matching reader
    if gpu is None:
        gpu = torch.cuda.is_available()
    reader = _get_ocr_reader(gpu)

    # Run OCR - detail=1 returns [ [ (x,y)...4 points ], text, conf ]
    ocr_results = reader.readtext(img_array, detail=1)

    # Convert to standard detection format
    detections = []
    for entry in ocr_results:
        if not isinstance(entry, (list, tuple)) or len(entry) < 3:
            continue
        quad, text, conf = entry[0], entry[1], entry[2]
        if not isinstance(text, str) or not text.strip():
            continue

        # Convert quadrilateral to bounding box; skip entries without one
        # rather than failing the whole call.
        box = _quad_to_box(quad)
        if box is None:
            continue

        detections.append({
            "box": box,
            "confidence": float(conf) if conf is not None else 1.0,
            "class_id": None,
            "class_name": "",
            "text": text.strip(),
            "description": ""
        })

    return detections


def annotate_ocr_detections(
    image: Union[str, Path, np.ndarray, Image.Image],
    detections: List[Dict],
    thickness: int = 2,
    return_format: str = "pil"
) -> Union[Image.Image, np.ndarray]:
    """
    Draw OCR bounding boxes and their text labels onto an image

    Each detection gets a green rectangle; when it carries non-empty text,
    a filled darker-green label with white text is drawn near the box's
    top-left corner.

    Args:
        image: Input image (path, PIL Image, or numpy array); it is passed
            to ``load_image`` before drawing.
        detections: List of detections from process_ocr_only(); each needs a
            "box" dict with x1, y1, x2, y2 and may carry a "text" string.
        thickness: Line thickness for bounding boxes
        return_format: "pil" (case-insensitive) for PIL Image; any other
            value returns a numpy array

    Returns:
        Annotated image as PIL Image or numpy array
    """
    # Drawing style shared by every label
    font = cv2.FONT_HERSHEY_SIMPLEX
    font_scale = 0.5
    font_thickness = 1
    box_color = (0, 255, 0)
    label_bg_color = (0, 180, 0)  # Darker green
    label_fg_color = (255, 255, 255)

    # OpenCV drawing is done on a BGR copy of the loaded image
    canvas = cv2.cvtColor(load_image(image), cv2.COLOR_RGB2BGR)

    for detection in detections:
        coords = detection["box"]
        left = int(coords["x1"])
        top = int(coords["y1"])
        right = int(coords["x2"])
        bottom = int(coords["y2"])

        # Outline of the detected text region
        cv2.rectangle(canvas, (left, top), (right, bottom), box_color, thickness)

        label = detection.get("text", "")
        if not label:
            continue

        (label_w, label_h), baseline = cv2.getTextSize(
            label, font, font_scale, font_thickness
        )
        # Place the label above the box, but never so high that it would
        # leave the top of the image.
        anchor_y = max(top - 10, label_h + 10)

        # Filled background behind the label
        bg_top_left = (left, anchor_y - label_h - baseline - 4)
        bg_bottom_right = (left + label_w + 6, anchor_y + baseline - 4)
        cv2.rectangle(canvas, bg_top_left, bg_bottom_right, label_bg_color, -1)

        # Label text itself
        text_origin = (left + 3, anchor_y - baseline - 2)
        cv2.putText(
            canvas,
            label,
            text_origin,
            font,
            font_scale,
            label_fg_color,
            font_thickness
        )

    # Back to RGB before handing the image to the caller
    annotated = cv2.cvtColor(canvas, cv2.COLOR_BGR2RGB)

    wants_pil = return_format.lower() == "pil"
    return Image.fromarray(annotated) if wants_pil else annotated