"""
OCR Handler - OCR-only Processing

This module provides OCR-only functionality that bypasses the full detection
pipeline. Useful for cases where you only need text extraction without
RF-DETR/CLIP analysis.
"""

from functools import lru_cache

import torch
import cv2
import numpy as np
from PIL import Image
from typing import Union, List, Dict, Optional, Tuple
from pathlib import Path
import easyocr

from detection.image_utils import load_image

# Languages the EasyOCR reader is initialised with.
_OCR_LANGUAGES = ("en", "fr")

# Drawing parameters used by annotate_ocr_detections(). Colours are applied
# to the BGR working copy of the image.
_BOX_COLOR = (0, 255, 0)
_LABEL_BG_COLOR = (0, 180, 0)  # Darker green
_LABEL_TEXT_COLOR = (255, 255, 255)
_LABEL_FONT = cv2.FONT_HERSHEY_SIMPLEX
_LABEL_SCALE = 0.5


@lru_cache(maxsize=2)
def _get_reader(gpu):
    """Return a cached EasyOCR reader for the given ``gpu`` setting.

    Constructing ``easyocr.Reader`` loads the recognition models, so the
    reader is created once per distinct ``gpu`` value and then reused by
    later calls instead of being rebuilt on every invocation.
    """
    return easyocr.Reader(list(_OCR_LANGUAGES), gpu=gpu)


def _quad_to_box(quad) -> Dict[str, float]:
    """Convert a sequence of (x, y) points into an axis-aligned box dict."""
    xs = [point[0] for point in quad]
    ys = [point[1] for point in quad]
    # Coordinates are truncated to whole pixels, then stored as floats.
    return {
        "x1": float(int(min(xs))),
        "y1": float(int(min(ys))),
        "x2": float(int(max(xs))),
        "y2": float(int(max(ys))),
    }


def process_ocr_only(
    image: Union[str, Path, np.ndarray, Image.Image],
    gpu: Optional[bool] = None
) -> List[Dict]:
    """
    Run OCR across the full image and return detections

    This bypasses RF-DETR/CLIP and runs EasyOCR directly on the image.

    Args:
        image: Input image (path, PIL Image, or numpy array)
        gpu: Whether to use GPU. If None, auto-detects CUDA availability.

    Returns:
        List of detections with keys:
        - box: Dict with x1, y1, x2, y2 coordinates
        - confidence: OCR confidence score (float); 1.0 when EasyOCR
          reports no confidence
        - class_id: None (no classification)
        - class_name: "" (no classification)
        - text: Extracted text string (stripped of surrounding whitespace)
        - description: "" (no description)
    """
    # Load image
    img_array = load_image(image)

    # Resolve the device, then fetch the (cached) OCR reader
    if gpu is None:
        gpu = torch.cuda.is_available()
    reader = _get_reader(gpu)

    # Run OCR - detail=1 returns [ [ (x,y)...4 points ], text, conf ]
    ocr_results = reader.readtext(img_array, detail=1)

    # Convert to standard detection format
    detections = []
    for entry in ocr_results:
        # Skip anything that does not look like a (quad, text, conf) record
        if not isinstance(entry, (list, tuple)) or len(entry) < 3:
            continue

        quad, text, conf = entry[0], entry[1], entry[2]

        # Skip non-string, empty or whitespace-only results
        if not isinstance(text, str) or not text.strip():
            continue

        detections.append({
            "box": _quad_to_box(quad),
            "confidence": float(conf) if conf is not None else 1.0,
            "class_id": None,
            "class_name": "",
            "text": text.strip(),
            "description": ""
        })

    return detections


def annotate_ocr_detections(
    image: Union[str, Path, np.ndarray, Image.Image],
    detections: List[Dict],
    thickness: int = 2,
    return_format: str = "pil"
) -> Union[Image.Image, np.ndarray]:
    """
    Annotate image with OCR detection boxes and text labels

    Args:
        image: Input image (path, PIL Image, or numpy array)
        detections: List of detections from process_ocr_only()
        thickness: Line thickness for bounding boxes
        return_format: "pil" (case-insensitive) for PIL Image; any other
            value returns a numpy array

    Returns:
        Annotated image as PIL Image or numpy array
    """
    # Load image
    img_array = load_image(image)

    # Convert to BGR for OpenCV
    # NOTE(review): assumes load_image() yields a 3-channel RGB array - confirm.
    img_bgr = cv2.cvtColor(img_array, cv2.COLOR_RGB2BGR)

    # Draw each detection
    for det in detections:
        box = det["box"]
        x1, y1 = int(box["x1"]), int(box["y1"])
        x2, y2 = int(box["x2"]), int(box["y2"])

        # Draw bounding box
        cv2.rectangle(img_bgr, (x1, y1), (x2, y2), _BOX_COLOR, thickness)

        # Draw text label (only when there is text to show)
        text = det.get("text", "")
        if not text:
            continue

        # NOTE(review): OpenCV's Hershey fonts appear to render characters
        # they do not cover (e.g. accented letters) as '?' - confirm whether
        # that matters for the French text this module recognises.
        (tw, th), bl = cv2.getTextSize(text, _LABEL_FONT, _LABEL_SCALE, 1)

        # Place the label above the box, but never above the top of the image
        ty = max(y1 - 10, th + 10)

        # Draw text background
        cv2.rectangle(
            img_bgr,
            (x1, ty - th - bl - 4),
            (x1 + tw + 6, ty + bl - 4),
            _LABEL_BG_COLOR,
            -1
        )

        # Draw text
        cv2.putText(
            img_bgr,
            text,
            (x1 + 3, ty - bl - 2),
            _LABEL_FONT,
            _LABEL_SCALE,
            _LABEL_TEXT_COLOR,
            1
        )

    # Convert back to RGB
    img_rgb = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB)

    # Return in requested format
    if return_format.lower() == "pil":
        return Image.fromarray(img_rgb)
    return img_rgb