"""
OCR Handler - OCR-only Processing
This module provides OCR-only functionality that bypasses the full detection pipeline.
Useful for cases where you only need text extraction without RF-DETR/CLIP analysis.
"""
import torch
import cv2
import numpy as np
from PIL import Image
from typing import Union, List, Dict, Optional
from pathlib import Path
import easyocr
from detection.image_utils import load_image


def process_ocr_only(
image: Union[str, Path, np.ndarray, Image.Image],
    gpu: Optional[bool] = None
) -> List[Dict]:
"""
Run OCR across the full image and return detections
This bypasses RF-DETR/CLIP and runs EasyOCR directly on the image.
Args:
image: Input image (path, PIL Image, or numpy array)
gpu: Whether to use GPU. If None, auto-detects CUDA availability.
Returns:
List of detections with keys:
- box: Dict with x1, y1, x2, y2 coordinates
- confidence: OCR confidence score (float)
- class_id: None (no classification)
- class_name: "" (no classification)
- text: Extracted text string
- description: "" (no description)
"""
# Load image
img_array = load_image(image)
    # Initialize OCR reader. Note that the reader is recreated on every call;
    # cache it externally if calling repeatedly. Languages are fixed to
    # English and French.
    if gpu is None:
        gpu = torch.cuda.is_available()
    reader = easyocr.Reader(['en', 'fr'], gpu=gpu)
    # Run OCR - detail=1 yields (quad, text, confidence) entries, where quad
    # is the four (x, y) corner points of the detected text region
    ocr_results = reader.readtext(img_array, detail=1)
# Convert to standard detection format
detections = []
for entry in ocr_results:
if not isinstance(entry, (list, tuple)) or len(entry) < 3:
continue
quad, text, conf = entry[0], entry[1], entry[2]
if not isinstance(text, str) or not text.strip():
continue
        # Convert quadrilateral to an axis-aligned bounding box. Keep the raw
        # float coordinates; truncating the max values to int would shrink
        # the right/bottom edge of the box.
        xs = [p[0] for p in quad]
        ys = [p[1] for p in quad]
        box = {
            "x1": float(min(xs)),
            "y1": float(min(ys)),
            "x2": float(max(xs)),
            "y2": float(max(ys))
        }
detections.append({
"box": box,
"confidence": float(conf) if conf is not None else 1.0,
"class_id": None,
"class_name": "",
"text": text.strip(),
"description": ""
})
return detections
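

# Example usage (sketch): extracting text from a screenshot. The file name
# below is hypothetical; process_ocr_only() accepts a path, PIL Image, or
# numpy array.
#
#     detections = process_ocr_only("screenshot.png")
#     for det in detections:
#         print(f"{det['confidence']:.2f}  {det['text']}")

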
def annotate_ocr_detections(
image: Union[str, Path, np.ndarray, Image.Image],
detections: List[Dict],
thickness: int = 2,
return_format: str = "pil"
) -> Union[Image.Image, np.ndarray]:
"""
Annotate image with OCR detection boxes and text labels
Args:
image: Input image (path, PIL Image, or numpy array)
detections: List of detections from process_ocr_only()
thickness: Line thickness for bounding boxes
return_format: "pil" for PIL Image or "numpy" for numpy array
Returns:
Annotated image as PIL Image or numpy array
"""
# Load image
img_array = load_image(image)
# Convert to BGR for OpenCV
img_bgr = cv2.cvtColor(img_array, cv2.COLOR_RGB2BGR)
# Draw each detection
for det in detections:
x1 = int(det["box"]["x1"])
y1 = int(det["box"]["y1"])
x2 = int(det["box"]["x2"])
y2 = int(det["box"]["y2"])
# Draw bounding box
cv2.rectangle(img_bgr, (x1, y1), (x2, y2), (0, 255, 0), thickness)
# Draw text label
text = det.get("text", "")
if text:
            (tw, th), bl = cv2.getTextSize(text, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 1)
            # Place the label above the box, clamped so it stays visible
            # when the box touches the top edge of the image
            ty = max(y1 - 10, th + 10)
# Draw text background
cv2.rectangle(
img_bgr,
(x1, ty - th - bl - 4),
(x1 + tw + 6, ty + bl - 4),
(0, 180, 0), # Darker green
-1
)
# Draw text
cv2.putText(
img_bgr,
text,
(x1 + 3, ty - bl - 2),
cv2.FONT_HERSHEY_SIMPLEX,
0.5,
(255, 255, 255),
1
)
# Convert back to RGB
img_rgb = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB)
# Return in requested format
if return_format.lower() == "pil":
return Image.fromarray(img_rgb)
else:
return img_rgb
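

# Minimal end-to-end sketch, assuming this module is run from the project
# root and that "sample.png" (a hypothetical test image) exists. It runs
# OCR-only detection, prints the extracted text, and saves an annotated copy
# of the image.
if __name__ == "__main__":
    detections = process_ocr_only("sample.png")
    for det in detections:
        print(f"[{det['confidence']:.2f}] {det['text']}")
    annotated = annotate_ocr_detections("sample.png", detections)
    annotated.save("sample_annotated.png")  # return_format="pil" is the default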