Spaces:
Sleeping
Sleeping
File size: 4,481 Bytes
77da9e2 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 |
"""
OCR Handler - OCR-only Processing
This module provides OCR-only functionality that bypasses the full detection pipeline.
Useful for cases where you only need text extraction without RF-DETR/CLIP analysis.
"""
import torch
import cv2
import numpy as np
from PIL import Image
from typing import Union, List, Dict, Tuple
from pathlib import Path
import easyocr
from detection.image_utils import load_image
# Cache of initialized EasyOCR readers, keyed by (languages, gpu).
# Constructing a Reader loads the OCR models, so a reader is built once per
# configuration and reused instead of being rebuilt for every image.
_OCR_READER_CACHE: Dict[Tuple, object] = {}


def _get_ocr_reader(languages: Tuple[str, ...], gpu) -> "easyocr.Reader":
    """Return the cached EasyOCR reader for this configuration, creating it on first use."""
    key = (languages, gpu)
    reader = _OCR_READER_CACHE.get(key)
    if reader is None:
        reader = easyocr.Reader(list(languages), gpu=gpu)
        _OCR_READER_CACHE[key] = reader
    return reader


def process_ocr_only(
    image: Union[str, Path, np.ndarray, Image.Image],
    gpu: Union[bool, None] = None,
    languages: Union[List[str], Tuple[str, ...], None] = None,
) -> List[Dict]:
    """
    Run OCR across the full image and return detections

    This bypasses RF-DETR/CLIP and runs EasyOCR directly on the image.
    The EasyOCR reader is cached per (languages, gpu) configuration, so
    repeated calls do not reload the OCR models.

    Args:
        image: Input image (path, PIL Image, or numpy array)
        gpu: Whether to use GPU. If None, auto-detects CUDA availability.
        languages: EasyOCR language codes to recognize. If None, defaults
            to English and French (["en", "fr"]).

    Returns:
        List of detections with keys:
        - box: Dict with x1, y1, x2, y2 coordinates
        - confidence: OCR confidence score (float)
        - class_id: None (no classification)
        - class_name: "" (no classification)
        - text: Extracted text string
        - description: "" (no description)
    """
    # Load image
    img_array = load_image(image)

    # Resolve defaults before looking up the reader so the cache key is stable
    if gpu is None:
        gpu = torch.cuda.is_available()
    lang_key = ("en", "fr") if languages is None else tuple(languages)

    reader = _get_ocr_reader(lang_key, gpu)

    # Run OCR - detail=1 returns [ [ (x,y)...4 points ], text, conf ]
    ocr_results = reader.readtext(img_array, detail=1)

    # Convert to standard detection format
    detections = []
    for entry in ocr_results:
        # Skip anything that is not a (quad, text, conf) record
        if not isinstance(entry, (list, tuple)) or len(entry) < 3:
            continue

        quad, text, conf = entry[0], entry[1], entry[2]

        # Skip non-string or whitespace-only text
        if not isinstance(text, str) or not text.strip():
            continue

        # Convert quadrilateral to an axis-aligned bounding box; coordinates
        # are truncated to whole pixels but stored as floats
        xs = [p[0] for p in quad]
        ys = [p[1] for p in quad]
        box = {
            "x1": float(int(min(xs))),
            "y1": float(int(min(ys))),
            "x2": float(int(max(xs))),
            "y2": float(int(max(ys))),
        }

        detections.append({
            "box": box,
            # A missing confidence is reported as 1.0
            "confidence": float(conf) if conf is not None else 1.0,
            "class_id": None,
            "class_name": "",
            "text": text.strip(),
            "description": "",
        })

    return detections
def annotate_ocr_detections(
    image: Union[str, Path, np.ndarray, Image.Image],
    detections: List[Dict],
    thickness: int = 2,
    return_format: str = "pil"
) -> Union[Image.Image, np.ndarray]:
    """
    Draw OCR bounding boxes and their recognized text onto an image

    Each detection gets a green rectangle; when it carries non-empty text,
    a filled darker-green label with white text is drawn near the top-left
    corner of the box.

    Args:
        image: Input image (path, PIL Image, or numpy array)
        detections: List of detections from process_ocr_only()
        thickness: Line thickness for bounding boxes
        return_format: "pil" (case-insensitive) for a PIL Image; any other
            value returns a numpy array

    Returns:
        Annotated image as PIL Image or numpy array
    """
    # OpenCV draws on BGR data, so convert the loaded image first
    # (load_image is assumed to return RGB - confirm against detection.image_utils)
    canvas = cv2.cvtColor(load_image(image), cv2.COLOR_RGB2BGR)

    # Drawing parameters shared by every detection
    font = cv2.FONT_HERSHEY_SIMPLEX
    font_scale = 0.5
    font_thickness = 1
    box_color = (0, 255, 0)
    label_bg_color = (0, 180, 0)  # Darker green
    label_fg_color = (255, 255, 255)

    for detection in detections:
        bounds = detection["box"]
        x1, y1, x2, y2 = (int(bounds[key]) for key in ("x1", "y1", "x2", "y2"))

        # Outline of the detected text region
        cv2.rectangle(canvas, (x1, y1), (x2, y2), box_color, thickness)

        label = detection.get("text", "")
        if not label:
            continue

        (label_w, label_h), baseline = cv2.getTextSize(
            label, font, font_scale, font_thickness
        )
        # Place the label above the box, but never above the top of the image
        anchor_y = max(y1 - 10, label_h + 10)

        # Filled background behind the label
        bg_top_left = (x1, anchor_y - label_h - baseline - 4)
        bg_bottom_right = (x1 + label_w + 6, anchor_y + baseline - 4)
        cv2.rectangle(canvas, bg_top_left, bg_bottom_right, label_bg_color, -1)

        # Label text itself
        text_origin = (x1 + 3, anchor_y - baseline - 2)
        cv2.putText(
            canvas, label, text_origin, font, font_scale, label_fg_color, font_thickness
        )

    # Back to RGB for the caller
    annotated = cv2.cvtColor(canvas, cv2.COLOR_BGR2RGB)

    wants_pil = return_format.lower() == "pil"
    return Image.fromarray(annotated) if wants_pil else annotated
|