""" Modern ID Card Detector using YOLOv11 (Ultralytics) or D-FINE. Replaces: TFLite EfficientNet (axis-aligned, no rotation awareness) Upgrade path: YOLOv11n → YOLOv11n-OBB → D-FINE-S (fine-tuned) References: - YOLOv11: Ultralytics (2024), 2.6M params, ~2ms on T4 - D-FINE-S: arxiv:2410.13842, 10M params, 3.5ms T4 TRT FP16 - IWPOD-Net: arxiv:2509.06246, 1.8M params, direct 4-corner output """ import numpy as np from dataclasses import dataclass from typing import List, Optional, Tuple import logging logger = logging.getLogger(__name__) @dataclass class Detection: """Single card detection result.""" bbox: np.ndarray # [x1, y1, x2, y2] in pixel coordinates confidence: float # Detection confidence [0, 1] class_id: int = 0 # 0 = id_card corners: Optional[np.ndarray] = None # [4, 2] corner points if available @property def center(self) -> Tuple[float, float]: return ((self.bbox[0] + self.bbox[2]) / 2, (self.bbox[1] + self.bbox[3]) / 2) @property def width(self) -> float: return self.bbox[2] - self.bbox[0] @property def height(self) -> float: return self.bbox[3] - self.bbox[1] @property def area(self) -> float: return self.width * self.height @property def aspect_ratio(self) -> float: return self.width / max(self.height, 1e-6) def crop_from(self, frame: np.ndarray, padding: float = 0.05) -> np.ndarray: """Extract card crop from frame with optional padding.""" h, w = frame.shape[:2] pad_x = int(self.width * padding) pad_y = int(self.height * padding) x1 = max(0, int(self.bbox[0]) - pad_x) y1 = max(0, int(self.bbox[1]) - pad_y) x2 = min(w, int(self.bbox[2]) + pad_x) y2 = min(h, int(self.bbox[3]) + pad_y) return frame[y1:y2, x1:x2].copy() class IDCardDetector: """ ID card detector using YOLOv11 (default) or D-FINE. Improvements over TFLite EfficientNet: - OBB support for rotated cards (YOLOv11-OBB) - Better small object detection - Built-in NMS - Dynamic input sizes - GPU acceleration - Easy fine-tuning with Ultralytics ecosystem """ def __init__(self, config): """ Args: config: DetectorConfig from enhanced/config.py """ self.config = config self.model = None self._load_model() def _load_model(self): """Load the detection model.""" if self.config.backend.value == "yolo11": self._load_yolo() elif self.config.backend.value == "dfine": self._load_dfine() elif self.config.backend.value == "rtdetr": self._load_rtdetr() else: raise ValueError(f"Unknown detector backend: {self.config.backend}") def _load_yolo(self): """Load YOLOv11 model via Ultralytics.""" try: from ultralytics import YOLO self.model = YOLO(self.config.model_path) logger.info(f"Loaded YOLO model from {self.config.model_path}") except ImportError: logger.warning("ultralytics not installed. Run: pip install ultralytics. Using mock detector.") self.model = None except Exception as e: logger.warning(f"Could not load YOLO model: {e}. Using mock detector.") self.model = None def _load_dfine(self): """Load D-FINE model via ONNX Runtime or custom loader.""" try: import onnxruntime as ort self.model = ort.InferenceSession(self.config.model_path) logger.info(f"Loaded D-FINE ONNX model from {self.config.model_path}") except Exception as e: logger.warning(f"Could not load D-FINE model: {e}. Using mock detector.") self.model = None def _load_rtdetr(self): """Load RT-DETR model via HuggingFace transformers.""" try: from transformers import RTDetrForObjectDetection, RTDetrImageProcessor import torch self.processor = RTDetrImageProcessor.from_pretrained("PekingU/rtdetr_r50vd") self.model = RTDetrForObjectDetection.from_pretrained("PekingU/rtdetr_r50vd") self.model.eval() logger.info("Loaded RT-DETR model from PekingU/rtdetr_r50vd") except Exception as e: logger.warning(f"Could not load RT-DETR: {e}. Using mock detector.") self.model = None def detect(self, frame: np.ndarray) -> List[Detection]: """ Detect ID cards in a frame. Args: frame: BGR image as numpy array (H, W, 3) Returns: List of Detection objects """ if self.model is None: return self._mock_detect(frame) if self.config.backend.value == "yolo11": return self._detect_yolo(frame) elif self.config.backend.value == "rtdetr": return self._detect_rtdetr(frame) else: return self._mock_detect(frame) def _detect_yolo(self, frame: np.ndarray) -> List[Detection]: """Run YOLO detection.""" results = self.model( frame, conf=0.30, iou=self.config.nms_iou_threshold, imgsz=640, verbose=False, ) detections = [] for r in results: if r.boxes is None: continue for box in r.boxes: bbox = box.xyxy[0].cpu().numpy() conf = float(box.conf[0]) cls_id = int(box.cls[0]) det = Detection(bbox=bbox, confidence=conf, class_id=cls_id) # Log de debug pour voir ce que YOLO trouve avant le filtre géométrique print(f"--- [DEBUG YOLO] Carte trouvée avec confiance: {conf:.2f} | Bbox: {bbox}") # Apply aspect ratio and area filters if self._validate_detection(det, frame.shape): detections.append(det) else: print(f"--- [DEBUG FILTER] Carte REJETÉE par les filtres géométriques (Taille/Ratio)") print(f"--- [DEBUG FINAL] Nombre de cartes retenues après filtres : {len(detections)}") return detections def _detect_rtdetr(self, frame: np.ndarray) -> List[Detection]: """Run RT-DETR detection via HuggingFace transformers.""" import torch from PIL import Image image = Image.fromarray(frame[:, :, ::-1]) # BGR to RGB inputs = self.processor(images=image, return_tensors="pt") with torch.no_grad(): outputs = self.model(**inputs) target_sizes = torch.tensor([(frame.shape[0], frame.shape[1])]) results = self.processor.post_process_object_detection( outputs, target_sizes=target_sizes, threshold=self.config.confidence_threshold ) detections = [] for result in results: for score, label, box in zip(result["scores"], result["labels"], result["boxes"]): bbox = box.cpu().numpy() det = Detection( bbox=bbox, confidence=float(score), class_id=int(label), ) if self._validate_detection(det, frame.shape): detections.append(det) return detections def _validate_detection(self, det: Detection, frame_shape: Tuple[int, ...]) -> bool: """ Validate a detection using geometric constraints. More permissive than old system (old: 1.2-2.0 aspect ratio). Now handles perspective-distorted cards (apparent aspect ratio varies widely). """ h, w = frame_shape[:2] frame_area = h * w if det.width < 40 or det.height < 40: print(f" -> Échec: Carte trop petite ({det.width:.1f}x{det.height:.1f})") return False # Area check area_ratio = det.area / frame_area if area_ratio < 0.05: print(f" -> Échec: La carte occupe trop peu de place sur la photo ({area_ratio*100:.1f}%)") return False # Aspect ratio check (more permissive for tilted/perspective cards) ar = det.aspect_ratio inv_ar = 1.0 / max(ar, 1e-6) effective_ar = max(ar, inv_ar) # Handle both landscape and portrait print(f" -> Stats géométriques : Aspect Ratio effectif = {effective_ar:.2f} | Ratio Aire = {area_ratio*100:.1f}%") if effective_ar < 1.0 or effective_ar > 2.3: print(f" -> Échec: Aspect Ratio hors limites ({effective_ar:.2f})") return False return True def _mock_detect(self, frame: np.ndarray) -> List[Detection]: """ Mock detector for testing without a real model. Generates a reasonable detection in the center of the frame. """ h, w = frame.shape[:2] # Simulate a card detection in the center card_w = w * 0.4 card_h = card_w / 1.585 # ISO/IEC 7810 ratio cx, cy = w / 2, h / 2 return [Detection( bbox=np.array([cx - card_w/2, cy - card_h/2, cx + card_w/2, cy + card_h/2]), confidence=0.85, class_id=0, )]