Spaces:
Running
Running
| """ | |
| Modern ID Card Detector using YOLOv11 (Ultralytics) or D-FINE. | |
| Replaces: TFLite EfficientNet (axis-aligned, no rotation awareness) | |
| Upgrade path: YOLOv11n → YOLOv11n-OBB → D-FINE-S (fine-tuned) | |
| References: | |
| - YOLOv11: Ultralytics (2024), 2.6M params, ~2ms on T4 | |
| - D-FINE-S: arxiv:2410.13842, 10M params, 3.5ms T4 TRT FP16 | |
| - IWPOD-Net: arxiv:2509.06246, 1.8M params, direct 4-corner output | |
| """ | |
| import numpy as np | |
| from dataclasses import dataclass | |
| from typing import List, Optional, Tuple | |
| import logging | |
| logger = logging.getLogger(__name__) | |
| class Detection: | |
| """Single card detection result.""" | |
| bbox: np.ndarray # [x1, y1, x2, y2] in pixel coordinates | |
| confidence: float # Detection confidence [0, 1] | |
| class_id: int = 0 # 0 = id_card | |
| corners: Optional[np.ndarray] = None # [4, 2] corner points if available | |
| def center(self) -> Tuple[float, float]: | |
| return ((self.bbox[0] + self.bbox[2]) / 2, (self.bbox[1] + self.bbox[3]) / 2) | |
| def width(self) -> float: | |
| return self.bbox[2] - self.bbox[0] | |
| def height(self) -> float: | |
| return self.bbox[3] - self.bbox[1] | |
| def area(self) -> float: | |
| return self.width * self.height | |
| def aspect_ratio(self) -> float: | |
| return self.width / max(self.height, 1e-6) | |
| def crop_from(self, frame: np.ndarray, padding: float = 0.05) -> np.ndarray: | |
| """Extract card crop from frame with optional padding.""" | |
| h, w = frame.shape[:2] | |
| pad_x = int(self.width * padding) | |
| pad_y = int(self.height * padding) | |
| x1 = max(0, int(self.bbox[0]) - pad_x) | |
| y1 = max(0, int(self.bbox[1]) - pad_y) | |
| x2 = min(w, int(self.bbox[2]) + pad_x) | |
| y2 = min(h, int(self.bbox[3]) + pad_y) | |
| return frame[y1:y2, x1:x2].copy() | |
| class IDCardDetector: | |
| """ | |
| ID card detector using YOLOv11 (default) or D-FINE. | |
| Improvements over TFLite EfficientNet: | |
| - OBB support for rotated cards (YOLOv11-OBB) | |
| - Better small object detection | |
| - Built-in NMS | |
| - Dynamic input sizes | |
| - GPU acceleration | |
| - Easy fine-tuning with Ultralytics ecosystem | |
| """ | |
| def __init__(self, config): | |
| """ | |
| Args: | |
| config: DetectorConfig from enhanced/config.py | |
| """ | |
| self.config = config | |
| self.model = None | |
| self._load_model() | |
| def _load_model(self): | |
| """Load the detection model.""" | |
| if self.config.backend.value == "yolo11": | |
| self._load_yolo() | |
| elif self.config.backend.value == "dfine": | |
| self._load_dfine() | |
| elif self.config.backend.value == "rtdetr": | |
| self._load_rtdetr() | |
| else: | |
| raise ValueError(f"Unknown detector backend: {self.config.backend}") | |
| def _load_yolo(self): | |
| """Load YOLOv11 model via Ultralytics.""" | |
| try: | |
| from ultralytics import YOLO | |
| self.model = YOLO(self.config.model_path) | |
| logger.info(f"Loaded YOLO model from {self.config.model_path}") | |
| except ImportError: | |
| logger.warning("ultralytics not installed. Run: pip install ultralytics. Using mock detector.") | |
| self.model = None | |
| except Exception as e: | |
| logger.warning(f"Could not load YOLO model: {e}. Using mock detector.") | |
| self.model = None | |
| def _load_dfine(self): | |
| """Load D-FINE model via ONNX Runtime or custom loader.""" | |
| try: | |
| import onnxruntime as ort | |
| self.model = ort.InferenceSession(self.config.model_path) | |
| logger.info(f"Loaded D-FINE ONNX model from {self.config.model_path}") | |
| except Exception as e: | |
| logger.warning(f"Could not load D-FINE model: {e}. Using mock detector.") | |
| self.model = None | |
| def _load_rtdetr(self): | |
| """Load RT-DETR model via HuggingFace transformers.""" | |
| try: | |
| from transformers import RTDetrForObjectDetection, RTDetrImageProcessor | |
| import torch | |
| self.processor = RTDetrImageProcessor.from_pretrained("PekingU/rtdetr_r50vd") | |
| self.model = RTDetrForObjectDetection.from_pretrained("PekingU/rtdetr_r50vd") | |
| self.model.eval() | |
| logger.info("Loaded RT-DETR model from PekingU/rtdetr_r50vd") | |
| except Exception as e: | |
| logger.warning(f"Could not load RT-DETR: {e}. Using mock detector.") | |
| self.model = None | |
| def detect(self, frame: np.ndarray) -> List[Detection]: | |
| """ | |
| Detect ID cards in a frame. | |
| Args: | |
| frame: BGR image as numpy array (H, W, 3) | |
| Returns: | |
| List of Detection objects | |
| """ | |
| if self.model is None: | |
| return self._mock_detect(frame) | |
| if self.config.backend.value == "yolo11": | |
| return self._detect_yolo(frame) | |
| elif self.config.backend.value == "rtdetr": | |
| return self._detect_rtdetr(frame) | |
| else: | |
| return self._mock_detect(frame) | |
| def _detect_yolo(self, frame: np.ndarray) -> List[Detection]: | |
| """Run YOLO detection.""" | |
| results = self.model( | |
| frame, | |
| conf=0.30, | |
| iou=self.config.nms_iou_threshold, | |
| imgsz=640, | |
| verbose=False, | |
| ) | |
| detections = [] | |
| for r in results: | |
| if r.boxes is None: | |
| continue | |
| for box in r.boxes: | |
| bbox = box.xyxy[0].cpu().numpy() | |
| conf = float(box.conf[0]) | |
| cls_id = int(box.cls[0]) | |
| det = Detection(bbox=bbox, confidence=conf, class_id=cls_id) | |
| # Log de debug pour voir ce que YOLO trouve avant le filtre géométrique | |
| print(f"--- [DEBUG YOLO] Carte trouvée avec confiance: {conf:.2f} | Bbox: {bbox}") | |
| # Apply aspect ratio and area filters | |
| if self._validate_detection(det, frame.shape): | |
| detections.append(det) | |
| else: | |
| print(f"--- [DEBUG FILTER] Carte REJETÉE par les filtres géométriques (Taille/Ratio)") | |
| print(f"--- [DEBUG FINAL] Nombre de cartes retenues après filtres : {len(detections)}") | |
| return detections | |
| def _detect_rtdetr(self, frame: np.ndarray) -> List[Detection]: | |
| """Run RT-DETR detection via HuggingFace transformers.""" | |
| import torch | |
| from PIL import Image | |
| image = Image.fromarray(frame[:, :, ::-1]) # BGR to RGB | |
| inputs = self.processor(images=image, return_tensors="pt") | |
| with torch.no_grad(): | |
| outputs = self.model(**inputs) | |
| target_sizes = torch.tensor([(frame.shape[0], frame.shape[1])]) | |
| results = self.processor.post_process_object_detection( | |
| outputs, target_sizes=target_sizes, threshold=self.config.confidence_threshold | |
| ) | |
| detections = [] | |
| for result in results: | |
| for score, label, box in zip(result["scores"], result["labels"], result["boxes"]): | |
| bbox = box.cpu().numpy() | |
| det = Detection( | |
| bbox=bbox, | |
| confidence=float(score), | |
| class_id=int(label), | |
| ) | |
| if self._validate_detection(det, frame.shape): | |
| detections.append(det) | |
| return detections | |
| def _validate_detection(self, det: Detection, frame_shape: Tuple[int, ...]) -> bool: | |
| """ | |
| Validate a detection using geometric constraints. | |
| More permissive than old system (old: 1.2-2.0 aspect ratio). | |
| Now handles perspective-distorted cards (apparent aspect ratio varies widely). | |
| """ | |
| h, w = frame_shape[:2] | |
| frame_area = h * w | |
| if det.width < 40 or det.height < 40: | |
| print(f" -> Échec: Carte trop petite ({det.width:.1f}x{det.height:.1f})") | |
| return False | |
| # Area check | |
| area_ratio = det.area / frame_area | |
| if area_ratio < 0.05: | |
| print(f" -> Échec: La carte occupe trop peu de place sur la photo ({area_ratio*100:.1f}%)") | |
| return False | |
| # Aspect ratio check (more permissive for tilted/perspective cards) | |
| ar = det.aspect_ratio | |
| inv_ar = 1.0 / max(ar, 1e-6) | |
| effective_ar = max(ar, inv_ar) # Handle both landscape and portrait | |
| print(f" -> Stats géométriques : Aspect Ratio effectif = {effective_ar:.2f} | Ratio Aire = {area_ratio*100:.1f}%") | |
| if effective_ar < 1.0 or effective_ar > 2.3: | |
| print(f" -> Échec: Aspect Ratio hors limites ({effective_ar:.2f})") | |
| return False | |
| return True | |
| def _mock_detect(self, frame: np.ndarray) -> List[Detection]: | |
| """ | |
| Mock detector for testing without a real model. | |
| Generates a reasonable detection in the center of the frame. | |
| """ | |
| h, w = frame.shape[:2] | |
| # Simulate a card detection in the center | |
| card_w = w * 0.4 | |
| card_h = card_w / 1.585 # ISO/IEC 7810 ratio | |
| cx, cy = w / 2, h / 2 | |
| return [Detection( | |
| bbox=np.array([cx - card_w/2, cy - card_h/2, cx + card_w/2, cy + card_h/2]), | |
| confidence=0.85, | |
| class_id=0, | |
| )] | |