Spaces:

VietCat
/

TrafficSignDetector

Sleeping

VietCat committed on Dec 29, 2025

Commit

f489532

1 Parent(s): f98ab8d

Implement tiling strategy for high-resolution object detection

- Add _create_tiles() to split image into overlapping tiles (20% overlap)
- Add _select_standard_size() to choose the smallest standard size (640/960/1024) that fits the tile, falling back to 1024 for larger tiles
- Add _resize_to_standard() for letterbox preprocessing of tiles
- Add _merge_detections() to deduplicate detections from overlapping regions using NMS
- Refactor detect() method to process each tile separately then merge results
- Transform bounding boxes from tile space back to original image space
- Ensures maximum input resolution while maintaining accuracy

Files changed (1) hide show

model.py +245 -130

model.py CHANGED Viewed

@@ -5,6 +5,7 @@ import yaml
 from huggingface_hub import hf_hub_download
 import os
 import torch
 class TrafficSignDetector:
     def __init__(self, config_path):
@@ -106,17 +107,87 @@ class TrafficSignDetector:
         print("="*80 + "\n")
-    def _ensure_square(self, image, target_size=640):
         """
-        Adjust image to square while maintaining aspect ratio.
-        - If image is smaller: pad to target_size x target_size
-        - If image is larger: resize down to target_size x target_size
-        Letterbox padding is added to preserve aspect ratio.
         :param image: input image (numpy array)
-        :param target_size: target size (default 640x640)
-        :return: square image (target_size x target_size)
         """
         height, width = image.shape[:2]
         max_dim = max(width, height)
         # Scale to fit target while maintaining aspect ratio
@@ -127,18 +198,23 @@ class TrafficSignDetector:
         new_height = int(height * scale)
         # Resize image
-        resized = cv2.resize(image, (new_width, new_height), interpolation=cv2.INTER_LINEAR)
-        # Create canvas and place resized image
         canvas = np.full((target_size, target_size, 3), (114, 114, 114), dtype=np.uint8)
         pad_x = (target_size - new_width) // 2
         pad_y = (target_size - new_height) // 2
         canvas[pad_y:pad_y + new_height, pad_x:pad_x + new_width] = resized
-        print(f"Original: {image.shape} → Scale: {scale:.3f} → Resized: {resized.shape} → Final: {canvas.shape}")
         return canvas, scale, pad_x, pad_y
     def _preprocess(self, image):
         """
         Preprocess image: keep uint8 format as YOLO expects.
@@ -149,9 +225,67 @@ class TrafficSignDetector:
         print(f"Image format: {image.dtype}, Min: {image.min()}, Max: {image.max()}, Mean: {image.mean():.1f}")
         return image
     def detect(self, image, confidence_threshold=None):
         """
-        Perform inference on the image and draw bounding boxes.
         :param image: numpy array of the image
         :param confidence_threshold: optional override for confidence threshold
         :return: tuple of (image with drawn bounding boxes, preprocessed image for visualization)
@@ -161,143 +295,124 @@ class TrafficSignDetector:
             confidence_threshold = self.conf_threshold
         else:
             confidence_threshold = float(confidence_threshold)
         print(f"\n{'='*80}")
-        print(f"DETECTION PIPELINE START")
         print(f"{'='*80}")
         print(f"[STEP 1] INPUT IMAGE")
         print(f"  - Shape: {image.shape}")
         print(f"  - dtype: {image.dtype}")
         print(f"  - Range: [{image.min()}, {image.max()}]")
-        print(f"  - Mean: {image.mean():.2f}, Std: {image.std():.2f}")
-        # Store original image for drawing (uint8)
         original_image = image.copy()
-        # Apply letterbox preprocessing to ensure 640x640 matching training size
-        # Returns both processed image and transformation info
-        print(f"\n[STEP 2] LETTERBOX PREPROCESSING")
-        image, scale, pad_x, pad_y = self._ensure_square(image, target_size=640)
-        print(f"  - Letterboxed shape: {image.shape}")
-        print(f"  - Scale factor: {scale:.3f}")
-        print(f"  - Padding X: {pad_x}, Y: {pad_y}")
-        # Warning if scale is too small (objects might be too small to detect)
-        if scale < 0.5:
-            print(f"  ⚠️  WARNING: Scale factor < 0.5 - objects may be too small!")
-            print(f"     Original size: {original_image.shape[:2]} → Resized: {int(original_image.shape[1]*scale)}x{int(original_image.shape[0]*scale)}")
-        # Normalize pixel values for inference
-        print(f"\n[STEP 3] IMAGE NORMALIZATION")
-        image = self._preprocess(image)
-        # Store preprocessed image for visualization (convert back to 0-255 for display)
-        preprocessed_display = (image * 255).astype(np.uint8) if image.max() <= 1.0 else image.copy()
-        # Use imgsz=640 to match training size
-        # Use iou_threshold for NMS (Non-Maximum Suppression) to remove overlapping boxes
-        print(f"\n[STEP 4] MODEL INFERENCE")
-        print(f"  - Input shape to model: {image.shape}")
-        print(f"  - Confidence threshold: {confidence_threshold}")
-        print(f"  - IOU threshold: 0.55")
-        # Run with conf=0.0 to get raw predictions (before filtering)
-        results_raw = self.model(image, conf=0.0, imgsz=640, iou=0.55)
-        raw_box_count = len(results_raw[0].boxes) if results_raw else 0
-        print(f"  - Raw detections (conf=0.0): {raw_box_count}")
-        if results_raw and len(results_raw[0].boxes) > 0:
-            all_raw_confs = [float(box.conf[0]) for box in results_raw[0].boxes]
-            # Get top 5 with class names
-            boxes_with_conf = [(float(box.conf[0]), int(box.cls[0].cpu().numpy())) for box in results_raw[0].boxes]
-            top_5 = sorted(boxes_with_conf, key=lambda x: x[0], reverse=True)[:5]
-            top_5_str = [f"{c:.6f} ({self.classes[cls]})" for c, cls in top_5]
-            print(f"  - Top 5 raw confidences: {top_5_str}")
-            print(f"  - Confidence stats: min={min(all_raw_confs):.6f}, max={max(all_raw_confs):.6f}, mean={np.mean(all_raw_confs):.6f}")
-            print(f"  - Confidences > 0.01: {sum(1 for c in all_raw_confs if c > 0.01)}")
-            print(f"  - Confidences > 0.001: {sum(1 for c in all_raw_confs if c > 0.001)}")
-            print(f"  - Confidences > 0.0001: {sum(1 for c in all_raw_confs if c > 0.0001)}")
-        # Now run with actual threshold
-        results = self.model(image, conf=confidence_threshold, imgsz=640, iou=0.55)
-        print(f"  - Filtered detections (conf={confidence_threshold}): {len(results)}")
-        # Get original dimensions for coordinate transformation
-        orig_h, orig_w = original_image.shape[:2]
-        print(f"\n[STEP 5] DETECTION RESULTS")
-        for result in results:
-            boxes = result.boxes
-            print(f"  - Total boxes after NMS (confidence >= {self.conf_threshold}): {len(boxes)}")
-            # Debug: print all raw predictions before NMS
-            if hasattr(result, 'boxes') and len(result.boxes) == 0:
-                print(f"  - Note: Model raw output available but filtered by NMS/confidence")
-                if hasattr(result, 'probs'):
-                    print(f"  - Raw predictions present: {result.probs}")
-            # Debug: print summary
-            if len(boxes) > 0:
-                confidences = [float(box.conf[0]) for box in boxes]
-                print(f"  - Confidence range: {min(confidences):.4f} - {max(confidences):.4f}")
-                print(f"  - Mean confidence: {np.mean(confidences):.4f}")
-            else:
-                print(f"  - No detections above threshold {self.conf_threshold}")
-                print(f"  - Model may not have detected any objects in this image")
-            for box in boxes:
-                # Get bounding box coordinates from letterboxed image
-                x1, y1, x2, y2 = box.xyxy[0].cpu().numpy().astype(int)
-                # Convert coordinates back to original image space
-                x1 = max(0, int((x1 - pad_x) / scale))
-                y1 = max(0, int((y1 - pad_y) / scale))
-                x2 = min(orig_w, int((x2 - pad_x) / scale))
-                y2 = min(orig_h, int((y2 - pad_y) / scale))
-                conf = box.conf[0].cpu().numpy()
-                cls = int(box.cls[0].cpu().numpy())
-                print(f"Detected: {self.classes[cls]} with conf {conf:.4f} at ({x1},{y1})-({x2},{y2})")
-                # Only draw if confidence meets threshold
-                if conf >= confidence_threshold:
-                    # Draw bounding box on original image
-                    cv2.rectangle(original_image, (x1, y1), (x2, y2), self.box_color, self.thickness)
-                    # Draw label
-                    label = f"{self.classes[cls]}: {conf:.2f}"
-                    cv2.putText(original_image, label, (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, self.text_color, 2)
         print(f"\n{'='*80}")
         print(f"DETECTION PIPELINE COMPLETE")
-        print(f"{'='*80}")
-        # Analysis and recommendations
-        print(f"\n📋 ANALYSIS & RECOMMENDATIONS:")
-        # Check for raw detections issue
-        if raw_box_count > 0 and max([float(box.conf[0]) for box in results_raw[0].boxes]) < 0.01:
-            print(f"  ⚠️  MODEL CONFIDENCE ISSUE:")
-            print(f"     - Model detects {raw_box_count} objects but all with confidence < 0.01")
-            print(f"     - This indicates the model may not be well-trained for this domain")
-            print(f"     - Possible causes:")
-            print(f"       a) Model trained on different dataset/resolution")
-            print(f"       b) Model underfitted (needs more training epochs)")
-            print(f"       c) Training data does not match inference data")
-            print(f"       d) Model weights not properly saved")
-            print(f"     - Solutions:")
-            print(f"       1) Retrain model with proper hyperparameters (100+ epochs)")
-            print(f"       2) Use augmentation during training")
-            print(f"       3) Check training/validation accuracy was good")
-            print(f"       4) Ensure training data matches inference image types")
-            print(f"     - Try lowering the confidence threshold slider to see detections")
-        if scale < 0.5:
-            print(f"\n  ⚠️  SCALING ISSUE:")
-            print(f"     - Objects too small after resizing (scale={scale:.2f})")
-            print(f"     - Current: {original_image.shape} → {image.shape}")
-            print(f"     - Solutions: use larger imgsz (1024/1280) or smaller input images")
-        print()
         return original_image, preprocessed_display

 from huggingface_hub import hf_hub_download
 import os
 import torch
+from collections import defaultdict
 class TrafficSignDetector:
     def __init__(self, config_path):
         print("="*80 + "\n")
+    def _create_tiles(self, image, overlap_ratio=0.2):
         """
+        Cắt ảnh thành các tiles vuông với overlap.
         :param image: input image (numpy array)
+        :param overlap_ratio: tỉ lệ overlap giữa các tiles (0.2 = 20%)
+        :return: list of (tile_image, tile_coords) - tile_coords = (y1, x1, y2, x2) trong ảnh gốc
         """
         height, width = image.shape[:2]
+        min_dim = min(height, width)
+        print(f"\n[TILING] Image: {width}x{height}, Min dimension: {min_dim}")
+        # Xác định stride (bước nhảy) dựa trên overlap
+        # Nếu overlap = 20%, thì stride = 80% của tile_size
+        stride = int(min_dim * (1 - overlap_ratio))
+        tiles = []
+        tile_size = min_dim
+        # Tạo grid tiles
+        y = 0
+        while y < height:
+            y_end = min(y + tile_size, height)
+            # Nếu đây là tiles cuối cùng, đảm bảo nó có đủ kích thước
+            if y_end - y < tile_size and y > 0:
+                y = height - tile_size
+                y_end = height
+            x = 0
+            while x < width:
+                x_end = min(x + tile_size, width)
+                # Nếu đây là tiles cuối cùng, đảm bảo nó có đủ kích thước
+                if x_end - x < tile_size and x > 0:
+                    x = width - tile_size
+                    x_end = width
+                # Extract tile
+                tile = image[y:y_end, x:x_end]
+                tiles.append({
+                    'image': tile,
+                    'y_min': y,
+                    'x_min': x,
+                    'y_max': y_end,
+                    'x_max': x_end
+                })
+                x += stride
+                if x >= width:
+                    break
+            y += stride
+            if y >= height:
+                break
+        print(f"  - Tile size: {tile_size}x{tile_size}")
+        print(f"  - Stride: {stride} (overlap: {overlap_ratio*100:.0f}%)")
+        print(f"  - Số tiles: {len(tiles)}")
+        return tiles
+    def _select_standard_size(self, tile_size):
+        """
+        Chọn kích thước chuẩn gần nhất cho tile.
+        :param tile_size: kích thước hiện tại
+        :return: kích thước chuẩn (640, 960, hoặc 1024)
+        """
+        standard_sizes = [640, 960, 1024]
+        # Chọn size nhỏ nhất mà >= tile_size
+        for size in standard_sizes:
+            if size >= tile_size:
+                return size
+        return 1024  # Fallback to largest
+    def _resize_to_standard(self, tile, target_size=640):
+        """
+        Resize tile về size chuẩn với letterbox padding.
+        :param tile: tile image
+        :param target_size: target size (640, 960, hoặc 1024)
+        :return: (resized_image, scale, pad_x, pad_y)
+        """
+        height, width = tile.shape[:2]
         max_dim = max(width, height)
         # Scale to fit target while maintaining aspect ratio
         new_height = int(height * scale)
         # Resize image
+        resized = cv2.resize(tile, (new_width, new_height), interpolation=cv2.INTER_LINEAR)
+        # Create canvas and place resized image (letterbox)
         canvas = np.full((target_size, target_size, 3), (114, 114, 114), dtype=np.uint8)
         pad_x = (target_size - new_width) // 2
         pad_y = (target_size - new_height) // 2
         canvas[pad_y:pad_y + new_height, pad_x:pad_x + new_width] = resized
         return canvas, scale, pad_x, pad_y
+    def _ensure_square(self, image, target_size=640):
+        """
+        Adjust image to square while maintaining aspect ratio.
+        Deprecated: use _resize_to_standard instead.
+        """
+        return self._resize_to_standard(image, target_size)
     def _preprocess(self, image):
         """
         Preprocess image: keep uint8 format as YOLO expects.
         print(f"Image format: {image.dtype}, Min: {image.min()}, Max: {image.max()}, Mean: {image.mean():.1f}")
         return image
+    def _merge_detections(self, all_detections, overlap_threshold=0.5):
+        """
+        Merge detections từ nhiều tiles, loại bỏ duplicates.
+        Sử dụng NMS để gộp detections từ overlapping regions.
+        :param all_detections: list of {
+            'x1': int, 'y1': int, 'x2': int, 'y2': int,
+            'conf': float, 'cls': int
+        }
+        :param overlap_threshold: IOU threshold cho NMS
+        :return: merged_detections
+        """
+        if not all_detections:
+            return []
+        # Sort by confidence (descending)
+        all_detections = sorted(all_detections, key=lambda x: x['conf'], reverse=True)
+        merged = []
+        used = [False] * len(all_detections)
+        for i, det in enumerate(all_detections):
+            if used[i]:
+                continue
+            # Add this detection
+            merged.append(det)
+            used[i] = True
+            # Mark overlapping detections as used
+            for j in range(i + 1, len(all_detections)):
+                if used[j]:
+                    continue
+                # Calculate IOU
+                x1_inter = max(det['x1'], all_detections[j]['x1'])
+                y1_inter = max(det['y1'], all_detections[j]['y1'])
+                x2_inter = min(det['x2'], all_detections[j]['x2'])
+                y2_inter = min(det['y2'], all_detections[j]['y2'])
+                if x2_inter < x1_inter or y2_inter < y1_inter:
+                    continue  # No intersection
+                inter_area = (x2_inter - x1_inter) * (y2_inter - y1_inter)
+                det_area = (det['x2'] - det['x1']) * (det['y2'] - det['y1'])
+                other_area = (all_detections[j]['x2'] - all_detections[j]['x1']) * (all_detections[j]['y2'] - all_detections[j]['y1'])
+                union_area = det_area + other_area - inter_area
+                iou = inter_area / union_area if union_area > 0 else 0
+                # Mark as duplicate if IOU > threshold
+                if iou > overlap_threshold:
+                    used[j] = True
+        return merged
     def detect(self, image, confidence_threshold=None):
         """
+        Perform inference on the image using tiling strategy.
+        Cắt ảnh thành tiles, inference từng tile, sau đó merge kết quả.
         :param image: numpy array of the image
         :param confidence_threshold: optional override for confidence threshold
         :return: tuple of (image with drawn bounding boxes, preprocessed image for visualization)
             confidence_threshold = self.conf_threshold
         else:
             confidence_threshold = float(confidence_threshold)
         print(f"\n{'='*80}")
+        print(f"DETECTION PIPELINE START (TILING STRATEGY)")
         print(f"{'='*80}")
         print(f"[STEP 1] INPUT IMAGE")
         print(f"  - Shape: {image.shape}")
         print(f"  - dtype: {image.dtype}")
         print(f"  - Range: [{image.min()}, {image.max()}]")
+        # Store original image for drawing
         original_image = image.copy()
+        orig_h, orig_w = original_image.shape[:2]
+        # STEP 2: Tạo tiles
+        print(f"\n[STEP 2] TILING")
+        tiles = self._create_tiles(original_image, overlap_ratio=0.2)
+        # STEP 3: Xử lý từng tile
+        print(f"\n[STEP 3] PROCESSING TILES")
+        all_detections = []
+        for tile_idx, tile_info in enumerate(tiles):
+            print(f"\n  [TILE {tile_idx + 1}/{len(tiles)}]")
+            print(f"    Position in original: ({tile_info['x_min']}, {tile_info['y_min']}) → ({tile_info['x_max']}, {tile_info['y_max']})")
+            tile = tile_info['image']
+            tile_h, tile_w = tile.shape[:2]
+            # Chọn kích thước chuẩn
+            standard_size = self._select_standard_size(max(tile_w, tile_h))
+            print(f"    Tile size: {tile_w}x{tile_h} → Standard size: {standard_size}x{standard_size}")
+            # Resize tile
+            resized_tile, scale, pad_x, pad_y = self._resize_to_standard(tile, target_size=standard_size)
+            # Inference
+            results = self.model(resized_tile, conf=0.0, imgsz=standard_size, iou=0.55)
+            # Process results
+            for result in results:
+                boxes = result.boxes
+                print(f"    Detections in this tile: {len(boxes)}")
+                for box in boxes:
+                    # Get coordinates in resized tile space
+                    x1, y1, x2, y2 = box.xyxy[0].cpu().numpy().astype(int)
+                    # Transform back to original tile space
+                    x1 = int((x1 - pad_x) / scale)
+                    y1 = int((y1 - pad_y) / scale)
+                    x2 = int((x2 - pad_x) / scale)
+                    y2 = int((y2 - pad_y) / scale)
+                    # Clamp to tile bounds
+                    x1 = max(0, min(x1, tile_w))
+                    y1 = max(0, min(y1, tile_h))
+                    x2 = max(0, min(x2, tile_w))
+                    y2 = max(0, min(y2, tile_h))
+                    # Transform to original image space
+                    x1_orig = x1 + tile_info['x_min']
+                    y1_orig = y1 + tile_info['y_min']
+                    x2_orig = x2 + tile_info['x_min']
+                    y2_orig = y2 + tile_info['y_min']
+                    # Clamp to original image bounds
+                    x1_orig = max(0, min(x1_orig, orig_w))
+                    y1_orig = max(0, min(y1_orig, orig_h))
+                    x2_orig = max(0, min(x2_orig, orig_w))
+                    y2_orig = max(0, min(y2_orig, orig_h))
+                    conf = float(box.conf[0].cpu().numpy())
+                    cls = int(box.cls[0].cpu().numpy())
+                    all_detections.append({
+                        'x1': x1_orig,
+                        'y1': y1_orig,
+                        'x2': x2_orig,
+                        'y2': y2_orig,
+                        'conf': conf,
+                        'cls': cls
+                    })
+        # STEP 4: Merge detections
+        print(f"\n[STEP 4] MERGING DETECTIONS")
+        print(f"  - Raw detections from all tiles: {len(all_detections)}")
+        merged_detections = self._merge_detections(all_detections, overlap_threshold=0.5)
+        print(f"  - After deduplication: {len(merged_detections)}")
+        # STEP 5: Filter by confidence threshold
+        print(f"\n[STEP 5] FILTERING & DRAWING")
+        print(f"  - Confidence threshold: {confidence_threshold}")
+        drawn_count = 0
+        for det in merged_detections:
+            if det['conf'] >= confidence_threshold:
+                x1, y1, x2, y2 = det['x1'], det['y1'], det['x2'], det['y2']
+                cls = det['cls']
+                conf = det['conf']
+                # Draw bounding box
+                cv2.rectangle(original_image, (x1, y1), (x2, y2), self.box_color, self.thickness)
+                # Draw label
+                label = f"{self.classes[cls]}: {conf:.2f}"
+                cv2.putText(original_image, label, (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, self.text_color, 2)
+                drawn_count += 1
+                print(f"  ✓ {self.classes[cls]}: conf={conf:.4f} at ({x1},{y1})-({x2},{y2})")
+        print(f"\n  - Drawn: {drawn_count}/{len(merged_detections)}")
         print(f"\n{'='*80}")
         print(f"DETECTION PIPELINE COMPLETE")
+        print(f"{'='*80}\n")
+        # Create preprocessed visualization (first tile for reference)
+        preprocessed_display = tiles[0]['image'].copy() if tiles else original_image.copy()
         return original_image, preprocessed_display