hoeirup committed on
Commit a277f69
0 Parent(s):

Initial commit with all files at root level

Files changed (8)
  1. .gitignore +31 -0
  2. README.md +96 -0
  3. bbox3d_utils.py +799 -0
  4. depth_model.py +184 -0
  5. detection_model.py +243 -0
  6. load_camera_params.py +122 -0
  7. requirements.txt +15 -0
  8. run.py +333 -0
.gitignore ADDED
@@ -0,0 +1,31 @@
# Python cache files
__pycache__/
*.py[cod]
*$py.class

# Model files
*.pt
*.pth
*.onnx
*.tflite
*.pb

# Video files
*.mp4
*.avi
*.mov
*.mkv

# Environment
.env
.venv
env/
venv/
ENV/

# Logs
*.log

# OS specific
.DS_Store
Thumbs.db
README.md ADDED
@@ -0,0 +1,96 @@
# YOLO-3D

A real-time 3D object detection system that combines YOLOv11 for object detection with Depth Anything v2 for depth estimation to create pseudo-3D bounding boxes and bird's eye view visualization.

## Features

- Real-time object detection using YOLOv11
- Depth estimation using Depth Anything v2
- 3D bounding box visualization
- Bird's Eye View (BEV) visualization
- Object tracking capabilities
- Support for video files and webcam input
- Adjustable model sizes for performance/accuracy tradeoffs

## Requirements

- Python 3.8+
- PyTorch 2.0+
- OpenCV
- NumPy
- Other dependencies listed in `requirements.txt`

## Installation

1. Clone this repository:
   ```bash
   git clone https://github.com/niconielsen32/YOLO-3D.git
   cd YOLO-3D
   ```

2. Install dependencies:
   ```bash
   pip install -r requirements.txt
   ```

3. Download model weights (they are downloaded automatically on first run)

## Usage

Run the main script:

```bash
python run.py
```

### Configuration Options

You can modify the following parameters in `run.py`:

- **Input/Output**:
  - `source`: Path to input video file, or webcam index (0 for the default camera)
  - `output_path`: Path to the output video file

- **Model Settings**:
  - `yolo_model_size`: YOLOv11 model size (`"nano"`, `"small"`, `"medium"`, `"large"`, `"extra"`)
  - `depth_model_size`: Depth Anything v2 model size (`"small"`, `"base"`, `"large"`)

- **Detection Settings**:
  - `conf_threshold`: Confidence threshold for object detection
  - `iou_threshold`: IoU threshold for NMS
  - `classes`: Filter by class, e.g., `[0, 1, 2]` for specific classes, `None` for all classes

- **Feature Toggles**:
  - `enable_tracking`: Enable object tracking
  - `enable_bev`: Enable Bird's Eye View visualization
  - `enable_pseudo_3d`: Enable pseudo-3D visualization
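As a sketch, a configuration block covering these options might look like the following. The variable names mirror the options listed above, but whether `run.py` groups them exactly like this is an assumption:

```python
# Hypothetical configuration block mirroring the documented options;
# the actual layout in run.py may differ.
source = 0                      # webcam index, or a path like "input.mp4"
output_path = "output.mp4"

yolo_model_size = "small"       # "nano", "small", "medium", "large", "extra"
depth_model_size = "small"      # "small", "base", "large"

conf_threshold = 0.25           # detection confidence threshold
iou_threshold = 0.45            # IoU threshold for NMS
classes = None                  # e.g. [0, 1, 2] to keep only those class IDs

enable_tracking = True
enable_bev = True
enable_pseudo_3d = True

# Basic sanity checks before launching the pipeline
assert yolo_model_size in {"nano", "small", "medium", "large", "extra"}
assert depth_model_size in {"small", "base", "large"}
assert 0.0 < conf_threshold < 1.0 and 0.0 < iou_threshold < 1.0
```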
## Project Structure

```
YOLO-3D/
├── run.py                  # Main script
├── detection_model.py      # YOLOv11 object detection
├── depth_model.py          # Depth Anything v2 depth estimation
├── bbox3d_utils.py         # 3D bounding box utilities
├── load_camera_params.py   # Camera parameter utilities
├── requirements.txt        # Project dependencies
└── README.md               # This file
```
## How It Works

1. **Object Detection**: YOLOv11 detects objects in the frame and provides 2D bounding boxes
2. **Depth Estimation**: Depth Anything v2 generates a depth map for the entire frame
3. **3D Box Estimation**: Combines 2D boxes with depth information to create 3D boxes
4. **Visualization**: Renders 3D boxes and bird's eye view for better spatial understanding

+ ## License
90
+
91
+ [MIT License](LICENSE)
92
+
93
+ ## Acknowledgments
94
+
95
+ - YOLOv11 by Ultralytics
96
+ - Depth Anything v2 by Microsoft
bbox3d_utils.py ADDED
@@ -0,0 +1,799 @@
import numpy as np
import cv2
from scipy.spatial.transform import Rotation as R
from filterpy.kalman import KalmanFilter
from collections import defaultdict
import math

# Default camera intrinsic matrix (can be overridden)
DEFAULT_K = np.array([
    [718.856, 0.0, 607.1928],
    [0.0, 718.856, 185.2157],
    [0.0, 0.0, 1.0]
])

# Default camera projection matrix (can be overridden)
DEFAULT_P = np.array([
    [718.856, 0.0, 607.1928, 45.38225],
    [0.0, 718.856, 185.2157, -0.1130887],
    [0.0, 0.0, 1.0, 0.003779761]
])

# Average dimensions for common objects (height, width, length) in meters
DEFAULT_DIMS = {
    'car': np.array([1.52, 1.64, 3.85]),
    'truck': np.array([3.07, 2.63, 11.17]),
    'bus': np.array([3.07, 2.63, 11.17]),
    'motorcycle': np.array([1.50, 0.90, 2.20]),
    'bicycle': np.array([1.40, 0.70, 1.80]),
    'person': np.array([1.75, 0.60, 0.60]),  # Adjusted width/length for person
    'dog': np.array([0.80, 0.50, 1.10]),
    'cat': np.array([0.40, 0.30, 0.70]),
    # Indoor objects
    'potted plant': np.array([0.80, 0.40, 0.40]),  # Reduced size for indoor plants
    'plant': np.array([0.80, 0.40, 0.40]),  # Alias for potted plant
    'chair': np.array([0.80, 0.60, 0.60]),
    'sofa': np.array([0.80, 0.85, 2.00]),
    'table': np.array([0.75, 1.20, 1.20]),
    'bed': np.array([0.60, 1.50, 2.00]),
    'tv': np.array([0.80, 0.15, 1.20]),
    'laptop': np.array([0.02, 0.25, 0.35]),
    'keyboard': np.array([0.03, 0.15, 0.45]),
    'mouse': np.array([0.03, 0.06, 0.10]),
    'book': np.array([0.03, 0.20, 0.15]),
    'bottle': np.array([0.25, 0.10, 0.10]),
    'cup': np.array([0.10, 0.08, 0.08]),
    'vase': np.array([0.30, 0.15, 0.15])
}

class BBox3DEstimator:
    """
    3D bounding box estimation from 2D detections and depth
    """
    def __init__(self, camera_matrix=None, projection_matrix=None, class_dims=None):
        """
        Initialize the 3D bounding box estimator

        Args:
            camera_matrix (numpy.ndarray): Camera intrinsic matrix (3x3)
            projection_matrix (numpy.ndarray): Camera projection matrix (3x4)
            class_dims (dict): Dictionary mapping class names to dimensions (height, width, length)
        """
        self.K = camera_matrix if camera_matrix is not None else DEFAULT_K
        self.P = projection_matrix if projection_matrix is not None else DEFAULT_P
        self.dims = class_dims if class_dims is not None else DEFAULT_DIMS

        # Kalman filters for tracking 3D boxes, keyed by object ID
        self.kf_trackers = {}

        # History of 3D boxes per object for temporal filtering
        self.box_history = defaultdict(list)
        self.max_history = 5

    def estimate_3d_box(self, bbox_2d, depth_value, class_name, object_id=None):
        """
        Estimate 3D bounding box from 2D bounding box and depth

        Args:
            bbox_2d (list): 2D bounding box [x1, y1, x2, y2]
            depth_value (float): Depth value at the center of the bounding box
            class_name (str): Class name of the object
            object_id (int): Object ID for tracking (None for no tracking)

        Returns:
            dict: 3D bounding box parameters
        """
        # Get 2D box center and dimensions
        x1, y1, x2, y2 = bbox_2d
        center_x = (x1 + x2) / 2
        center_y = (y1 + y2) / 2
        width_2d = x2 - x1
        height_2d = y2 - y1

        # Get dimensions for the class
        if class_name.lower() in self.dims:
            dimensions = self.dims[class_name.lower()].copy()  # Copy to avoid modifying the original
        else:
            # Use default car dimensions if the class is not found
            dimensions = self.dims['car'].copy()

        # Adjust dimensions based on 2D box aspect ratio and size
        aspect_ratio_2d = width_2d / height_2d if height_2d > 0 else 1.0

        # For plants, scale dimensions from the 2D box
        if 'plant' in class_name.lower() or 'potted plant' in class_name.lower():
            dimensions[0] = height_2d / 120  # Convert pixels to meters with a scaling factor
            # Make width and length proportional to height
            dimensions[1] = dimensions[0] * 0.6  # width
            dimensions[2] = dimensions[0] * 0.6  # length
        # For people, scale dimensions from the 2D box
        elif 'person' in class_name.lower():
            dimensions[0] = height_2d / 100  # Convert pixels to meters with a scaling factor
            # Make width and length proportional to height
            dimensions[1] = dimensions[0] * 0.3  # width
            dimensions[2] = dimensions[0] * 0.3  # length

        # Convert depth to distance: map depth_value (0-1) to a range of
        # 1-10 meters for better visualization
        distance = 1.0 + depth_value * 9.0

        # Calculate 3D location
        location = self._backproject_point(center_x, center_y, distance)

        # For plants, adjust the y-coordinate to place them on a surface
        if 'plant' in class_name.lower() or 'potted plant' in class_name.lower():
            # Assume plants rest on a surface (e.g., table, floor): use the
            # bottom of the 2D bounding box for the y-coordinate
            bottom_y = y2
            location[1] = self._backproject_point(center_x, bottom_y, distance)[1]

        # Estimate orientation
        orientation = self._estimate_orientation(bbox_2d, location, class_name)

        # Create 3D box
        box_3d = {
            'dimensions': dimensions,
            'location': location,
            'orientation': orientation,
            'bbox_2d': bbox_2d,
            'object_id': object_id,
            'class_name': class_name
        }

        # Apply Kalman and temporal filtering if tracking is enabled
        if object_id is not None:
            box_3d = self._apply_kalman_filter(box_3d, object_id)

            # Add to history for temporal filtering
            self.box_history[object_id].append(box_3d)
            if len(self.box_history[object_id]) > self.max_history:
                self.box_history[object_id].pop(0)

            # Apply temporal filtering
            box_3d = self._apply_temporal_filter(object_id)

        return box_3d

    def _backproject_point(self, x, y, depth):
        """
        Backproject a 2D point to 3D space

        Args:
            x (float): X coordinate in image space
            y (float): Y coordinate in image space
            depth (float): Depth value

        Returns:
            numpy.ndarray: 3D point (x, y, z) in camera coordinates
        """
        # Create homogeneous coordinates
        point_2d = np.array([x, y, 1.0])

        # Backproject to 3D: the z-coordinate is the depth, and x and y are
        # recovered using the inverse of the camera matrix
        point_3d = np.linalg.inv(self.K) @ point_2d * depth

        # For indoor scenes, scale down the y-coordinate (which points down in
        # camera coordinates) to place objects at a more reasonable height.
        # This is a simplification; a real system would be more sophisticated.
        point_3d[1] = point_3d[1] * 0.5

        return point_3d

    def _estimate_orientation(self, bbox_2d, location, class_name):
        """
        Estimate orientation of the object

        Args:
            bbox_2d (list): 2D bounding box [x1, y1, x2, y2]
            location (numpy.ndarray): 3D location of the object
            class_name (str): Class name of the object

        Returns:
            float: Orientation angle in radians
        """
        # Calculate the ray from camera to object center
        theta_ray = np.arctan2(location[0], location[2])

        # Plants and other stationary objects have no meaningful orientation;
        # just use the ray angle
        if 'plant' in class_name.lower() or 'potted plant' in class_name.lower():
            return theta_ray

        if 'person' in class_name.lower():
            # Assume the person is facing the camera
            alpha = 0.0
        else:
            # For other objects, use the 2D box aspect ratio to estimate orientation
            x1, y1, x2, y2 = bbox_2d
            width = x2 - x1
            height = y2 - y1
            aspect_ratio = width / height if height > 0 else 1.0

            if aspect_ratio > 1.5:
                # Object is wide, so it may be facing sideways; use its position
                # relative to the principal point to guess the direction
                image_center_x = self.K[0, 2]  # Principal point x
                if (x1 + x2) / 2 < image_center_x:
                    alpha = np.pi / 2   # Left side of the image, facing right
                else:
                    alpha = -np.pi / 2  # Right side of the image, facing left
            else:
                # Normal proportions: assume the object faces the camera
                alpha = 0.0

        # Global orientation
        rot_y = alpha + theta_ray

        return rot_y

    def _init_kalman_filter(self, box_3d):
        """
        Initialize a Kalman filter for a new object

        Args:
            box_3d (dict): 3D bounding box parameters

        Returns:
            filterpy.kalman.KalmanFilter: Initialized Kalman filter
        """
        # State: [x, y, z, width, height, length, yaw, vx, vy, vz, vyaw]
        kf = KalmanFilter(dim_x=11, dim_z=7)

        # Initial state
        kf.x = np.array([
            box_3d['location'][0],
            box_3d['location'][1],
            box_3d['location'][2],
            box_3d['dimensions'][1],  # width
            box_3d['dimensions'][0],  # height
            box_3d['dimensions'][2],  # length
            box_3d['orientation'],
            0, 0, 0, 0  # Initial velocities
        ])

        # State transition matrix (constant-velocity motion model)
        dt = 1.0  # Time step
        kf.F = np.eye(11)
        kf.F[0, 7] = dt   # x += vx * dt
        kf.F[1, 8] = dt   # y += vy * dt
        kf.F[2, 9] = dt   # z += vz * dt
        kf.F[6, 10] = dt  # yaw += vyaw * dt

        # Measurement function
        kf.H = np.zeros((7, 11))
        kf.H[0, 0] = 1  # x
        kf.H[1, 1] = 1  # y
        kf.H[2, 2] = 1  # z
        kf.H[3, 3] = 1  # width
        kf.H[4, 4] = 1  # height
        kf.H[5, 5] = 1  # length
        kf.H[6, 6] = 1  # yaw

        # Measurement uncertainty
        kf.R = np.eye(7) * 0.1
        kf.R[0:3, 0:3] *= 1.0  # Location uncertainty
        kf.R[3:6, 3:6] *= 0.1  # Dimension uncertainty
        kf.R[6, 6] = 0.3       # Orientation uncertainty

        # Process uncertainty
        kf.Q = np.eye(11) * 0.1
        kf.Q[7:11, 7:11] *= 0.5  # Velocity uncertainty

        # Initial state uncertainty
        kf.P = np.eye(11) * 1.0
        kf.P[7:11, 7:11] *= 10.0  # Velocity uncertainty

        return kf

    def _apply_kalman_filter(self, box_3d, object_id):
        """
        Apply Kalman filtering to smooth 3D box parameters

        Args:
            box_3d (dict): 3D bounding box parameters
            object_id (int): Object ID for tracking

        Returns:
            dict: Filtered 3D bounding box parameters
        """
        # Initialize a Kalman filter if this is a new object
        if object_id not in self.kf_trackers:
            self.kf_trackers[object_id] = self._init_kalman_filter(box_3d)

        kf = self.kf_trackers[object_id]

        # Predict
        kf.predict()

        # Update with the measurement
        measurement = np.array([
            box_3d['location'][0],
            box_3d['location'][1],
            box_3d['location'][2],
            box_3d['dimensions'][1],  # width
            box_3d['dimensions'][0],  # height
            box_3d['dimensions'][2],  # length
            box_3d['orientation']
        ])

        kf.update(measurement)

        # Update box_3d with filtered values
        filtered_box = box_3d.copy()
        filtered_box['location'] = np.array([kf.x[0], kf.x[1], kf.x[2]])
        filtered_box['dimensions'] = np.array([kf.x[4], kf.x[3], kf.x[5]])  # height, width, length
        filtered_box['orientation'] = kf.x[6]

        return filtered_box

    def _apply_temporal_filter(self, object_id):
        """
        Apply temporal filtering to smooth 3D box parameters over time

        Args:
            object_id (int): Object ID for tracking

        Returns:
            dict: Temporally filtered 3D bounding box parameters
        """
        history = self.box_history[object_id]

        if len(history) < 2:
            return history[-1]

        # Get the most recent box
        current_box = history[-1]

        # Apply an exponential moving average to location and orientation
        alpha = 0.7  # Weight for the current measurement (higher = less smoothing)

        # Initialize with current values
        filtered_box = current_box.copy()

        for i in range(len(history) - 2, -1, -1):
            weight = alpha * (1 - alpha) ** (len(history) - i - 2)
            filtered_box['location'] = filtered_box['location'] * (1 - weight) + history[i]['location'] * weight

            # Handle orientation wrapping
            angle_diff = history[i]['orientation'] - filtered_box['orientation']
            if angle_diff > np.pi:
                angle_diff -= 2 * np.pi
            elif angle_diff < -np.pi:
                angle_diff += 2 * np.pi

            filtered_box['orientation'] += angle_diff * weight

        return filtered_box

    def project_box_3d_to_2d(self, box_3d):
        """
        Project 3D bounding box corners to 2D image space

        Args:
            box_3d (dict): 3D bounding box parameters

        Returns:
            numpy.ndarray: 2D points of the 3D box corners (8x2)
        """
        # Extract parameters
        h, w, l = box_3d['dimensions']
        x, y, z = box_3d['location']
        rot_y = box_3d['orientation']
        class_name = box_3d['class_name'].lower()

        # Get the 2D box for reference
        x1, y1, x2, y2 = box_3d['bbox_2d']
        center_x = (x1 + x2) / 2
        center_y = (y1 + y2) / 2
        width_2d = x2 - x1
        height_2d = y2 - y1

        # Create the rotation matrix (rotation about the y-axis)
        R_mat = np.array([
            [np.cos(rot_y), 0, np.sin(rot_y)],
            [0, 1, 0],
            [-np.sin(rot_y), 0, np.cos(rot_y)]
        ])

        # 3D bounding box corners
        if 'plant' in class_name or 'potted plant' in class_name:
            # For plants, center the box vertically on the object
            x_corners = np.array([l/2, l/2, -l/2, -l/2, l/2, l/2, -l/2, -l/2])
            y_corners = np.array([h/2, h/2, h/2, h/2, -h/2, -h/2, -h/2, -h/2])
            z_corners = np.array([w/2, -w/2, -w/2, w/2, w/2, -w/2, -w/2, w/2])
        else:
            # Standard box configuration with the bottom at y=0
            x_corners = np.array([l/2, l/2, -l/2, -l/2, l/2, l/2, -l/2, -l/2])
            y_corners = np.array([0, 0, 0, 0, -h, -h, -h, -h])
            z_corners = np.array([w/2, -w/2, -w/2, w/2, w/2, -w/2, -w/2, w/2])

        # Rotate and translate the corners
        corners_3d = np.vstack([x_corners, y_corners, z_corners])
        corners_3d = R_mat @ corners_3d
        corners_3d[0, :] += x
        corners_3d[1, :] += y
        corners_3d[2, :] += z

        # Project to 2D
        corners_3d_homo = np.vstack([corners_3d, np.ones((1, 8))])
        corners_2d_homo = self.P @ corners_3d_homo
        corners_2d = corners_2d_homo[:2, :] / corners_2d_homo[2, :]

        # Constrain the 3D box to be within a reasonable distance of the 2D
        # box; this helps prevent wildly incorrect projections
        mean_x = np.mean(corners_2d[0, :])
        mean_y = np.mean(corners_2d[1, :])

        # If the projected box is too far from the 2D box center, shift it back
        if abs(mean_x - center_x) > width_2d or abs(mean_y - center_y) > height_2d:
            shift_x = center_x - mean_x
            shift_y = center_y - mean_y
            corners_2d[0, :] += shift_x
            corners_2d[1, :] += shift_y

        return corners_2d.T

    def draw_box_3d(self, image, box_3d, color=(0, 255, 0), thickness=2):
        """
        Draw an enhanced 3D bounding box on the image with better depth perception

        Args:
            image (numpy.ndarray): Image to draw on
            box_3d (dict): 3D bounding box parameters
            color (tuple): Color in BGR format
            thickness (int): Line thickness

        Returns:
            numpy.ndarray: Image with the 3D box drawn
        """
        # Get 2D box coordinates
        x1, y1, x2, y2 = [int(coord) for coord in box_3d['bbox_2d']]

        # Get the depth value for scaling
        depth_value = box_3d.get('depth_value', 0.5)

        # Calculate box dimensions
        width = x2 - x1
        height = y2 - y1

        # Calculate the offset for the 3D effect: inverse relationship with
        # depth, so closer objects get a larger offset
        offset_factor = 1.0 - depth_value
        offset_x = int(width * 0.3 * offset_factor)
        offset_y = int(height * 0.3 * offset_factor)

        # Clamp the offset to keep it visible but bounded
        offset_x = max(15, min(offset_x, 50))
        offset_y = max(15, min(offset_y, 50))

        # Front face (the 2D bounding box)
        front_tl = (x1, y1)
        front_tr = (x2, y1)
        front_br = (x2, y2)
        front_bl = (x1, y2)

        # Back face (offset by depth)
        back_tl = (x1 + offset_x, y1 - offset_y)
        back_tr = (x2 + offset_x, y1 - offset_y)
        back_br = (x2 + offset_x, y2 - offset_y)
        back_bl = (x1 + offset_x, y2 - offset_y)

        # Copy of the image used for the semi-transparent faces
        overlay = image.copy()

        # Draw the front face (2D bounding box)
        cv2.rectangle(image, front_tl, front_br, color, thickness)

        # Draw the connecting lines between front and back faces
        cv2.line(image, front_tl, back_tl, color, thickness)
        cv2.line(image, front_tr, back_tr, color, thickness)
        cv2.line(image, front_br, back_br, color, thickness)
        cv2.line(image, front_bl, back_bl, color, thickness)

        # Draw the back face
        cv2.line(image, back_tl, back_tr, color, thickness)
        cv2.line(image, back_tr, back_br, color, thickness)
        cv2.line(image, back_br, back_bl, color, thickness)
        cv2.line(image, back_bl, back_tl, color, thickness)

        # Fill the top face with a semi-transparent color to enhance the 3D effect
        pts_top = np.array([front_tl, front_tr, back_tr, back_tl], np.int32)
        pts_top = pts_top.reshape((-1, 1, 2))
        cv2.fillPoly(overlay, [pts_top], color)

        # Fill the right face with a darkened color for better 3D effect
        pts_right = np.array([front_tr, front_br, back_br, back_tr], np.int32)
        pts_right = pts_right.reshape((-1, 1, 2))
        right_color = (int(color[0] * 0.7), int(color[1] * 0.7), int(color[2] * 0.7))
        cv2.fillPoly(overlay, [pts_right], right_color)

        # Apply the overlay with transparency
        alpha = 0.3  # Transparency factor
        cv2.addWeighted(overlay, alpha, image, 1 - alpha, 0, image)

        # Get class name and object ID
        class_name = box_3d['class_name']
        obj_id = box_3d.get('object_id')

        # Draw text information
        text_y = y1 - 10
        if obj_id is not None:
            cv2.putText(image, f"ID:{obj_id}", (x1, text_y),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)
            text_y -= 15

        cv2.putText(image, class_name, (x1, text_y),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)
        text_y -= 15

        # Draw depth information if available
        if 'depth_value' in box_3d:
            depth_value = box_3d['depth_value']
            depth_method = box_3d.get('depth_method', 'unknown')
            depth_text = f"D:{depth_value:.2f} ({depth_method})"
            cv2.putText(image, depth_text, (x1, text_y),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)
            text_y -= 15

        # Draw the detection score if available
        if 'score' in box_3d:
            score = box_3d['score']
            score_text = f"S:{score:.2f}"
            cv2.putText(image, score_text, (x1, text_y),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)

        # Draw a vertical line from the bottom of the box to the ground to aid
        # depth perception, plus a circle for the ground contact point
        ground_y = y2 + int(height * 0.2)  # A bit below the bottom of the box
        cv2.line(image, (int((x1 + x2) / 2), y2), (int((x1 + x2) / 2), ground_y), color, thickness)
        cv2.circle(image, (int((x1 + x2) / 2), ground_y), thickness * 2, color, -1)

        return image

    def cleanup_trackers(self, active_ids):
        """
        Clean up Kalman filters and history for objects that are no longer tracked

        Args:
            active_ids (list): List of active object IDs
        """
        # Convert to a set for faster lookup
        active_ids_set = set(active_ids)

        # Clean up Kalman filters
        for obj_id in list(self.kf_trackers.keys()):
            if obj_id not in active_ids_set:
                del self.kf_trackers[obj_id]

        # Clean up box history
        for obj_id in list(self.box_history.keys()):
            if obj_id not in active_ids_set:
                del self.box_history[obj_id]

class BirdEyeView:
    """
    Bird's Eye View visualization
    """
    def __init__(self, size=(400, 400), scale=30, camera_height=1.2):
        """
        Initialize the Bird's Eye View visualizer

        Args:
            size (tuple): Size of the BEV image (width, height)
            scale (float): Scale factor (pixels per meter)
            camera_height (float): Height of the camera above ground (meters)
        """
        self.width, self.height = size
        self.scale = scale
        self.camera_height = camera_height

        # Create an empty BEV image
        self.bev_image = np.zeros((self.height, self.width, 3), dtype=np.uint8)

        # Set the origin at the bottom center of the image
        self.origin_x = self.width // 2
        self.origin_y = self.height - 50

    def reset(self):
        """
        Reset the BEV image
        """
        # Create a dark background
        self.bev_image = np.zeros((self.height, self.width, 3), dtype=np.uint8)
        self.bev_image[:, :] = (20, 20, 20)  # Dark gray background

        # Draw grid lines
        grid_spacing = max(int(self.scale), 20)  # At least 20 pixels between grid lines

        for y in range(self.origin_y, 0, -grid_spacing):
            cv2.line(self.bev_image, (0, y), (self.width, y), (50, 50, 50), 1)

        for x in range(0, self.width, grid_spacing):
            cv2.line(self.bev_image, (x, 0), (x, self.height), (50, 50, 50), 1)

        # Draw the coordinate system
        axis_length = min(80, self.height // 5)

        # X-axis (upward)
        cv2.line(self.bev_image,
                 (self.origin_x, self.origin_y),
                 (self.origin_x, self.origin_y - axis_length),
                 (0, 200, 0), 2)  # Green for X-axis

        # Y-axis (rightward)
        cv2.line(self.bev_image,
                 (self.origin_x, self.origin_y),
                 (self.origin_x + axis_length, self.origin_y),
                 (0, 0, 200), 2)  # Red for Y-axis

        # Add axis labels
        cv2.putText(self.bev_image, "X",
                    (self.origin_x - 15, self.origin_y - axis_length + 15),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 200, 0), 1)
        cv2.putText(self.bev_image, "Y",
                    (self.origin_x + axis_length - 15, self.origin_y + 20),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 200), 1)

        # Draw distance markers for the 1-5 meter range, with intermediate
        # markers every 0.5 meters
        for dist in [1, 1.5, 2, 2.5, 3, 3.5, 4, 4.5, 5]:
            y = self.origin_y - int(dist * self.scale)

            if y < 20:  # Skip if too close to the top
                continue

            # Draw the tick mark, thicker for whole meters. The float() cast
            # makes is_integer() safe for the int entries in the list on
            # Python < 3.12, where int has no is_integer() method.
            is_whole = float(dist).is_integer()
            thickness = 2 if is_whole else 1
            cv2.line(self.bev_image,
                     (self.origin_x - 5, y),
                     (self.origin_x + 5, y),
                     (120, 120, 120), thickness)

            # Only show text for whole meters
            if is_whole:
                cv2.putText(self.bev_image, f"{int(dist)}m",
                            (self.origin_x + 10, y + 4),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.4, (180, 180, 180), 1)

    def draw_box(self, box_3d, color=None):
        """
        Draw a more realistic representation of an object on the BEV image

        Args:
            box_3d (dict): 3D bounding box parameters
            color (tuple): Color in BGR format (None for automatic color based on class)
        """
        try:
            # Extract parameters
            class_name = box_3d['class_name'].lower()

            # Map the normalized depth value (0-1) to a range of 1-5 meters
            depth_value = box_3d.get('depth_value', 0.5)
            depth = 1.0 + depth_value * 4.0

            # Use the 2D box width to estimate a drawing size factor
            if 'bbox_2d' in box_3d:
                x1, y1, x2, y2 = box_3d['bbox_2d']
                width_2d = x2 - x1
                height_2d = y2 - y1
                size_factor = width_2d / 100
                size_factor = max(0.5, min(size_factor, 2.0))
            else:
                size_factor = 1.0

            # Determine the color based on class
            if color is None:
                if 'car' in class_name or 'vehicle' in class_name:
                    color = (0, 0, 255)      # Red
                elif 'truck' in class_name or 'bus' in class_name:
                    color = (0, 165, 255)    # Orange
                elif 'person' in class_name:
                    color = (0, 255, 0)      # Green
                elif 'bicycle' in class_name or 'motorcycle' in class_name:
                    color = (255, 0, 0)      # Blue
                elif 'potted plant' in class_name or 'plant' in class_name:
                    color = (0, 255, 255)    # Yellow
                else:
                    color = (255, 255, 255)  # White

            # Get the object ID if available
            obj_id = box_3d.get('object_id', None)

            # Calculate the position in BEV: X-axis points upward (depth),
            # Y-axis points rightward (horizontal position in the image)
            bev_y = self.origin_y - int(depth * self.scale)

            if 'bbox_2d' in box_3d:
                center_x_2d = (x1 + x2) / 2
                image_width = self.bev_image.shape[1]
                rel_x = (center_x_2d / image_width) - 0.5
                bev_x = self.origin_x + int(rel_x * self.width * 0.6)
            else:
                bev_x = self.origin_x

            # Keep the object inside the visible area
            bev_x = max(20, min(bev_x, self.width - 20))
            bev_y = max(20, min(bev_y, self.origin_y - 10))

            # Draw the object based on its type
            if 'person' in class_name:
                # Person: circle
                radius = int(4 * size_factor)
                cv2.circle(self.bev_image, (bev_x, bev_y), radius, color, -1)
            elif 'car' in class_name or 'vehicle' in class_name or 'truck' in class_name or 'bus' in class_name:
                # Vehicle: rectangle, longer for trucks/buses
                rect_width = int(12 * size_factor)
                rect_length = int(18 * size_factor)
                if 'truck' in class_name or 'bus' in class_name:
                    rect_length = int(24 * size_factor)
                cv2.rectangle(self.bev_image,
                              (bev_x - rect_width//2, bev_y - rect_length//2),
                              (bev_x + rect_width//2, bev_y + rect_length//2),
                              color, -1)
            elif 'plant' in class_name or 'potted plant' in class_name:
                # Plant: circle
                radius = int(8 * size_factor)
                cv2.circle(self.bev_image, (bev_x, bev_y), radius, color, -1)
            else:
                # Default: square
                size = int(8 * size_factor)
                cv2.rectangle(self.bev_image,
                              (bev_x - size, bev_y - size),
                              (bev_x + size, bev_y + size),
                              color, -1)

            # Draw the object ID if available
            if obj_id is not None:
                cv2.putText(self.bev_image, f"{obj_id}",
                            (bev_x - 5, bev_y - 5),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.4, (255, 255, 255), 1)

            # Draw a distance line from the origin to the object
            cv2.line(self.bev_image,
                     (self.origin_x, self.origin_y),
                     (bev_x, bev_y),
                     (70, 70, 70), 1)
        except Exception as e:
            print(f"Error drawing box in BEV: {e}")
791
+
792
+ def get_image(self):
793
+ """
794
+ Get the BEV image
795
+
796
+ Returns:
797
+ numpy.ndarray: BEV image
798
+ """
799
+ return self.bev_image
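The depth-to-pixel mapping used in `draw_box` above can be sketched as a standalone function. The `scale=60` and the 300×300 canvas match the `BirdEyeView(scale=60, size=(300, 300))` call in `run.py`; the bottom-center origin (`origin_y = size - 20`) is an assumption, since the constructor is not part of this diff chunk:

```python
def bev_position(depth_value, center_x_2d, image_width,
                 bev_size=300, scale=60):
    """Map a normalized depth (0-1) and a 2D box center to BEV pixel coords.

    Mirrors BirdEyeView.draw_box: depth is rescaled to 1-5 m, the X offset
    comes from the horizontal position of the box in the source image.
    Origin placement is an assumed bottom-center, 20 px above the edge.
    """
    origin_x = bev_size // 2
    origin_y = bev_size - 20          # assumed origin, not shown in this diff
    depth_m = 1.0 + depth_value * 4.0  # 0-1 -> 1-5 meters
    bev_y = origin_y - int(depth_m * scale)
    rel_x = (center_x_2d / image_width) - 0.5
    bev_x = origin_x + int(rel_x * bev_size * 0.6)
    # Clamp to the visible area, as draw_box does
    bev_x = max(20, min(bev_x, bev_size - 20))
    bev_y = max(20, min(bev_y, origin_y - 10))
    return bev_x, bev_y

# A box centered horizontally at mid-depth lands at mid-canvas:
print(bev_position(0.5, 320, 640))  # (150, 100)
```

Note that depth values near 1.0 saturate at the top clamp, so very distant objects pile up at the canvas edge rather than scrolling off it.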
depth_model.py ADDED
@@ -0,0 +1,184 @@
+ import os
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ import numpy as np
+ import cv2
+ from transformers import pipeline
+ from PIL import Image
+
+ class DepthEstimator:
+     """
+     Depth estimation using Depth Anything v2
+     """
+     def __init__(self, model_size='small', device=None):
+         """
+         Initialize the depth estimator
+
+         Args:
+             model_size (str): Model size ('small', 'base', 'large')
+             device (str): Device to run inference on ('cuda', 'cpu', 'mps')
+         """
+         # Determine device
+         if device is None:
+             if torch.cuda.is_available():
+                 device = 'cuda'
+             elif hasattr(torch, 'backends') and hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
+                 device = 'mps'
+             else:
+                 device = 'cpu'
+
+         self.device = device
+
+         # Set MPS fallback for operations not supported on Apple Silicon
+         if self.device == 'mps':
+             print("Using MPS device with CPU fallback for unsupported operations")
+             os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1'
+             # For Depth Anything v2, use CPU directly due to MPS compatibility issues
+             self.pipe_device = 'cpu'
+             print("Forcing CPU for depth estimation pipeline due to MPS compatibility issues")
+         else:
+             self.pipe_device = self.device
+
+         print(f"Using device: {self.device} for depth estimation (pipeline on {self.pipe_device})")
+
+         # Map model size to model name
+         model_map = {
+             'small': 'depth-anything/Depth-Anything-V2-Small-hf',
+             'base': 'depth-anything/Depth-Anything-V2-Base-hf',
+             'large': 'depth-anything/Depth-Anything-V2-Large-hf'
+         }
+
+         model_name = model_map.get(model_size.lower(), model_map['small'])
+
+         # Create pipeline
+         try:
+             self.pipe = pipeline(task="depth-estimation", model=model_name, device=self.pipe_device)
+             print(f"Loaded Depth Anything v2 {model_size} model on {self.pipe_device}")
+         except Exception as e:
+             # Fall back to CPU if there are issues
+             print(f"Error loading model on {self.pipe_device}: {e}")
+             print("Falling back to CPU for depth estimation")
+             self.pipe_device = 'cpu'
+             self.pipe = pipeline(task="depth-estimation", model=model_name, device=self.pipe_device)
+             print(f"Loaded Depth Anything v2 {model_size} model on CPU (fallback)")
+
+     def estimate_depth(self, image):
+         """
+         Estimate depth from an image
+
+         Args:
+             image (numpy.ndarray): Input image (BGR format)
+
+         Returns:
+             numpy.ndarray: Depth map (normalized to 0-1)
+         """
+         # Convert BGR to RGB
+         image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+
+         # Convert to PIL Image
+         pil_image = Image.fromarray(image_rgb)
+
+         # Get depth map
+         try:
+             depth_result = self.pipe(pil_image)
+             depth_map = depth_result["depth"]
+
+             # Convert PIL Image to numpy array if needed
+             if isinstance(depth_map, Image.Image):
+                 depth_map = np.array(depth_map)
+             elif isinstance(depth_map, torch.Tensor):
+                 depth_map = depth_map.cpu().numpy()
+         except RuntimeError as e:
+             # Handle potential MPS errors during inference
+             if self.device == 'mps':
+                 print(f"MPS error during depth estimation: {e}")
+                 print("Temporarily falling back to CPU for this frame")
+                 # Create a CPU pipeline for this frame
+                 cpu_pipe = pipeline(task="depth-estimation", model=self.pipe.model.config._name_or_path, device='cpu')
+                 depth_result = cpu_pipe(pil_image)
+                 depth_map = depth_result["depth"]
+
+                 # Convert PIL Image to numpy array if needed
+                 if isinstance(depth_map, Image.Image):
+                     depth_map = np.array(depth_map)
+                 elif isinstance(depth_map, torch.Tensor):
+                     depth_map = depth_map.cpu().numpy()
+             else:
+                 # Re-raise the error if not on MPS
+                 raise
+
+         # Normalize depth map to 0-1
+         depth_min = depth_map.min()
+         depth_max = depth_map.max()
+         if depth_max > depth_min:
+             depth_map = (depth_map - depth_min) / (depth_max - depth_min)
+
+         return depth_map
+
+     def colorize_depth(self, depth_map, cmap=cv2.COLORMAP_INFERNO):
+         """
+         Colorize depth map for visualization
+
+         Args:
+             depth_map (numpy.ndarray): Depth map (normalized to 0-1)
+             cmap (int): OpenCV colormap
+
+         Returns:
+             numpy.ndarray: Colorized depth map (BGR format)
+         """
+         depth_map_uint8 = (depth_map * 255).astype(np.uint8)
+         colored_depth = cv2.applyColorMap(depth_map_uint8, cmap)
+         return colored_depth
+
+     def get_depth_at_point(self, depth_map, x, y):
+         """
+         Get depth value at a specific point
+
+         Args:
+             depth_map (numpy.ndarray): Depth map
+             x (int): X coordinate
+             y (int): Y coordinate
+
+         Returns:
+             float: Depth value at (x, y)
+         """
+         if 0 <= y < depth_map.shape[0] and 0 <= x < depth_map.shape[1]:
+             return depth_map[y, x]
+         return 0.0
+
+     def get_depth_in_region(self, depth_map, bbox, method='median'):
+         """
+         Get depth value in a region defined by a bounding box
+
+         Args:
+             depth_map (numpy.ndarray): Depth map
+             bbox (list): Bounding box [x1, y1, x2, y2]
+             method (str): Method to compute depth ('median', 'mean', 'min')
+
+         Returns:
+             float: Depth value in the region
+         """
+         x1, y1, x2, y2 = [int(coord) for coord in bbox]
+
+         # Ensure coordinates are within image bounds
+         x1 = max(0, x1)
+         y1 = max(0, y1)
+         x2 = min(depth_map.shape[1] - 1, x2)
+         y2 = min(depth_map.shape[0] - 1, y2)
+
+         # Extract region
+         region = depth_map[y1:y2, x1:x2]
+
+         if region.size == 0:
+             return 0.0
+
+         # Compute depth based on method
+         if method == 'median':
+             return float(np.median(region))
+         elif method == 'mean':
+             return float(np.mean(region))
+         elif method == 'min':
+             return float(np.min(region))
+         else:
+             return float(np.median(region))
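The region-based depth lookup in `get_depth_in_region` can be exercised on a synthetic depth map. This sketch reproduces the method's clamping and reduction logic with plain NumPy so it can run without the model:

```python
import numpy as np

# Synthetic normalized depth map: a 6x6 vertical gradient from 0 (top) to 1 (bottom)
depth_map = np.linspace(0.0, 1.0, 6).reshape(6, 1) * np.ones((6, 6))

def depth_in_region(depth_map, bbox, method='median'):
    """Median/mean/min depth inside [x1, y1, x2, y2], clamped to the map,
    mirroring DepthEstimator.get_depth_in_region."""
    x1, y1, x2, y2 = [int(c) for c in bbox]
    x1, y1 = max(0, x1), max(0, y1)
    x2 = min(depth_map.shape[1] - 1, x2)
    y2 = min(depth_map.shape[0] - 1, y2)
    region = depth_map[y1:y2, x1:x2]
    if region.size == 0:
        return 0.0
    reducer = {'median': np.median, 'mean': np.mean, 'min': np.min}
    return float(reducer.get(method, np.median)(region))

print(depth_in_region(depth_map, (0, 0, 6, 6)))  # 0.4 (median of rows 0-4)
```

The median reduction makes the per-object depth robust to background pixels inside the box, which is why `run.py` prefers it for rigid objects.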
detection_model.py ADDED
@@ -0,0 +1,243 @@
+ import os
+ import torch
+ import numpy as np
+ import cv2
+ from ultralytics import YOLO
+ from collections import deque
+
+ class ObjectDetector:
+     """
+     Object detection using YOLOv11 from Ultralytics
+     """
+     def __init__(self, model_size='small', conf_thres=0.25, iou_thres=0.45, classes=None, device=None):
+         """
+         Initialize the object detector
+
+         Args:
+             model_size (str): Model size ('nano', 'small', 'medium', 'large', 'extra')
+             conf_thres (float): Confidence threshold for detections
+             iou_thres (float): IoU threshold for NMS
+             classes (list): List of classes to detect (None for all classes)
+             device (str): Device to run inference on ('cuda', 'cpu', 'mps')
+         """
+         # Determine device
+         if device is None:
+             if torch.cuda.is_available():
+                 device = 'cuda'
+             elif hasattr(torch, 'backends') and hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
+                 device = 'mps'
+             else:
+                 device = 'cpu'
+
+         self.device = device
+
+         # Set MPS fallback for operations not supported on Apple Silicon
+         if self.device == 'mps':
+             print("Using MPS device with CPU fallback for unsupported operations")
+             os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1'
+
+         print(f"Using device: {self.device} for object detection")
+
+         # Map model size to model name
+         model_map = {
+             'nano': 'yolo11n',
+             'small': 'yolo11s',
+             'medium': 'yolo11m',
+             'large': 'yolo11l',
+             'extra': 'yolo11x'
+         }
+
+         model_name = model_map.get(model_size.lower(), model_map['small'])
+
+         # Load model
+         try:
+             self.model = YOLO(model_name)
+             print(f"Loaded YOLOv11 {model_size} model on {self.device}")
+         except Exception as e:
+             print(f"Error loading model: {e}")
+             print("Trying to load with default settings...")
+             self.model = YOLO(model_name)
+
+         # Set model parameters
+         self.model.overrides['conf'] = conf_thres
+         self.model.overrides['iou'] = iou_thres
+         self.model.overrides['agnostic_nms'] = False
+         self.model.overrides['max_det'] = 1000
+
+         if classes is not None:
+             self.model.overrides['classes'] = classes
+
+         # Initialize tracking trajectories
+         self.tracking_trajectories = {}
+
+     def detect(self, image, track=True):
+         """
+         Detect objects in an image
+
+         Args:
+             image (numpy.ndarray): Input image (BGR format)
+             track (bool): Whether to track objects across frames
+
+         Returns:
+             tuple: (annotated_image, detections)
+                 - annotated_image (numpy.ndarray): Image with detections drawn
+                 - detections (list): List of detections [bbox, score, class_id, object_id]
+         """
+         detections = []
+
+         # Make a copy of the image for annotation
+         annotated_image = image.copy()
+
+         try:
+             if track:
+                 # Run inference with tracking
+                 results = self.model.track(image, verbose=False, device=self.device, persist=True)
+             else:
+                 # Run inference without tracking
+                 results = self.model.predict(image, verbose=False, device=self.device)
+         except RuntimeError as e:
+             # Handle potential MPS errors
+             if self.device == 'mps' and "not currently implemented for the MPS device" in str(e):
+                 print(f"MPS error during detection: {e}")
+                 print("Falling back to CPU for this frame")
+                 if track:
+                     results = self.model.track(image, verbose=False, device='cpu', persist=True)
+                 else:
+                     results = self.model.predict(image, verbose=False, device='cpu')
+             else:
+                 # Re-raise the error if not MPS or not an implementation error
+                 raise
+
+         if track:
+             # Clean up trajectories for objects that are no longer tracked
+             current_ids = [int(bbox.id) for predictions in results if predictions is not None
+                            for bbox in predictions.boxes if bbox.id is not None]
+             for id_ in list(self.tracking_trajectories.keys()):
+                 if id_ not in current_ids:
+                     del self.tracking_trajectories[id_]
+
+             # Process results
+             for predictions in results:
+                 if predictions is None:
+                     continue
+                 if predictions.boxes is None:
+                     continue
+
+                 # Process boxes
+                 for bbox in predictions.boxes:
+                     # Extract information
+                     scores = bbox.conf
+                     classes = bbox.cls
+                     bbox_coords = bbox.xyxy
+
+                     # Check if tracking IDs are available
+                     if hasattr(bbox, 'id') and bbox.id is not None:
+                         ids = bbox.id
+                     else:
+                         ids = [None] * len(scores)
+
+                     # Process each detection
+                     for score, class_id, bbox_coord, id_ in zip(scores, classes, bbox_coords, ids):
+                         xmin, ymin, xmax, ymax = bbox_coord.cpu().numpy()
+
+                         # Add to detections list
+                         detections.append([
+                             [xmin, ymin, xmax, ymax],              # bbox
+                             float(score),                          # confidence score
+                             int(class_id),                         # class id
+                             int(id_) if id_ is not None else None  # object id
+                         ])
+
+                         # Draw bounding box
+                         cv2.rectangle(annotated_image,
+                                       (int(xmin), int(ymin)),
+                                       (int(xmax), int(ymax)),
+                                       (0, 0, 255), 2)
+
+                         # Add label
+                         label = f"ID: {int(id_) if id_ is not None else 'N/A'} {predictions.names[int(class_id)]} {float(score):.2f}"
+                         dim, baseline = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 1)
+                         cv2.rectangle(annotated_image,
+                                       (int(xmin), int(ymin)),
+                                       (int(xmin) + dim[0], int(ymin) - dim[1] - baseline),
+                                       (30, 30, 30), cv2.FILLED)
+                         cv2.putText(annotated_image, label,
+                                     (int(xmin), int(ymin) - 7),
+                                     cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 1)
+
+                         # Update tracking trajectories
+                         if id_ is not None:
+                             centroid_x = (xmin + xmax) / 2
+                             centroid_y = (ymin + ymax) / 2
+
+                             if int(id_) not in self.tracking_trajectories:
+                                 self.tracking_trajectories[int(id_)] = deque(maxlen=10)
+
+                             self.tracking_trajectories[int(id_)].append((centroid_x, centroid_y))
+
+             # Draw trajectories
+             for id_, trajectory in self.tracking_trajectories.items():
+                 for i in range(1, len(trajectory)):
+                     thickness = int(2 * (i / len(trajectory)) + 1)
+                     cv2.line(annotated_image,
+                              (int(trajectory[i-1][0]), int(trajectory[i-1][1])),
+                              (int(trajectory[i][0]), int(trajectory[i][1])),
+                              (255, 255, 255), thickness)
+
+         else:
+             # Process results for non-tracking mode
+             for predictions in results:
+                 if predictions is None:
+                     continue
+                 if predictions.boxes is None:
+                     continue
+
+                 # Process boxes
+                 for bbox in predictions.boxes:
+                     # Extract information
+                     scores = bbox.conf
+                     classes = bbox.cls
+                     bbox_coords = bbox.xyxy
+
+                     # Process each detection
+                     for score, class_id, bbox_coord in zip(scores, classes, bbox_coords):
+                         xmin, ymin, xmax, ymax = bbox_coord.cpu().numpy()
+
+                         # Add to detections list
+                         detections.append([
+                             [xmin, ymin, xmax, ymax],  # bbox
+                             float(score),              # confidence score
+                             int(class_id),             # class id
+                             None                       # object id (None when not tracking)
+                         ])
+
+                         # Draw bounding box
+                         cv2.rectangle(annotated_image,
+                                       (int(xmin), int(ymin)),
+                                       (int(xmax), int(ymax)),
+                                       (0, 0, 255), 2)
+
+                         # Add label
+                         label = f"{predictions.names[int(class_id)]} {float(score):.2f}"
+                         dim, baseline = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 1)
+                         cv2.rectangle(annotated_image,
+                                       (int(xmin), int(ymin)),
+                                       (int(xmin) + dim[0], int(ymin) - dim[1] - baseline),
+                                       (30, 30, 30), cv2.FILLED)
+                         cv2.putText(annotated_image, label,
+                                     (int(xmin), int(ymin) - 7),
+                                     cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 1)
+
+         return annotated_image, detections
+
+     def get_class_names(self):
+         """
+         Get the names of the classes that the model can detect
+
+         Returns:
+             dict: Mapping from class id to class name
+         """
+         return self.model.names
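Downstream code consumes the `[bbox, score, class_id, object_id]` records returned by `detect()`. A minimal, hypothetical consumer is sketched below; the class-id mapping and the detection values are made up for illustration, but the record layout matches `detect()`:

```python
def filter_detections(detections, class_names, keep=('person', 'car'),
                      min_score=0.5):
    """Keep detections whose class name and confidence pass the thresholds.

    `detections` follows ObjectDetector.detect():
    [bbox, score, class_id, object_id].
    """
    out = []
    for bbox, score, class_id, obj_id in detections:
        if class_names[class_id] in keep and score >= min_score:
            out.append((class_names[class_id], bbox, score, obj_id))
    return out

# Hypothetical class map and detections for illustration
names = {0: 'person', 2: 'car', 56: 'chair'}
dets = [
    [[10, 20, 50, 80], 0.91, 0, 7],       # person, tracked id 7
    [[100, 40, 180, 90], 0.42, 2, None],  # car, below min_score
    [[5, 5, 25, 25], 0.88, 56, None],     # chair, class not kept
]
print(filter_detections(dets, names))  # [('person', [10, 20, 50, 80], 0.91, 7)]
```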
load_camera_params.py ADDED
@@ -0,0 +1,122 @@
+ #!/usr/bin/env python3
+ import os
+ import json
+ import numpy as np
+ from pathlib import Path
+
+ def load_camera_params(params_file):
+     """
+     Load camera parameters from a JSON file.
+
+     Args:
+         params_file (str): Path to the JSON file containing camera parameters
+
+     Returns:
+         dict: Dictionary containing camera parameters, or None on failure
+     """
+     if not os.path.exists(params_file):
+         print(f"Warning: Camera parameters file {params_file} not found. Using default parameters.")
+         return None
+
+     try:
+         with open(params_file, 'r') as f:
+             params = json.load(f)
+
+         # Convert lists to numpy arrays
+         params['camera_matrix'] = np.array(params['camera_matrix'])
+         params['dist_coeffs'] = np.array(params['dist_coeffs'])
+         params['projection_matrix'] = np.array(params['projection_matrix'])
+
+         print(f"Loaded camera parameters from {params_file}")
+         print(f"Camera matrix:\n{params['camera_matrix']}")
+         print(f"Projection matrix:\n{params['projection_matrix']}")
+
+         return params
+
+     except Exception as e:
+         print(f"Error loading camera parameters: {e}")
+         return None
+
+ def create_projection_matrix(camera_matrix, R=None, t=None):
+     """
+     Create a projection matrix from camera intrinsic and extrinsic parameters.
+
+     Args:
+         camera_matrix (numpy.ndarray): Camera intrinsic matrix (3x3)
+         R (numpy.ndarray): Rotation matrix (3x3)
+         t (numpy.ndarray): Translation vector (3x1)
+
+     Returns:
+         numpy.ndarray: Projection matrix (3x4)
+     """
+     if R is None:
+         R = np.eye(3)
+
+     if t is None:
+         t = np.zeros((3, 1))
+
+     # Combine rotation and translation
+     RT = np.hstack((R, t))
+
+     # Create projection matrix
+     projection_matrix = camera_matrix @ RT
+
+     return projection_matrix
+
+ def apply_camera_params_to_estimator(bbox3d_estimator, params):
+     """
+     Apply camera parameters to a 3D bounding box estimator.
+
+     Args:
+         bbox3d_estimator: BBox3DEstimator instance
+         params (dict): Dictionary containing camera parameters
+
+     Returns:
+         bbox3d_estimator: Updated BBox3DEstimator instance
+     """
+     if params is None:
+         print("Warning: No camera parameters provided. Using default parameters.")
+         return bbox3d_estimator
+
+     # Update camera matrix
+     if 'camera_matrix' in params:
+         bbox3d_estimator.K = params['camera_matrix']
+
+     # Update projection matrix
+     if 'projection_matrix' in params:
+         bbox3d_estimator.P = params['projection_matrix']
+
+     print("Applied camera parameters to 3D bounding box estimator")
+
+     return bbox3d_estimator
+
+ def main():
+     """Example usage of the camera parameter functions."""
+     # Configuration variables (modify these as needed)
+     # ===============================================
+
+     # Input file
+     params_file = "camera_params.json"  # Path to camera parameters JSON file
+
+     # Camera position (for example purposes)
+     camera_height = 1.65  # Camera height above ground in meters
+     # ===============================================
+
+     # Load camera parameters
+     params = load_camera_params(params_file)
+
+     if params:
+         print("\nCamera Parameters:")
+         print(f"Image dimensions: {params['image_width']}x{params['image_height']}")
+         print(f"Reprojection error: {params['reprojection_error']}")
+
+         # Example of creating a projection matrix with different extrinsic parameters
+         print(f"\nExample: Creating a projection matrix with the camera raised {camera_height}m above the ground")
+         R = np.eye(3)
+         t = np.array([[0], [camera_height], [0]])  # Camera above the ground
+
+         projection_matrix = create_projection_matrix(params['camera_matrix'], R, t)
+         print(f"New projection matrix:\n{projection_matrix}")
+
+ if __name__ == "__main__":
+     main()
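Because `create_projection_matrix` is pure NumPy, its behavior is easy to check: with identity rotation the result is `[K | K t]`, so raising the camera only changes the last column. The intrinsics below are hypothetical values chosen for the example:

```python
import numpy as np

def create_projection_matrix(camera_matrix, R=None, t=None):
    """P = K [R | t]; identity extrinsics give P = [K | 0]."""
    R = np.eye(3) if R is None else R
    t = np.zeros((3, 1)) if t is None else t
    return camera_matrix @ np.hstack((R, t))

# Hypothetical intrinsics: 800 px focal length, 640x480 principal point
K = np.array([[800.0, 0.0, 320.0],
              [0.0, 800.0, 240.0],
              [0.0, 0.0, 1.0]])

# Camera raised 1.65 m above the ground, as in the main() example
t = np.array([[0.0], [1.65], [0.0]])
P = create_projection_matrix(K, t=t)
# P is 3x4; its left 3x3 block equals K, its last column equals K @ t
```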
requirements.txt ADDED
@@ -0,0 +1,15 @@
+ torch>=2.0.0
+ torchvision>=0.15.0
+ opencv-python>=4.7.0
+ numpy>=1.22.0
+ ultralytics>=8.0.0  # For YOLOv11
+ timm>=0.9.2  # Required for Depth Anything v2
+ matplotlib>=3.7.0
+ pillow>=9.4.0
+ tqdm>=4.65.0
+ scipy>=1.10.0
+ filterpy>=1.4.5  # For Kalman filtering in tracking
+ lap>=0.4.0  # For the Hungarian algorithm in tracking
+ scikit-image>=0.20.0
+ pyyaml>=6.0
+ requests>=2.28.0
run.py ADDED
@@ -0,0 +1,333 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ import os
3
+ import sys
4
+ import time
5
+ import cv2
6
+ import numpy as np
7
+ import torch
8
+ from pathlib import Path
9
+
10
+ # Set MPS fallback for operations not supported on Apple Silicon
11
+ if hasattr(torch, 'backends') and hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
12
+ os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1'
13
+
14
+ # Import our modules
15
+ from detection_model import ObjectDetector
16
+ from depth_model import DepthEstimator
17
+ from bbox3d_utils import BBox3DEstimator, BirdEyeView
18
+ from load_camera_params import load_camera_params, apply_camera_params_to_estimator
19
+
20
+ def main():
21
+ """Main function."""
22
+ # Configuration variables (modify these as needed)
23
+ # ===============================================
24
+
25
+ # Input/Output
26
+ source = 0 # Path to input video file or webcam index (0 for default camera)
27
+ output_path = "output.mp4" # Path to output video file
28
+
29
+ # Model settings
30
+ yolo_model_size = "nano" # YOLOv11 model size: "nano", "small", "medium", "large", "extra"
31
+ depth_model_size = "small" # Depth Anything v2 model size: "small", "base", "large"
32
+
33
+ # Device settings
34
+ device = 'cpu' # Force CPU for stability
35
+
36
+ # Detection settings
37
+ conf_threshold = 0.25 # Confidence threshold for object detection
38
+ iou_threshold = 0.45 # IoU threshold for NMS
39
+ classes = None # Filter by class, e.g., [0, 1, 2] for specific classes, None for all classes
40
+
41
+ # Feature toggles
42
+ enable_tracking = True # Enable object tracking
43
+ enable_bev = True # Enable Bird's Eye View visualization
44
+ enable_pseudo_3d = True # Enable pseudo-3D visualization
45
+
46
+ # Camera parameters - simplified approach
47
+ camera_params_file = None # Path to camera parameters file (None to use default parameters)
48
+ # ===============================================
49
+
50
+ print(f"Using device: {device}")
51
+
52
+ # Initialize models
53
+ print("Initializing models...")
54
+ try:
55
+ detector = ObjectDetector(
56
+ model_size=yolo_model_size,
57
+ conf_thres=conf_threshold,
58
+ iou_thres=iou_threshold,
59
+ classes=classes,
60
+ device=device
61
+ )
62
+ except Exception as e:
63
+ print(f"Error initializing object detector: {e}")
64
+ print("Falling back to CPU for object detection")
65
+ detector = ObjectDetector(
66
+ model_size=yolo_model_size,
67
+ conf_thres=conf_threshold,
68
+ iou_thres=iou_threshold,
69
+ classes=classes,
70
+ device='cpu'
71
+ )
72
+
73
+ try:
74
+ depth_estimator = DepthEstimator(
75
+ model_size=depth_model_size,
76
+ device=device
77
+ )
78
+ except Exception as e:
79
+ print(f"Error initializing depth estimator: {e}")
80
+ print("Falling back to CPU for depth estimation")
81
+ depth_estimator = DepthEstimator(
82
+ model_size=depth_model_size,
83
+ device='cpu'
84
+ )
85
+
86
+ # Initialize 3D bounding box estimator with default parameters
87
+ # Simplified approach - focus on 2D detection with depth information
88
+ bbox3d_estimator = BBox3DEstimator()
89
+
90
+ # Initialize Bird's Eye View if enabled
91
+ if enable_bev:
92
+ # Use a scale that works well for the 1-5 meter range
93
+ bev = BirdEyeView(scale=60, size=(300, 300)) # Increased scale to spread objects out
94
+
95
+ # Open video source
96
+ try:
97
+ if isinstance(source, str) and source.isdigit():
98
+ source = int(source) # Convert string number to integer for webcam
99
+ except ValueError:
100
+ pass # Keep as string (for video file)
101
+
102
+ print(f"Opening video source: {source}")
103
+ cap = cv2.VideoCapture(source)
104
+
105
+ if not cap.isOpened():
106
+ print(f"Error: Could not open video source {source}")
107
+ return
108
+
109
+ # Get video properties
110
+ width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
111
+ height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
112
+ fps = int(cap.get(cv2.CAP_PROP_FPS))
113
+ if fps == 0: # Sometimes happens with webcams
114
+ fps = 30
115
+
116
+ # Initialize video writer
117
+ fourcc = cv2.VideoWriter_fourcc(*'mp4v')
118
+ out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))
119
+
120
+ # Initialize variables for FPS calculation
121
+ frame_count = 0
122
+ start_time = time.time()
123
+ fps_display = "FPS: --"
124
+
125
+ print("Starting processing...")
126
+
127
+ # Main loop
128
+ while True:
129
+ # Check for key press at the beginning of each loop
130
+ key = cv2.waitKey(1)
131
+ if key == ord('q') or key == 27 or (key & 0xFF) == ord('q') or (key & 0xFF) == 27:
132
+ print("Exiting program...")
133
+ break
134
+
135
+ try:
136
+ # Read frame
137
+ ret, frame = cap.read()
138
+ if not ret:
139
+ break
140
+
141
+ # Make copies for different visualizations
142
+ original_frame = frame.copy()
143
+ detection_frame = frame.copy()
144
+ depth_frame = frame.copy()
145
+ result_frame = frame.copy()
146
+
147
+ # Step 1: Object Detection
148
+ try:
149
+ detection_frame, detections = detector.detect(detection_frame, track=enable_tracking)
150
+ except Exception as e:
151
+ print(f"Error during object detection: {e}")
152
+ detections = []
153
+ cv2.putText(detection_frame, "Detection Error", (10, 60),
154
+ cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 0, 255), 2)
155
+
156
+ # Step 2: Depth Estimation
157
+ try:
158
+ depth_map = depth_estimator.estimate_depth(original_frame)
159
+ depth_colored = depth_estimator.colorize_depth(depth_map)
160
+ except Exception as e:
161
+ print(f"Error during depth estimation: {e}")
162
+ # Create a dummy depth map
163
+ depth_map = np.zeros((height, width), dtype=np.float32)
164
+ depth_colored = np.zeros((height, width, 3), dtype=np.uint8)
165
+ cv2.putText(depth_colored, "Depth Error", (10, 60),
166
+ cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 0, 255), 2)
167
+
168
+ # Step 3: 3D Bounding Box Estimation
169
+ boxes_3d = []
170
+ active_ids = []
171
+
172
+ for detection in detections:
173
+ try:
174
+ bbox, score, class_id, obj_id = detection
175
+
176
+ # Get class name
177
+ class_name = detector.get_class_names()[class_id]
178
+
179
+ # Get depth in the region of the bounding box
180
+ # Try different methods for depth estimation
181
+ if class_name.lower() in ['person', 'cat', 'dog']:
182
+ # For people and animals, use the center point depth
183
+ center_x = int((bbox[0] + bbox[2]) / 2)
184
+ center_y = int((bbox[1] + bbox[3]) / 2)
185
+ depth_value = depth_estimator.get_depth_at_point(depth_map, center_x, center_y)
186
+ depth_method = 'center'
187
+ else:
188
+ # For other objects, use the median depth in the region
189
+ depth_value = depth_estimator.get_depth_in_region(depth_map, bbox, method='median')
190
+ depth_method = 'median'
191
+
192
+ # Create a simplified 3D box representation
193
+ box_3d = {
194
+ 'bbox_2d': bbox,
195
+ 'depth_value': depth_value,
196
+ 'depth_method': depth_method,
197
+ 'class_name': class_name,
198
+ 'object_id': obj_id,
199
+ 'score': score
200
+ }
201
+
202
+ boxes_3d.append(box_3d)
203
+
204
+ # Keep track of active IDs for tracker cleanup
205
+ if obj_id is not None:
206
+ active_ids.append(obj_id)
207
+ except Exception as e:
208
+ print(f"Error processing detection: {e}")
209
+ continue
210
+
211
+ # Clean up trackers for objects that are no longer detected
212
+ bbox3d_estimator.cleanup_trackers(active_ids)
213
+
214
+ # Step 4: Visualization
215
+ # Draw boxes on the result frame
216
+ for box_3d in boxes_3d:
217
+ try:
218
+ # Determine color based on class
219
+ class_name = box_3d['class_name'].lower()
220
+ if 'car' in class_name or 'vehicle' in class_name:
221
+ color = (0, 0, 255) # Red
222
+ elif 'person' in class_name:
223
+ color = (0, 255, 0) # Green
224
+ elif 'bicycle' in class_name or 'motorcycle' in class_name:
225
+ color = (255, 0, 0) # Blue
226
+ elif 'potted plant' in class_name or 'plant' in class_name:
227
+ color = (0, 255, 255) # Yellow
228
+ else:
229
+ color = (255, 255, 255) # White
230
+
231
+ # Draw box with depth information
232
+ result_frame = bbox3d_estimator.draw_box_3d(result_frame, box_3d, color=color)
233
+ except Exception as e:
234
+ print(f"Error drawing box: {e}")
235
+ continue
236
+
237
+ # Draw Bird's Eye View if enabled
238
+ if enable_bev:
239
+ try:
240
+ # Reset BEV and draw objects
241
+ bev.reset()
242
+ for box_3d in boxes_3d:
243
+ bev.draw_box(box_3d)
244
+ bev_image = bev.get_image()
245
+
246
+ # Resize BEV image to fit in the corner of the result frame
247
+ bev_height = height // 4 # Reduced from height/3 to height/4 for better fit
248
+ bev_width = bev_height
249
+
250
+ # Ensure dimensions are valid
251
+ if bev_height > 0 and bev_width > 0:
252
+ # Resize BEV image
253
+ bev_resized = cv2.resize(bev_image, (bev_width, bev_height))
254
+
255
+ # Create a region of interest in the result frame
256
+ roi = result_frame[height - bev_height:height, 0:bev_width]
257
+
258
+ # Simple overlay - just copy the BEV image to the ROI
259
+ result_frame[height - bev_height:height, 0:bev_width] = bev_resized
260
+
261
+ # Add a border around the BEV visualization
262
+ cv2.rectangle(result_frame,
263
+ (0, height - bev_height),
264
+ (bev_width, height),
265
+ (255, 255, 255), 1)
266
+
267
+ # Add a title to the BEV visualization
268
+ cv2.putText(result_frame, "Bird's Eye View",
269
+ (10, height - bev_height + 20),
270
+ cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 1)
271
+ except Exception as e:
272
+ print(f"Error drawing BEV: {e}")
273
+
+             # Calculate and display FPS
+             frame_count += 1
+             if frame_count % 10 == 0:  # Update the FPS estimate every 10 frames
+                 end_time = time.time()
+                 elapsed_time = end_time - start_time
+                 fps_value = frame_count / elapsed_time
+                 fps_display = f"FPS: {fps_value:.1f}"
+ 
+             # Add FPS and device info to the result frame
+             cv2.putText(result_frame, f"{fps_display} | Device: {device}", (10, 30),
+                         cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 0, 255), 2)
+ 
+             # Add the depth map to the top-left corner of the result frame
+             try:
+                 depth_height = height // 4
+                 depth_width = depth_height * width // height  # Preserve the frame's aspect ratio
+                 depth_resized = cv2.resize(depth_colored, (depth_width, depth_height))
+                 result_frame[0:depth_height, 0:depth_width] = depth_resized
+             except Exception as e:
+                 print(f"Error adding depth map to result: {e}")
+ 
+             # Write the frame to the output video
+             out.write(result_frame)
+ 
+             # Display frames
+             cv2.imshow("3D Object Detection", result_frame)
+             cv2.imshow("Depth Map", depth_colored)
+             cv2.imshow("Object Detection", detection_frame)
+ 
+             # Check for a key press again at the end of the loop ('q' or Esc quits)
+             key = cv2.waitKey(1)
+             if (key & 0xFF) in (ord('q'), 27):
+                 print("Exiting program...")
+                 break
+ 
+         except Exception as e:
+             print(f"Error processing frame: {e}")
+             # Also check for a key press while handling the exception
+             key = cv2.waitKey(1)
+             if (key & 0xFF) in (ord('q'), 27):
+                 print("Exiting program...")
+                 break
+             continue
+ 
+     # Clean up
+     print("Cleaning up resources...")
+     cap.release()
+     out.release()
+     cv2.destroyAllWindows()
+ 
+     print(f"Processing complete. Output saved to {output_path}")
+ 
+ if __name__ == "__main__":
+     try:
+         main()
+     except KeyboardInterrupt:
+         print("\nProgram interrupted by user (Ctrl+C)")
+         # Clean up OpenCV windows
+         cv2.destroyAllWindows()
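Both insets in the loop above (the BEV view in the bottom-left corner and the depth map in the top-left) rely on the same picture-in-picture pattern: resize a small image, then assign it into a corner slice of the frame array. A minimal NumPy sketch of that pattern, with an illustrative helper name and shapes that are not taken from `run.py`:

```python
import numpy as np

def overlay_in_corner(frame, inset, corner="bottom-left"):
    """Copy a small image into a corner ROI of a larger frame, in place."""
    h, w = inset.shape[:2]
    H, W = frame.shape[:2]
    if h > H or w > W:
        raise ValueError("inset is larger than the frame")
    if corner == "bottom-left":
        frame[H - h:H, 0:w] = inset   # same slice pattern as the BEV inset
    elif corner == "top-left":
        frame[0:h, 0:w] = inset       # same slice pattern as the depth inset
    else:
        raise ValueError(f"unsupported corner: {corner}")
    return frame

frame = np.zeros((480, 640, 3), dtype=np.uint8)      # stand-in for result_frame
inset = np.full((120, 120, 3), 255, dtype=np.uint8)  # stand-in for bev_resized
overlay_in_corner(frame, inset, "bottom-left")
```

Because the slice assignment writes into `frame` directly, no separate ROI copy is needed; this is why the loop can skip any compositing step and still show the inset in the output.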