Commit ·
a277f69
0
Parent(s):
Initial commit with all files at root level
Browse files
- .gitignore +31 -0
- README.md +96 -0
- bbox3d_utils.py +799 -0
- depth_model.py +184 -0
- detection_model.py +243 -0
- load_camera_params.py +122 -0
- requirements.txt +15 -0
- run.py +333 -0
.gitignore
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Python cache files
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.py[cod]
|
| 4 |
+
*$py.class
|
| 5 |
+
|
| 6 |
+
# Model files
|
| 7 |
+
*.pt
|
| 8 |
+
*.pth
|
| 9 |
+
*.onnx
|
| 10 |
+
*.tflite
|
| 11 |
+
*.pb
|
| 12 |
+
|
| 13 |
+
# Video files
|
| 14 |
+
*.mp4
|
| 15 |
+
*.avi
|
| 16 |
+
*.mov
|
| 17 |
+
*.mkv
|
| 18 |
+
|
| 19 |
+
# Environment
|
| 20 |
+
.env
|
| 21 |
+
.venv
|
| 22 |
+
env/
|
| 23 |
+
venv/
|
| 24 |
+
ENV/
|
| 25 |
+
|
| 26 |
+
# Logs
|
| 27 |
+
*.log
|
| 28 |
+
|
| 29 |
+
# OS specific
|
| 30 |
+
.DS_Store
|
| 31 |
+
Thumbs.db
|
README.md
ADDED
|
@@ -0,0 +1,96 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# YOLO-3D
|
| 2 |
+
|
| 3 |
+
A real-time 3D object detection system that combines YOLOv11 for object detection with Depth Anything v2 for depth estimation to create pseudo-3D bounding boxes and bird's eye view visualization.
|
| 4 |
+
|
| 5 |
+
## Features
|
| 6 |
+
|
| 7 |
+
- Real-time object detection using YOLOv11
|
| 8 |
+
- Depth estimation using Depth Anything v2
|
| 9 |
+
- 3D bounding box visualization
|
| 10 |
+
- Bird's Eye View (BEV) visualization
|
| 11 |
+
- Object tracking capabilities
|
| 12 |
+
- Support for video files and webcam input
|
| 13 |
+
- Adjustable model sizes for performance/accuracy tradeoffs
|
| 14 |
+
|
| 15 |
+
## Requirements
|
| 16 |
+
|
| 17 |
+
- Python 3.8+
|
| 18 |
+
- PyTorch 2.0+
|
| 19 |
+
- OpenCV
|
| 20 |
+
- NumPy
|
| 21 |
+
- Other dependencies listed in `requirements.txt`
|
| 22 |
+
|
| 23 |
+
## Installation
|
| 24 |
+
|
| 25 |
+
1. Clone this repository:
|
| 26 |
+
```
|
| 27 |
+
git clone https://github.com/niconielsen32/YOLO-3D.git
|
| 28 |
+
cd YOLO-3D
|
| 29 |
+
```
|
| 30 |
+
|
| 31 |
+
2. Install dependencies:
|
| 32 |
+
```
|
| 33 |
+
pip install -r requirements.txt
|
| 34 |
+
```
|
| 35 |
+
|
| 36 |
+
3. Download model weights (will be downloaded automatically on first run)
|
| 37 |
+
|
| 38 |
+
## Usage
|
| 39 |
+
|
| 40 |
+
Run the main script:
|
| 41 |
+
|
| 42 |
+
```bash
|
| 43 |
+
python run.py
|
| 44 |
+
```
|
| 45 |
+
|
| 46 |
+
### Configuration Options
|
| 47 |
+
|
| 48 |
+
You can modify the following parameters in `run.py`:
|
| 49 |
+
|
| 50 |
+
- **Input/Output**:
|
| 51 |
+
- `source`: Path to input video file or webcam index (0 for default camera)
|
| 52 |
+
- `output_path`: Path to output video file
|
| 53 |
+
|
| 54 |
+
- **Model Settings**:
|
| 55 |
+
- `yolo_model_size`: YOLOv11 model size ("nano", "small", "medium", "large", "extra")
|
| 56 |
+
- `depth_model_size`: Depth Anything v2 model size ("small", "base", "large")
|
| 57 |
+
|
| 58 |
+
- **Detection Settings**:
|
| 59 |
+
- `conf_threshold`: Confidence threshold for object detection
|
| 60 |
+
- `iou_threshold`: IoU threshold for NMS
|
| 61 |
+
- `classes`: Filter by class, e.g., [0, 1, 2] for specific classes, None for all classes
|
| 62 |
+
|
| 63 |
+
- **Feature Toggles**:
|
| 64 |
+
- `enable_tracking`: Enable object tracking
|
| 65 |
+
- `enable_bev`: Enable Bird's Eye View visualization
|
| 66 |
+
- `enable_pseudo_3d`: Enable pseudo-3D visualization
|
| 67 |
+
|
| 68 |
+
## Project Structure
|
| 69 |
+
|
| 70 |
+
```
|
| 71 |
+
YOLO-3D/
|
| 72 |
+
|
| 73 |
+
├── run.py # Main script
|
| 74 |
+
├── detection_model.py # YOLOv11 object detection
|
| 75 |
+
├── depth_model.py # Depth Anything v2 depth estimation
|
| 76 |
+
├── bbox3d_utils.py # 3D bounding box utilities
|
| 77 |
+
├── load_camera_params.py # Camera parameter utilities
|
| 78 |
+
├── requirements.txt # Project dependencies
|
| 79 |
+
└── README.md # This file
|
| 80 |
+
```
|
| 81 |
+
|
| 82 |
+
## How It Works
|
| 83 |
+
|
| 84 |
+
1. **Object Detection**: YOLOv11 detects objects in the frame and provides 2D bounding boxes
|
| 85 |
+
2. **Depth Estimation**: Depth Anything v2 generates a depth map for the entire frame
|
| 86 |
+
3. **3D Box Estimation**: Combines 2D boxes with depth information to create 3D boxes
|
| 87 |
+
4. **Visualization**: Renders 3D boxes and bird's eye view for better spatial understanding
|
| 88 |
+
|
| 89 |
+
## License
|
| 90 |
+
|
| 91 |
+
[MIT License](LICENSE)
|
| 92 |
+
|
| 93 |
+
## Acknowledgments
|
| 94 |
+
|
| 95 |
+
- YOLOv11 by Ultralytics
|
| 96 |
+
- Depth Anything v2 by Yang et al. (University of Hong Kong / TikTok)
|
bbox3d_utils.py
ADDED
|
@@ -0,0 +1,799 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
import cv2
|
| 3 |
+
from scipy.spatial.transform import Rotation as R
|
| 4 |
+
from filterpy.kalman import KalmanFilter
|
| 5 |
+
from collections import defaultdict
|
| 6 |
+
import math
|
| 7 |
+
|
| 8 |
+
# Default camera intrinsic matrix (3x3); callers may supply their own.
DEFAULT_K = np.array([
    [718.856, 0.0, 607.1928],
    [0.0, 718.856, 185.2157],
    [0.0, 0.0, 1.0],
])

# Default camera projection matrix (3x4); callers may supply their own.
DEFAULT_P = np.array([
    [718.856, 0.0, 607.1928, 45.38225],
    [0.0, 718.856, 185.2157, -0.1130887],
    [0.0, 0.0, 1.0, 0.003779761],
])

# Class-prior object dimensions as (height, width, length) in meters.
DEFAULT_DIMS = {
    # Road users
    'car': np.array([1.52, 1.64, 3.85]),
    'truck': np.array([3.07, 2.63, 11.17]),
    'bus': np.array([3.07, 2.63, 11.17]),
    'motorcycle': np.array([1.50, 0.90, 2.20]),
    'bicycle': np.array([1.40, 0.70, 1.80]),
    'person': np.array([1.75, 0.60, 0.60]),  # Adjusted width/length for person
    # Animals
    'dog': np.array([0.80, 0.50, 1.10]),
    'cat': np.array([0.40, 0.30, 0.70]),
    # Indoor objects
    'potted plant': np.array([0.80, 0.40, 0.40]),  # Reduced size for indoor plants
    'plant': np.array([0.80, 0.40, 0.40]),         # Alias for potted plant
    'chair': np.array([0.80, 0.60, 0.60]),
    'sofa': np.array([0.80, 0.85, 2.00]),
    'table': np.array([0.75, 1.20, 1.20]),
    'bed': np.array([0.60, 1.50, 2.00]),
    'tv': np.array([0.80, 0.15, 1.20]),
    'laptop': np.array([0.02, 0.25, 0.35]),
    'keyboard': np.array([0.03, 0.15, 0.45]),
    'mouse': np.array([0.03, 0.06, 0.10]),
    'book': np.array([0.03, 0.20, 0.15]),
    'bottle': np.array([0.25, 0.10, 0.10]),
    'cup': np.array([0.10, 0.08, 0.08]),
    'vase': np.array([0.30, 0.15, 0.15]),
}
| 48 |
+
|
| 49 |
+
class BBox3DEstimator:
|
| 50 |
+
"""
|
| 51 |
+
3D bounding box estimation from 2D detections and depth
|
| 52 |
+
"""
|
| 53 |
+
def __init__(self, camera_matrix=None, projection_matrix=None, class_dims=None):
|
| 54 |
+
"""
|
| 55 |
+
Initialize the 3D bounding box estimator
|
| 56 |
+
|
| 57 |
+
Args:
|
| 58 |
+
camera_matrix (numpy.ndarray): Camera intrinsic matrix (3x3)
|
| 59 |
+
projection_matrix (numpy.ndarray): Camera projection matrix (3x4)
|
| 60 |
+
class_dims (dict): Dictionary mapping class names to dimensions (height, width, length)
|
| 61 |
+
"""
|
| 62 |
+
self.K = camera_matrix if camera_matrix is not None else DEFAULT_K
|
| 63 |
+
self.P = projection_matrix if projection_matrix is not None else DEFAULT_P
|
| 64 |
+
self.dims = class_dims if class_dims is not None else DEFAULT_DIMS
|
| 65 |
+
|
| 66 |
+
# Initialize Kalman filters for tracking 3D boxes
|
| 67 |
+
self.kf_trackers = {}
|
| 68 |
+
|
| 69 |
+
# Store history of 3D boxes for filtering
|
| 70 |
+
self.box_history = defaultdict(list)
|
| 71 |
+
self.max_history = 5
|
| 72 |
+
|
| 73 |
+
def estimate_3d_box(self, bbox_2d, depth_value, class_name, object_id=None):
|
| 74 |
+
"""
|
| 75 |
+
Estimate 3D bounding box from 2D bounding box and depth
|
| 76 |
+
|
| 77 |
+
Args:
|
| 78 |
+
bbox_2d (list): 2D bounding box [x1, y1, x2, y2]
|
| 79 |
+
depth_value (float): Depth value at the center of the bounding box
|
| 80 |
+
class_name (str): Class name of the object
|
| 81 |
+
object_id (int): Object ID for tracking (None for no tracking)
|
| 82 |
+
|
| 83 |
+
Returns:
|
| 84 |
+
dict: 3D bounding box parameters
|
| 85 |
+
"""
|
| 86 |
+
# Get 2D box center and dimensions
|
| 87 |
+
x1, y1, x2, y2 = bbox_2d
|
| 88 |
+
center_x = (x1 + x2) / 2
|
| 89 |
+
center_y = (y1 + y2) / 2
|
| 90 |
+
width_2d = x2 - x1
|
| 91 |
+
height_2d = y2 - y1
|
| 92 |
+
|
| 93 |
+
# Get dimensions for the class
|
| 94 |
+
if class_name.lower() in self.dims:
|
| 95 |
+
dimensions = self.dims[class_name.lower()].copy() # Make a copy to avoid modifying the original
|
| 96 |
+
else:
|
| 97 |
+
# Use default car dimensions if class not found
|
| 98 |
+
dimensions = self.dims['car'].copy()
|
| 99 |
+
|
| 100 |
+
# Adjust dimensions based on 2D box aspect ratio and size
|
| 101 |
+
aspect_ratio_2d = width_2d / height_2d if height_2d > 0 else 1.0
|
| 102 |
+
|
| 103 |
+
# For plants, adjust dimensions based on 2D box
|
| 104 |
+
if 'plant' in class_name.lower() or 'potted plant' in class_name.lower():
|
| 105 |
+
# Scale height based on 2D box height
|
| 106 |
+
dimensions[0] = height_2d / 120 # Convert pixels to meters with a scaling factor
|
| 107 |
+
# Make width and length proportional to height
|
| 108 |
+
dimensions[1] = dimensions[0] * 0.6 # width
|
| 109 |
+
dimensions[2] = dimensions[0] * 0.6 # length
|
| 110 |
+
|
| 111 |
+
# For people, adjust dimensions based on 2D box
|
| 112 |
+
elif 'person' in class_name.lower():
|
| 113 |
+
# Scale height based on 2D box height
|
| 114 |
+
dimensions[0] = height_2d / 100 # Convert pixels to meters with a scaling factor
|
| 115 |
+
# Make width and length proportional to height
|
| 116 |
+
dimensions[1] = dimensions[0] * 0.3 # width
|
| 117 |
+
dimensions[2] = dimensions[0] * 0.3 # length
|
| 118 |
+
|
| 119 |
+
# Convert depth to distance - use a larger range for better visualization
|
| 120 |
+
# Map depth_value (0-1) to a range of 1-10 meters
|
| 121 |
+
distance = 1.0 + depth_value * 9.0 # Increased from 4.0 to 9.0 for a larger range
|
| 122 |
+
|
| 123 |
+
# Calculate 3D location
|
| 124 |
+
location = self._backproject_point(center_x, center_y, distance)
|
| 125 |
+
|
| 126 |
+
# For plants, adjust y-coordinate to place them on a surface
|
| 127 |
+
if 'plant' in class_name.lower() or 'potted plant' in class_name.lower():
|
| 128 |
+
# Assume plants are on a surface (e.g., table, floor)
|
| 129 |
+
# Adjust y-coordinate based on the bottom of the 2D bounding box
|
| 130 |
+
bottom_y = y2 # Bottom of the 2D box
|
| 131 |
+
location[1] = self._backproject_point(center_x, bottom_y, distance)[1]
|
| 132 |
+
|
| 133 |
+
# Estimate orientation
|
| 134 |
+
orientation = self._estimate_orientation(bbox_2d, location, class_name)
|
| 135 |
+
|
| 136 |
+
# Create 3D box
|
| 137 |
+
box_3d = {
|
| 138 |
+
'dimensions': dimensions,
|
| 139 |
+
'location': location,
|
| 140 |
+
'orientation': orientation,
|
| 141 |
+
'bbox_2d': bbox_2d,
|
| 142 |
+
'object_id': object_id,
|
| 143 |
+
'class_name': class_name
|
| 144 |
+
}
|
| 145 |
+
|
| 146 |
+
# Apply Kalman filtering if tracking is enabled
|
| 147 |
+
if object_id is not None:
|
| 148 |
+
box_3d = self._apply_kalman_filter(box_3d, object_id)
|
| 149 |
+
|
| 150 |
+
# Add to history for temporal filtering
|
| 151 |
+
self.box_history[object_id].append(box_3d)
|
| 152 |
+
if len(self.box_history[object_id]) > self.max_history:
|
| 153 |
+
self.box_history[object_id].pop(0)
|
| 154 |
+
|
| 155 |
+
# Apply temporal filtering
|
| 156 |
+
box_3d = self._apply_temporal_filter(object_id)
|
| 157 |
+
|
| 158 |
+
return box_3d
|
| 159 |
+
|
| 160 |
+
def _backproject_point(self, x, y, depth):
|
| 161 |
+
"""
|
| 162 |
+
Backproject a 2D point to 3D space
|
| 163 |
+
|
| 164 |
+
Args:
|
| 165 |
+
x (float): X coordinate in image space
|
| 166 |
+
y (float): Y coordinate in image space
|
| 167 |
+
depth (float): Depth value
|
| 168 |
+
|
| 169 |
+
Returns:
|
| 170 |
+
numpy.ndarray: 3D point (x, y, z) in camera coordinates
|
| 171 |
+
"""
|
| 172 |
+
# Create homogeneous coordinates
|
| 173 |
+
point_2d = np.array([x, y, 1.0])
|
| 174 |
+
|
| 175 |
+
# Backproject to 3D
|
| 176 |
+
# The z-coordinate is the depth
|
| 177 |
+
# The x and y coordinates are calculated using the inverse of the camera matrix
|
| 178 |
+
point_3d = np.linalg.inv(self.K) @ point_2d * depth
|
| 179 |
+
|
| 180 |
+
# For indoor scenes, adjust the y-coordinate to be more realistic
|
| 181 |
+
# In camera coordinates, y is typically pointing down
|
| 182 |
+
# Adjust y to place objects at a reasonable height
|
| 183 |
+
# This is a simplification - in a real system, this would be more sophisticated
|
| 184 |
+
point_3d[1] = point_3d[1] * 0.5 # Scale down y-coordinate
|
| 185 |
+
|
| 186 |
+
return point_3d
|
| 187 |
+
|
| 188 |
+
def _estimate_orientation(self, bbox_2d, location, class_name):
|
| 189 |
+
"""
|
| 190 |
+
Estimate orientation of the object
|
| 191 |
+
|
| 192 |
+
Args:
|
| 193 |
+
bbox_2d (list): 2D bounding box [x1, y1, x2, y2]
|
| 194 |
+
location (numpy.ndarray): 3D location of the object
|
| 195 |
+
class_name (str): Class name of the object
|
| 196 |
+
|
| 197 |
+
Returns:
|
| 198 |
+
float: Orientation angle in radians
|
| 199 |
+
"""
|
| 200 |
+
# Calculate ray from camera to object center
|
| 201 |
+
theta_ray = np.arctan2(location[0], location[2])
|
| 202 |
+
|
| 203 |
+
# For plants and stationary objects, orientation doesn't matter much
|
| 204 |
+
# Just use a fixed orientation aligned with the camera view
|
| 205 |
+
if 'plant' in class_name.lower() or 'potted plant' in class_name.lower():
|
| 206 |
+
# Plants typically don't have a specific orientation
|
| 207 |
+
# Just use the ray angle
|
| 208 |
+
return theta_ray
|
| 209 |
+
|
| 210 |
+
# For people, they might be facing the camera
|
| 211 |
+
if 'person' in class_name.lower():
|
| 212 |
+
# Assume person is facing the camera
|
| 213 |
+
alpha = 0.0
|
| 214 |
+
else:
|
| 215 |
+
# For other objects, use the 2D box aspect ratio to estimate orientation
|
| 216 |
+
x1, y1, x2, y2 = bbox_2d
|
| 217 |
+
width = x2 - x1
|
| 218 |
+
height = y2 - y1
|
| 219 |
+
aspect_ratio = width / height if height > 0 else 1.0
|
| 220 |
+
|
| 221 |
+
# If the object is wide, it might be facing sideways
|
| 222 |
+
if aspect_ratio > 1.5:
|
| 223 |
+
# Object is wide, might be facing sideways
|
| 224 |
+
# Use the position relative to the image center to guess orientation
|
| 225 |
+
image_center_x = self.K[0, 2] # Principal point x
|
| 226 |
+
if (x1 + x2) / 2 < image_center_x:
|
| 227 |
+
# Object is on the left side of the image
|
| 228 |
+
alpha = np.pi / 2 # Facing right
|
| 229 |
+
else:
|
| 230 |
+
# Object is on the right side of the image
|
| 231 |
+
alpha = -np.pi / 2 # Facing left
|
| 232 |
+
else:
|
| 233 |
+
# Object has normal proportions, assume it's facing the camera
|
| 234 |
+
alpha = 0.0
|
| 235 |
+
|
| 236 |
+
# Global orientation
|
| 237 |
+
rot_y = alpha + theta_ray
|
| 238 |
+
|
| 239 |
+
return rot_y
|
| 240 |
+
|
| 241 |
+
def _init_kalman_filter(self, box_3d):
|
| 242 |
+
"""
|
| 243 |
+
Initialize a Kalman filter for a new object
|
| 244 |
+
|
| 245 |
+
Args:
|
| 246 |
+
box_3d (dict): 3D bounding box parameters
|
| 247 |
+
|
| 248 |
+
Returns:
|
| 249 |
+
filterpy.kalman.KalmanFilter: Initialized Kalman filter
|
| 250 |
+
"""
|
| 251 |
+
# State: [x, y, z, width, height, length, yaw, vx, vy, vz, vyaw]
|
| 252 |
+
kf = KalmanFilter(dim_x=11, dim_z=7)
|
| 253 |
+
|
| 254 |
+
# Initial state
|
| 255 |
+
kf.x = np.array([
|
| 256 |
+
box_3d['location'][0],
|
| 257 |
+
box_3d['location'][1],
|
| 258 |
+
box_3d['location'][2],
|
| 259 |
+
box_3d['dimensions'][1], # width
|
| 260 |
+
box_3d['dimensions'][0], # height
|
| 261 |
+
box_3d['dimensions'][2], # length
|
| 262 |
+
box_3d['orientation'],
|
| 263 |
+
0, 0, 0, 0 # Initial velocities
|
| 264 |
+
])
|
| 265 |
+
|
| 266 |
+
# State transition matrix (motion model)
|
| 267 |
+
dt = 1.0 # Time step
|
| 268 |
+
kf.F = np.eye(11)
|
| 269 |
+
kf.F[0, 7] = dt # x += vx * dt
|
| 270 |
+
kf.F[1, 8] = dt # y += vy * dt
|
| 271 |
+
kf.F[2, 9] = dt # z += vz * dt
|
| 272 |
+
kf.F[6, 10] = dt # yaw += vyaw * dt
|
| 273 |
+
|
| 274 |
+
# Measurement function
|
| 275 |
+
kf.H = np.zeros((7, 11))
|
| 276 |
+
kf.H[0, 0] = 1 # x
|
| 277 |
+
kf.H[1, 1] = 1 # y
|
| 278 |
+
kf.H[2, 2] = 1 # z
|
| 279 |
+
kf.H[3, 3] = 1 # width
|
| 280 |
+
kf.H[4, 4] = 1 # height
|
| 281 |
+
kf.H[5, 5] = 1 # length
|
| 282 |
+
kf.H[6, 6] = 1 # yaw
|
| 283 |
+
|
| 284 |
+
# Measurement uncertainty
|
| 285 |
+
kf.R = np.eye(7) * 0.1
|
| 286 |
+
kf.R[0:3, 0:3] *= 1.0 # Location uncertainty
|
| 287 |
+
kf.R[3:6, 3:6] *= 0.1 # Dimension uncertainty
|
| 288 |
+
kf.R[6, 6] = 0.3 # Orientation uncertainty
|
| 289 |
+
|
| 290 |
+
# Process uncertainty
|
| 291 |
+
kf.Q = np.eye(11) * 0.1
|
| 292 |
+
kf.Q[7:11, 7:11] *= 0.5 # Velocity uncertainty
|
| 293 |
+
|
| 294 |
+
# Initial state uncertainty
|
| 295 |
+
kf.P = np.eye(11) * 1.0
|
| 296 |
+
kf.P[7:11, 7:11] *= 10.0 # Velocity uncertainty
|
| 297 |
+
|
| 298 |
+
return kf
|
| 299 |
+
|
| 300 |
+
def _apply_kalman_filter(self, box_3d, object_id):
|
| 301 |
+
"""
|
| 302 |
+
Apply Kalman filtering to smooth 3D box parameters
|
| 303 |
+
|
| 304 |
+
Args:
|
| 305 |
+
box_3d (dict): 3D bounding box parameters
|
| 306 |
+
object_id (int): Object ID for tracking
|
| 307 |
+
|
| 308 |
+
Returns:
|
| 309 |
+
dict: Filtered 3D bounding box parameters
|
| 310 |
+
"""
|
| 311 |
+
# Initialize Kalman filter if this is a new object
|
| 312 |
+
if object_id not in self.kf_trackers:
|
| 313 |
+
self.kf_trackers[object_id] = self._init_kalman_filter(box_3d)
|
| 314 |
+
|
| 315 |
+
# Get the Kalman filter for this object
|
| 316 |
+
kf = self.kf_trackers[object_id]
|
| 317 |
+
|
| 318 |
+
# Predict
|
| 319 |
+
kf.predict()
|
| 320 |
+
|
| 321 |
+
# Update with measurement
|
| 322 |
+
measurement = np.array([
|
| 323 |
+
box_3d['location'][0],
|
| 324 |
+
box_3d['location'][1],
|
| 325 |
+
box_3d['location'][2],
|
| 326 |
+
box_3d['dimensions'][1], # width
|
| 327 |
+
box_3d['dimensions'][0], # height
|
| 328 |
+
box_3d['dimensions'][2], # length
|
| 329 |
+
box_3d['orientation']
|
| 330 |
+
])
|
| 331 |
+
|
| 332 |
+
kf.update(measurement)
|
| 333 |
+
|
| 334 |
+
# Update box_3d with filtered values
|
| 335 |
+
filtered_box = box_3d.copy()
|
| 336 |
+
filtered_box['location'] = np.array([kf.x[0], kf.x[1], kf.x[2]])
|
| 337 |
+
filtered_box['dimensions'] = np.array([kf.x[4], kf.x[3], kf.x[5]]) # height, width, length
|
| 338 |
+
filtered_box['orientation'] = kf.x[6]
|
| 339 |
+
|
| 340 |
+
return filtered_box
|
| 341 |
+
|
| 342 |
+
def _apply_temporal_filter(self, object_id):
|
| 343 |
+
"""
|
| 344 |
+
Apply temporal filtering to smooth 3D box parameters over time
|
| 345 |
+
|
| 346 |
+
Args:
|
| 347 |
+
object_id (int): Object ID for tracking
|
| 348 |
+
|
| 349 |
+
Returns:
|
| 350 |
+
dict: Temporally filtered 3D bounding box parameters
|
| 351 |
+
"""
|
| 352 |
+
history = self.box_history[object_id]
|
| 353 |
+
|
| 354 |
+
if len(history) < 2:
|
| 355 |
+
return history[-1]
|
| 356 |
+
|
| 357 |
+
# Get the most recent box
|
| 358 |
+
current_box = history[-1]
|
| 359 |
+
|
| 360 |
+
# Apply exponential moving average to location and orientation
|
| 361 |
+
alpha = 0.7 # Weight for current measurement (higher = less smoothing)
|
| 362 |
+
|
| 363 |
+
# Initialize with current values
|
| 364 |
+
filtered_box = current_box.copy()
|
| 365 |
+
|
| 366 |
+
# Apply EMA to location and orientation
|
| 367 |
+
for i in range(len(history) - 2, -1, -1):
|
| 368 |
+
weight = alpha * (1 - alpha) ** (len(history) - i - 2)
|
| 369 |
+
filtered_box['location'] = filtered_box['location'] * (1 - weight) + history[i]['location'] * weight
|
| 370 |
+
|
| 371 |
+
# Handle orientation wrapping
|
| 372 |
+
angle_diff = history[i]['orientation'] - filtered_box['orientation']
|
| 373 |
+
if angle_diff > np.pi:
|
| 374 |
+
angle_diff -= 2 * np.pi
|
| 375 |
+
elif angle_diff < -np.pi:
|
| 376 |
+
angle_diff += 2 * np.pi
|
| 377 |
+
|
| 378 |
+
filtered_box['orientation'] += angle_diff * weight
|
| 379 |
+
|
| 380 |
+
return filtered_box
|
| 381 |
+
|
| 382 |
+
def project_box_3d_to_2d(self, box_3d):
|
| 383 |
+
"""
|
| 384 |
+
Project 3D bounding box corners to 2D image space
|
| 385 |
+
|
| 386 |
+
Args:
|
| 387 |
+
box_3d (dict): 3D bounding box parameters
|
| 388 |
+
|
| 389 |
+
Returns:
|
| 390 |
+
numpy.ndarray: 2D points of the 3D box corners (8x2)
|
| 391 |
+
"""
|
| 392 |
+
# Extract parameters
|
| 393 |
+
h, w, l = box_3d['dimensions']
|
| 394 |
+
x, y, z = box_3d['location']
|
| 395 |
+
rot_y = box_3d['orientation']
|
| 396 |
+
class_name = box_3d['class_name'].lower()
|
| 397 |
+
|
| 398 |
+
# Get 2D box for reference
|
| 399 |
+
x1, y1, x2, y2 = box_3d['bbox_2d']
|
| 400 |
+
center_x = (x1 + x2) / 2
|
| 401 |
+
center_y = (y1 + y2) / 2
|
| 402 |
+
width_2d = x2 - x1
|
| 403 |
+
height_2d = y2 - y1
|
| 404 |
+
|
| 405 |
+
# Create rotation matrix
|
| 406 |
+
R_mat = np.array([
|
| 407 |
+
[np.cos(rot_y), 0, np.sin(rot_y)],
|
| 408 |
+
[0, 1, 0],
|
| 409 |
+
[-np.sin(rot_y), 0, np.cos(rot_y)]
|
| 410 |
+
])
|
| 411 |
+
|
| 412 |
+
# 3D bounding box corners
|
| 413 |
+
# For plants and stationary objects, make the box more centered
|
| 414 |
+
if 'plant' in class_name or 'potted plant' in class_name:
|
| 415 |
+
# For plants, center the box on the plant
|
| 416 |
+
x_corners = np.array([l/2, l/2, -l/2, -l/2, l/2, l/2, -l/2, -l/2])
|
| 417 |
+
y_corners = np.array([h/2, h/2, h/2, h/2, -h/2, -h/2, -h/2, -h/2]) # Center vertically
|
| 418 |
+
z_corners = np.array([w/2, -w/2, -w/2, w/2, w/2, -w/2, -w/2, w/2])
|
| 419 |
+
else:
|
| 420 |
+
# For other objects, use standard box configuration
|
| 421 |
+
x_corners = np.array([l/2, l/2, -l/2, -l/2, l/2, l/2, -l/2, -l/2])
|
| 422 |
+
y_corners = np.array([0, 0, 0, 0, -h, -h, -h, -h]) # Bottom at y=0
|
| 423 |
+
z_corners = np.array([w/2, -w/2, -w/2, w/2, w/2, -w/2, -w/2, w/2])
|
| 424 |
+
|
| 425 |
+
# Rotate and translate corners
|
| 426 |
+
corners_3d = np.vstack([x_corners, y_corners, z_corners])
|
| 427 |
+
corners_3d = R_mat @ corners_3d
|
| 428 |
+
corners_3d[0, :] += x
|
| 429 |
+
corners_3d[1, :] += y
|
| 430 |
+
corners_3d[2, :] += z
|
| 431 |
+
|
| 432 |
+
# Project to 2D
|
| 433 |
+
corners_3d_homo = np.vstack([corners_3d, np.ones((1, 8))])
|
| 434 |
+
corners_2d_homo = self.P @ corners_3d_homo
|
| 435 |
+
corners_2d = corners_2d_homo[:2, :] / corners_2d_homo[2, :]
|
| 436 |
+
|
| 437 |
+
# Constrain the 3D box to be within a reasonable distance of the 2D box
|
| 438 |
+
# This helps prevent wildly incorrect projections
|
| 439 |
+
mean_x = np.mean(corners_2d[0, :])
|
| 440 |
+
mean_y = np.mean(corners_2d[1, :])
|
| 441 |
+
|
| 442 |
+
# If the projected box is too far from the 2D box center, adjust it
|
| 443 |
+
if abs(mean_x - center_x) > width_2d or abs(mean_y - center_y) > height_2d:
|
| 444 |
+
# Shift the projected points to center on the 2D box
|
| 445 |
+
shift_x = center_x - mean_x
|
| 446 |
+
shift_y = center_y - mean_y
|
| 447 |
+
corners_2d[0, :] += shift_x
|
| 448 |
+
corners_2d[1, :] += shift_y
|
| 449 |
+
|
| 450 |
+
return corners_2d.T
|
| 451 |
+
|
| 452 |
+
def draw_box_3d(self, image, box_3d, color=(0, 255, 0), thickness=2):
|
| 453 |
+
"""
|
| 454 |
+
Draw enhanced 3D bounding box on image with better depth perception
|
| 455 |
+
|
| 456 |
+
Args:
|
| 457 |
+
image (numpy.ndarray): Image to draw on
|
| 458 |
+
box_3d (dict): 3D bounding box parameters
|
| 459 |
+
color (tuple): Color in BGR format
|
| 460 |
+
thickness (int): Line thickness
|
| 461 |
+
|
| 462 |
+
Returns:
|
| 463 |
+
numpy.ndarray: Image with 3D box drawn
|
| 464 |
+
"""
|
| 465 |
+
# Get 2D box coordinates
|
| 466 |
+
x1, y1, x2, y2 = [int(coord) for coord in box_3d['bbox_2d']]
|
| 467 |
+
|
| 468 |
+
# Get depth value for scaling
|
| 469 |
+
depth_value = box_3d.get('depth_value', 0.5)
|
| 470 |
+
|
| 471 |
+
# Calculate box dimensions
|
| 472 |
+
width = x2 - x1
|
| 473 |
+
height = y2 - y1
|
| 474 |
+
|
| 475 |
+
# Calculate the offset for the 3D effect (deeper objects have smaller offset)
|
| 476 |
+
# Inverse relationship with depth - closer objects have larger offset
|
| 477 |
+
offset_factor = 1.0 - depth_value
|
| 478 |
+
offset_x = int(width * 0.3 * offset_factor)
|
| 479 |
+
offset_y = int(height * 0.3 * offset_factor)
|
| 480 |
+
|
| 481 |
+
# Ensure minimum offset for visibility
|
| 482 |
+
offset_x = max(15, min(offset_x, 50))
|
| 483 |
+
offset_y = max(15, min(offset_y, 50))
|
| 484 |
+
|
| 485 |
+
# Create points for the 3D box
|
| 486 |
+
# Front face (the 2D bounding box)
|
| 487 |
+
front_tl = (x1, y1)
|
| 488 |
+
front_tr = (x2, y1)
|
| 489 |
+
front_br = (x2, y2)
|
| 490 |
+
front_bl = (x1, y2)
|
| 491 |
+
|
| 492 |
+
# Back face (offset by depth)
|
| 493 |
+
back_tl = (x1 + offset_x, y1 - offset_y)
|
| 494 |
+
back_tr = (x2 + offset_x, y1 - offset_y)
|
| 495 |
+
back_br = (x2 + offset_x, y2 - offset_y)
|
| 496 |
+
back_bl = (x1 + offset_x, y2 - offset_y)
|
| 497 |
+
|
| 498 |
+
# Create a slightly transparent copy of the image for the 3D effect
|
| 499 |
+
overlay = image.copy()
|
| 500 |
+
|
| 501 |
+
# Draw the front face (2D bounding box)
|
| 502 |
+
cv2.rectangle(image, front_tl, front_br, color, thickness)
|
| 503 |
+
|
| 504 |
+
# Draw the connecting lines between front and back faces
|
| 505 |
+
cv2.line(image, front_tl, back_tl, color, thickness)
|
| 506 |
+
cv2.line(image, front_tr, back_tr, color, thickness)
|
| 507 |
+
cv2.line(image, front_br, back_br, color, thickness)
|
| 508 |
+
cv2.line(image, front_bl, back_bl, color, thickness)
|
| 509 |
+
|
| 510 |
+
# Draw the back face
|
| 511 |
+
cv2.line(image, back_tl, back_tr, color, thickness)
|
| 512 |
+
cv2.line(image, back_tr, back_br, color, thickness)
|
| 513 |
+
cv2.line(image, back_br, back_bl, color, thickness)
|
| 514 |
+
cv2.line(image, back_bl, back_tl, color, thickness)
|
| 515 |
+
|
| 516 |
+
# Fill the top face with a semi-transparent color to enhance 3D effect
|
| 517 |
+
pts_top = np.array([front_tl, front_tr, back_tr, back_tl], np.int32)
|
| 518 |
+
pts_top = pts_top.reshape((-1, 1, 2))
|
| 519 |
+
cv2.fillPoly(overlay, [pts_top], color)
|
| 520 |
+
|
| 521 |
+
# Fill the right face with a semi-transparent color
|
| 522 |
+
pts_right = np.array([front_tr, front_br, back_br, back_tr], np.int32)
|
| 523 |
+
pts_right = pts_right.reshape((-1, 1, 2))
|
| 524 |
+
# Darken the right face color for better 3D effect
|
| 525 |
+
right_color = (int(color[0] * 0.7), int(color[1] * 0.7), int(color[2] * 0.7))
|
| 526 |
+
cv2.fillPoly(overlay, [pts_right], right_color)
|
| 527 |
+
|
| 528 |
+
# Apply the overlay with transparency
|
| 529 |
+
alpha = 0.3 # Transparency factor
|
| 530 |
+
cv2.addWeighted(overlay, alpha, image, 1 - alpha, 0, image)
|
| 531 |
+
|
| 532 |
+
# Get class name and object ID
|
| 533 |
+
class_name = box_3d['class_name']
|
| 534 |
+
obj_id = box_3d['object_id'] if 'object_id' in box_3d else None
|
| 535 |
+
|
| 536 |
+
# Draw text information
|
| 537 |
+
text_y = y1 - 10
|
| 538 |
+
if obj_id is not None:
|
| 539 |
+
cv2.putText(image, f"ID:{obj_id}", (x1, text_y),
|
| 540 |
+
cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)
|
| 541 |
+
text_y -= 15
|
| 542 |
+
|
| 543 |
+
cv2.putText(image, class_name, (x1, text_y),
|
| 544 |
+
cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)
|
| 545 |
+
text_y -= 15
|
| 546 |
+
|
| 547 |
+
# Get depth information if available
|
| 548 |
+
if 'depth_value' in box_3d:
|
| 549 |
+
depth_value = box_3d['depth_value']
|
| 550 |
+
depth_method = box_3d.get('depth_method', 'unknown')
|
| 551 |
+
depth_text = f"D:{depth_value:.2f} ({depth_method})"
|
| 552 |
+
cv2.putText(image, depth_text, (x1, text_y),
|
| 553 |
+
cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)
|
| 554 |
+
text_y -= 15
|
| 555 |
+
|
| 556 |
+
# Get score if available
|
| 557 |
+
if 'score' in box_3d:
|
| 558 |
+
score = box_3d['score']
|
| 559 |
+
score_text = f"S:{score:.2f}"
|
| 560 |
+
cv2.putText(image, score_text, (x1, text_y),
|
| 561 |
+
cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)
|
| 562 |
+
|
| 563 |
+
# Draw a vertical line from the bottom of the box to the ground
|
| 564 |
+
# This helps with depth perception
|
| 565 |
+
ground_y = y2 + int(height * 0.2) # A bit below the bottom of the box
|
| 566 |
+
cv2.line(image, (int((x1 + x2) / 2), y2), (int((x1 + x2) / 2), ground_y), color, thickness)
|
| 567 |
+
|
| 568 |
+
# Draw a small circle at the bottom to represent the ground contact point
|
| 569 |
+
cv2.circle(image, (int((x1 + x2) / 2), ground_y), thickness * 2, color, -1)
|
| 570 |
+
|
| 571 |
+
return image
|
| 572 |
+
|
| 573 |
+
def cleanup_trackers(self, active_ids):
    """
    Drop per-object state (Kalman filters and box history) for objects
    whose IDs are no longer present in the active tracking set.

    Args:
        active_ids (list): IDs of objects that are still being tracked
    """
    # Build the lookup once: set membership tests are O(1).
    keep = set(active_ids)

    # Prune stale entries from both state dicts. Collect the stale keys
    # first, because the dicts are mutated while we work through them.
    for state in (self.kf_trackers, self.box_history):
        stale = [oid for oid in state if oid not in keep]
        for oid in stale:
            del state[oid]
|
| 592 |
+
|
| 593 |
+
class BirdEyeView:
    """
    Bird's Eye View (BEV) visualization.

    Renders tracked objects on a top-down canvas. The origin sits at the
    bottom-center of the image; the X-axis points upward (depth/forward)
    and the Y-axis points rightward.
    """
    def __init__(self, size=(400, 400), scale=30, camera_height=1.2):
        """
        Initialize the Bird's Eye View visualizer.

        Args:
            size (tuple): Size of the BEV image as (width, height) in pixels
            scale (float): Scale factor (pixels per meter)
            camera_height (float): Height of the camera above ground (meters)
        """
        self.width, self.height = size
        self.scale = scale
        self.camera_height = camera_height

        # Empty (black) canvas; reset() paints the grid/axes on demand.
        self.bev_image = np.zeros((self.height, self.width, 3), dtype=np.uint8)

        # Origin at the bottom center of the image, 50 px above the edge.
        self.origin_x = self.width // 2
        self.origin_y = self.height - 50

    def reset(self):
        """
        Reset the BEV canvas: dark background, grid lines, coordinate axes
        and distance tick marks for the 1-5 meter display range.
        """
        # Dark gray background
        self.bev_image = np.zeros((self.height, self.width, 3), dtype=np.uint8)
        self.bev_image[:, :] = (20, 20, 20)

        # Grid lines, at least 20 pixels apart
        grid_spacing = max(int(self.scale), 20)

        # Horizontal grid lines (walking upward from the origin)
        for y in range(self.origin_y, 0, -grid_spacing):
            cv2.line(self.bev_image, (0, y), (self.width, y), (50, 50, 50), 1)

        # Vertical grid lines
        for x in range(0, self.width, grid_spacing):
            cv2.line(self.bev_image, (x, 0), (x, self.height), (50, 50, 50), 1)

        # Coordinate axes
        axis_length = min(80, self.height // 5)

        # X-axis (upward, green)
        cv2.line(self.bev_image,
                 (self.origin_x, self.origin_y),
                 (self.origin_x, self.origin_y - axis_length),
                 (0, 200, 0), 2)

        # Y-axis (rightward, red)
        cv2.line(self.bev_image,
                 (self.origin_x, self.origin_y),
                 (self.origin_x + axis_length, self.origin_y),
                 (0, 0, 200), 2)

        # Axis labels
        cv2.putText(self.bev_image, "X",
                    (self.origin_x - 15, self.origin_y - axis_length + 15),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 200, 0), 1)
        cv2.putText(self.bev_image, "Y",
                    (self.origin_x + axis_length - 15, self.origin_y + 20),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 200), 1)

        # Distance markers for the 1-5 m range, every 0.5 m.
        # BUGFIX: the list previously mixed ints and floats (1, 1.5, ...) and
        # called dist.is_integer(); int.is_integer() only exists on
        # Python >= 3.12, so whole-meter entries raised AttributeError on
        # older interpreters. Float literals work everywhere.
        for dist in [1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0]:
            y = self.origin_y - int(dist * self.scale)

            if y < 20:  # Skip markers too close to the top edge
                continue

            # Thicker tick marks on whole meters
            thickness = 2 if dist.is_integer() else 1
            cv2.line(self.bev_image,
                     (self.origin_x - 5, y),
                     (self.origin_x + 5, y),
                     (120, 120, 120), thickness)

            # Text labels only on whole meters
            if dist.is_integer():
                cv2.putText(self.bev_image, f"{int(dist)}m",
                            (self.origin_x + 10, y + 4),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.4, (180, 180, 180), 1)

    def draw_box(self, box_3d, color=None):
        """
        Draw a schematic top-down marker for one detected object.

        Args:
            box_3d (dict): 3D bounding box parameters; reads 'class_name',
                optional 'depth_value' (normalized 0-1), optional 'bbox_2d'
                as [x1, y1, x2, y2], and optional 'object_id'
            color (tuple): BGR color (None selects a per-class default)
        """
        try:
            class_name = box_3d['class_name'].lower()

            # Map normalized depth (0-1) into the 1-5 meter display range
            depth_value = box_3d.get('depth_value', 0.5)
            depth = 1.0 + depth_value * 4.0

            # Use the 2D box width as a crude size proxy, clamped to [0.5, 2]
            if 'bbox_2d' in box_3d:
                x1, y1, x2, y2 = box_3d['bbox_2d']
                width_2d = x2 - x1
                size_factor = max(0.5, min(width_2d / 100, 2.0))
            else:
                size_factor = 1.0

            # Per-class default colors (BGR)
            if color is None:
                if 'car' in class_name or 'vehicle' in class_name:
                    color = (0, 0, 255)  # Red
                elif 'truck' in class_name or 'bus' in class_name:
                    color = (0, 165, 255)  # Orange
                elif 'person' in class_name:
                    color = (0, 255, 0)  # Green
                elif 'bicycle' in class_name or 'motorcycle' in class_name:
                    color = (255, 0, 0)  # Blue
                elif 'potted plant' in class_name or 'plant' in class_name:
                    color = (0, 255, 255)  # Yellow
                else:
                    color = (255, 255, 255)  # White

            obj_id = box_3d.get('object_id', None)

            # Depth drives the vertical (upward) BEV position
            bev_y = self.origin_y - int(depth * self.scale)

            # Horizontal BEV position from the 2D box center.
            # NOTE(review): this normalizes the 2D center by the BEV canvas
            # width, not the source frame width — correct only when the two
            # happen to match; confirm against the caller's frame size.
            if 'bbox_2d' in box_3d:
                center_x_2d = (x1 + x2) / 2
                image_width = self.bev_image.shape[1]
                rel_x = (center_x_2d / image_width) - 0.5
                bev_x = self.origin_x + int(rel_x * self.width * 0.6)
            else:
                bev_x = self.origin_x

            # Keep the marker inside the visible area
            bev_x = max(20, min(bev_x, self.width - 20))
            bev_y = max(20, min(bev_y, self.origin_y - 10))

            # Marker shape encodes the object type
            if 'person' in class_name:
                # Person: small filled circle
                radius = int(4 * size_factor)
                cv2.circle(self.bev_image, (bev_x, bev_y), radius, color, -1)

            elif 'car' in class_name or 'vehicle' in class_name or 'truck' in class_name or 'bus' in class_name:
                # Vehicle: filled rectangle, longer for trucks/buses
                rect_width = int(12 * size_factor)
                rect_length = int(18 * size_factor)
                if 'truck' in class_name or 'bus' in class_name:
                    rect_length = int(24 * size_factor)

                cv2.rectangle(self.bev_image,
                              (bev_x - rect_width // 2, bev_y - rect_length // 2),
                              (bev_x + rect_width // 2, bev_y + rect_length // 2),
                              color, -1)

            elif 'plant' in class_name or 'potted plant' in class_name:
                # Plant: larger filled circle
                radius = int(8 * size_factor)
                cv2.circle(self.bev_image, (bev_x, bev_y), radius, color, -1)

            else:
                # Default: filled square
                size = int(8 * size_factor)
                cv2.rectangle(self.bev_image,
                              (bev_x - size, bev_y - size),
                              (bev_x + size, bev_y + size),
                              color, -1)

            # Object ID label, if tracked
            if obj_id is not None:
                cv2.putText(self.bev_image, f"{obj_id}",
                            (bev_x - 5, bev_y - 5),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.4, (255, 255, 255), 1)

            # Faint line from the origin to the object for distance context
            cv2.line(self.bev_image,
                     (self.origin_x, self.origin_y),
                     (bev_x, bev_y),
                     (70, 70, 70), 1)
        except Exception as e:
            # Drawing is best-effort: one malformed box must not kill the
            # per-frame visualization loop.
            print(f"Error drawing box in BEV: {e}")

    def get_image(self):
        """
        Return the current BEV canvas.

        Returns:
            numpy.ndarray: BEV image (height x width x 3, uint8, BGR)
        """
        return self.bev_image
|
depth_model.py
ADDED
|
@@ -0,0 +1,184 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import torch
|
| 3 |
+
import torch.nn as nn
|
| 4 |
+
import torch.nn.functional as F
|
| 5 |
+
import numpy as np
|
| 6 |
+
import cv2
|
| 7 |
+
from transformers import pipeline
|
| 8 |
+
from PIL import Image
|
| 9 |
+
|
| 10 |
+
class DepthEstimator:
|
| 11 |
+
"""
|
| 12 |
+
Depth estimation using Depth Anything v2
|
| 13 |
+
"""
|
| 14 |
+
def __init__(self, model_size='small', device=None):
|
| 15 |
+
"""
|
| 16 |
+
Initialize the depth estimator
|
| 17 |
+
|
| 18 |
+
Args:
|
| 19 |
+
model_size (str): Model size ('small', 'base', 'large')
|
| 20 |
+
device (str): Device to run inference on ('cuda', 'cpu', 'mps')
|
| 21 |
+
"""
|
| 22 |
+
# Determine device
|
| 23 |
+
if device is None:
|
| 24 |
+
if torch.cuda.is_available():
|
| 25 |
+
device = 'cuda'
|
| 26 |
+
elif hasattr(torch, 'backends') and hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
|
| 27 |
+
device = 'mps'
|
| 28 |
+
else:
|
| 29 |
+
device = 'cpu'
|
| 30 |
+
|
| 31 |
+
self.device = device
|
| 32 |
+
|
| 33 |
+
# Set MPS fallback for operations not supported on Apple Silicon
|
| 34 |
+
if self.device == 'mps':
|
| 35 |
+
print("Using MPS device with CPU fallback for unsupported operations")
|
| 36 |
+
os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1'
|
| 37 |
+
# For Depth Anything v2, we'll use CPU directly due to MPS compatibility issues
|
| 38 |
+
self.pipe_device = 'cpu'
|
| 39 |
+
print("Forcing CPU for depth estimation pipeline due to MPS compatibility issues")
|
| 40 |
+
else:
|
| 41 |
+
self.pipe_device = self.device
|
| 42 |
+
|
| 43 |
+
print(f"Using device: {self.device} for depth estimation (pipeline on {self.pipe_device})")
|
| 44 |
+
|
| 45 |
+
# Map model size to model name
|
| 46 |
+
model_map = {
|
| 47 |
+
'small': 'depth-anything/Depth-Anything-V2-Small-hf',
|
| 48 |
+
'base': 'depth-anything/Depth-Anything-V2-Base-hf',
|
| 49 |
+
'large': 'depth-anything/Depth-Anything-V2-Large-hf'
|
| 50 |
+
}
|
| 51 |
+
|
| 52 |
+
model_name = model_map.get(model_size.lower(), model_map['small'])
|
| 53 |
+
|
| 54 |
+
# Create pipeline
|
| 55 |
+
try:
|
| 56 |
+
self.pipe = pipeline(task="depth-estimation", model=model_name, device=self.pipe_device)
|
| 57 |
+
print(f"Loaded Depth Anything v2 {model_size} model on {self.pipe_device}")
|
| 58 |
+
except Exception as e:
|
| 59 |
+
# Fallback to CPU if there are issues
|
| 60 |
+
print(f"Error loading model on {self.pipe_device}: {e}")
|
| 61 |
+
print("Falling back to CPU for depth estimation")
|
| 62 |
+
self.pipe_device = 'cpu'
|
| 63 |
+
self.pipe = pipeline(task="depth-estimation", model=model_name, device=self.pipe_device)
|
| 64 |
+
print(f"Loaded Depth Anything v2 {model_size} model on CPU (fallback)")
|
| 65 |
+
|
| 66 |
+
def estimate_depth(self, image):
|
| 67 |
+
"""
|
| 68 |
+
Estimate depth from an image
|
| 69 |
+
|
| 70 |
+
Args:
|
| 71 |
+
image (numpy.ndarray): Input image (BGR format)
|
| 72 |
+
|
| 73 |
+
Returns:
|
| 74 |
+
numpy.ndarray: Depth map (normalized to 0-1)
|
| 75 |
+
"""
|
| 76 |
+
# Convert BGR to RGB
|
| 77 |
+
image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
|
| 78 |
+
|
| 79 |
+
# Convert to PIL Image
|
| 80 |
+
pil_image = Image.fromarray(image_rgb)
|
| 81 |
+
|
| 82 |
+
# Get depth map
|
| 83 |
+
try:
|
| 84 |
+
depth_result = self.pipe(pil_image)
|
| 85 |
+
depth_map = depth_result["depth"]
|
| 86 |
+
|
| 87 |
+
# Convert PIL Image to numpy array if needed
|
| 88 |
+
if isinstance(depth_map, Image.Image):
|
| 89 |
+
depth_map = np.array(depth_map)
|
| 90 |
+
elif isinstance(depth_map, torch.Tensor):
|
| 91 |
+
depth_map = depth_map.cpu().numpy()
|
| 92 |
+
except RuntimeError as e:
|
| 93 |
+
# Handle potential MPS errors during inference
|
| 94 |
+
if self.device == 'mps':
|
| 95 |
+
print(f"MPS error during depth estimation: {e}")
|
| 96 |
+
print("Temporarily falling back to CPU for this frame")
|
| 97 |
+
# Create a CPU pipeline for this frame
|
| 98 |
+
cpu_pipe = pipeline(task="depth-estimation", model=self.pipe.model.config._name_or_path, device='cpu')
|
| 99 |
+
depth_result = cpu_pipe(pil_image)
|
| 100 |
+
depth_map = depth_result["depth"]
|
| 101 |
+
|
| 102 |
+
# Convert PIL Image to numpy array if needed
|
| 103 |
+
if isinstance(depth_map, Image.Image):
|
| 104 |
+
depth_map = np.array(depth_map)
|
| 105 |
+
elif isinstance(depth_map, torch.Tensor):
|
| 106 |
+
depth_map = depth_map.cpu().numpy()
|
| 107 |
+
else:
|
| 108 |
+
# Re-raise the error if not MPS
|
| 109 |
+
raise
|
| 110 |
+
|
| 111 |
+
# Normalize depth map to 0-1
|
| 112 |
+
depth_min = depth_map.min()
|
| 113 |
+
depth_max = depth_map.max()
|
| 114 |
+
if depth_max > depth_min:
|
| 115 |
+
depth_map = (depth_map - depth_min) / (depth_max - depth_min)
|
| 116 |
+
|
| 117 |
+
return depth_map
|
| 118 |
+
|
| 119 |
+
def colorize_depth(self, depth_map, cmap=cv2.COLORMAP_INFERNO):
|
| 120 |
+
"""
|
| 121 |
+
Colorize depth map for visualization
|
| 122 |
+
|
| 123 |
+
Args:
|
| 124 |
+
depth_map (numpy.ndarray): Depth map (normalized to 0-1)
|
| 125 |
+
cmap (int): OpenCV colormap
|
| 126 |
+
|
| 127 |
+
Returns:
|
| 128 |
+
numpy.ndarray: Colorized depth map (BGR format)
|
| 129 |
+
"""
|
| 130 |
+
depth_map_uint8 = (depth_map * 255).astype(np.uint8)
|
| 131 |
+
colored_depth = cv2.applyColorMap(depth_map_uint8, cmap)
|
| 132 |
+
return colored_depth
|
| 133 |
+
|
| 134 |
+
def get_depth_at_point(self, depth_map, x, y):
|
| 135 |
+
"""
|
| 136 |
+
Get depth value at a specific point
|
| 137 |
+
|
| 138 |
+
Args:
|
| 139 |
+
depth_map (numpy.ndarray): Depth map
|
| 140 |
+
x (int): X coordinate
|
| 141 |
+
y (int): Y coordinate
|
| 142 |
+
|
| 143 |
+
Returns:
|
| 144 |
+
float: Depth value at (x, y)
|
| 145 |
+
"""
|
| 146 |
+
if 0 <= y < depth_map.shape[0] and 0 <= x < depth_map.shape[1]:
|
| 147 |
+
return depth_map[y, x]
|
| 148 |
+
return 0.0
|
| 149 |
+
|
| 150 |
+
def get_depth_in_region(self, depth_map, bbox, method='median'):
|
| 151 |
+
"""
|
| 152 |
+
Get depth value in a region defined by a bounding box
|
| 153 |
+
|
| 154 |
+
Args:
|
| 155 |
+
depth_map (numpy.ndarray): Depth map
|
| 156 |
+
bbox (list): Bounding box [x1, y1, x2, y2]
|
| 157 |
+
method (str): Method to compute depth ('median', 'mean', 'min')
|
| 158 |
+
|
| 159 |
+
Returns:
|
| 160 |
+
float: Depth value in the region
|
| 161 |
+
"""
|
| 162 |
+
x1, y1, x2, y2 = [int(coord) for coord in bbox]
|
| 163 |
+
|
| 164 |
+
# Ensure coordinates are within image bounds
|
| 165 |
+
x1 = max(0, x1)
|
| 166 |
+
y1 = max(0, y1)
|
| 167 |
+
x2 = min(depth_map.shape[1] - 1, x2)
|
| 168 |
+
y2 = min(depth_map.shape[0] - 1, y2)
|
| 169 |
+
|
| 170 |
+
# Extract region
|
| 171 |
+
region = depth_map[y1:y2, x1:x2]
|
| 172 |
+
|
| 173 |
+
if region.size == 0:
|
| 174 |
+
return 0.0
|
| 175 |
+
|
| 176 |
+
# Compute depth based on method
|
| 177 |
+
if method == 'median':
|
| 178 |
+
return float(np.median(region))
|
| 179 |
+
elif method == 'mean':
|
| 180 |
+
return float(np.mean(region))
|
| 181 |
+
elif method == 'min':
|
| 182 |
+
return float(np.min(region))
|
| 183 |
+
else:
|
| 184 |
+
return float(np.median(region))
|
detection_model.py
ADDED
|
@@ -0,0 +1,243 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import torch
|
| 3 |
+
import numpy as np
|
| 4 |
+
import cv2
|
| 5 |
+
from ultralytics import YOLO
|
| 6 |
+
from collections import deque
|
| 7 |
+
|
| 8 |
+
class ObjectDetector:
    """
    Object detection using YOLOv11 from Ultralytics.

    Runs YOLO inference per frame (optionally with the built-in tracker),
    draws annotated boxes and labels onto a copy of the input image, and
    keeps a short centroid trajectory per tracked object ID.
    """
    def __init__(self, model_size='small', conf_thres=0.25, iou_thres=0.45, classes=None, device=None):
        """
        Initialize the object detector

        Args:
            model_size (str): Model size ('nano', 'small', 'medium', 'large', 'extra');
                unknown values fall back to 'small'
            conf_thres (float): Confidence threshold for detections
            iou_thres (float): IoU threshold for NMS
            classes (list): List of classes to detect (None for all classes)
            device (str): Device to run inference on ('cuda', 'cpu', 'mps');
                None auto-selects cuda > mps > cpu
        """
        # Determine device: prefer CUDA, then Apple-Silicon MPS, else CPU
        if device is None:
            if torch.cuda.is_available():
                device = 'cuda'
            elif hasattr(torch, 'backends') and hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
                device = 'mps'
            else:
                device = 'cpu'

        self.device = device

        # Set MPS fallback for operations not supported on Apple Silicon
        if self.device == 'mps':
            print("Using MPS device with CPU fallback for unsupported operations")
            os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1'

        print(f"Using device: {self.device} for object detection")

        # Map model size to model name
        model_map = {
            'nano': 'yolo11n',
            'small': 'yolo11s',
            'medium': 'yolo11m',
            'large': 'yolo11l',
            'extra': 'yolo11x'
        }

        model_name = model_map.get(model_size.lower(), model_map['small'])

        # Load model
        try:
            self.model = YOLO(model_name)
            print(f"Loaded YOLOv11 {model_size} model on {self.device}")
        except Exception as e:
            # NOTE(review): this retry is identical to the first attempt, so
            # it will raise the same error again — confirm whether different
            # fallback settings were intended here.
            print(f"Error loading model: {e}")
            print("Trying to load with default settings...")
            self.model = YOLO(model_name)

        # Set model parameters (ultralytics reads these overrides at inference)
        self.model.overrides['conf'] = conf_thres
        self.model.overrides['iou'] = iou_thres
        self.model.overrides['agnostic_nms'] = False
        self.model.overrides['max_det'] = 1000

        if classes is not None:
            self.model.overrides['classes'] = classes

        # Per-object deque of recent centroids, keyed by tracker ID
        self.tracking_trajectories = {}

    def detect(self, image, track=True):
        """
        Detect objects in an image

        Args:
            image (numpy.ndarray): Input image (BGR format)
            track (bool): Whether to track objects across frames

        Returns:
            tuple: (annotated_image, detections)
                - annotated_image (numpy.ndarray): Copy of the input with
                  boxes, labels and (when tracking) trajectories drawn
                - detections (list): List of detections
                  [bbox, score, class_id, object_id]; object_id is None
                  when tracking is off or the tracker assigned no ID
        """
        detections = []

        # Make a copy of the image for annotation (the input stays untouched)
        annotated_image = image.copy()

        try:
            if track:
                # Run inference with tracking (persist keeps IDs across frames)
                results = self.model.track(image, verbose=False, device=self.device, persist=True)
            else:
                # Run inference without tracking
                results = self.model.predict(image, verbose=False, device=self.device)
        except RuntimeError as e:
            # Handle potential MPS errors: retry this frame on CPU when the
            # failure is a missing MPS operator implementation
            if self.device == 'mps' and "not currently implemented for the MPS device" in str(e):
                print(f"MPS error during detection: {e}")
                print("Falling back to CPU for this frame")
                if track:
                    results = self.model.track(image, verbose=False, device='cpu', persist=True)
                else:
                    results = self.model.predict(image, verbose=False, device='cpu')
            else:
                # Re-raise the error if not MPS or not an implementation error
                raise

        if track:
            # Clean up trajectories for objects that are no longer tracked.
            # NOTE(review): the active-ID list comprehension is rebuilt for
            # every stored ID (O(ids * boxes)) — cheap at typical counts,
            # but worth hoisting into a set if object counts grow.
            for id_ in list(self.tracking_trajectories.keys()):
                if id_ not in [int(bbox.id) for predictions in results if predictions is not None
                              for bbox in predictions.boxes if bbox.id is not None]:
                    del self.tracking_trajectories[id_]

            # Process results
            for predictions in results:
                if predictions is None:
                    continue

                if predictions.boxes is None:
                    continue

                # Process boxes
                for bbox in predictions.boxes:
                    # Extract information
                    scores = bbox.conf
                    classes = bbox.cls
                    bbox_coords = bbox.xyxy

                    # Check if tracking IDs are available
                    if hasattr(bbox, 'id') and bbox.id is not None:
                        ids = bbox.id
                    else:
                        ids = [None] * len(scores)

                    # Process each detection
                    for score, class_id, bbox_coord, id_ in zip(scores, classes, bbox_coords, ids):
                        xmin, ymin, xmax, ymax = bbox_coord.cpu().numpy()

                        # Add to detections list
                        detections.append([
                            [xmin, ymin, xmax, ymax],  # bbox
                            float(score),  # confidence score
                            int(class_id),  # class id
                            int(id_) if id_ is not None else None  # object id
                        ])

                        # Draw bounding box
                        # NOTE(review): (0, 0, 225) looks like a typo for
                        # (0, 0, 255) (pure red in BGR) — confirm intent.
                        cv2.rectangle(annotated_image,
                                     (int(xmin), int(ymin)),
                                     (int(xmax), int(ymax)),
                                     (0, 0, 225), 2)

                        # Add label: ID, class name and confidence on a dark
                        # background sized from the rendered text
                        label = f"ID: {int(id_) if id_ is not None else 'N/A'} {predictions.names[int(class_id)]} {float(score):.2f}"
                        text_size = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 1)
                        dim, baseline = text_size[0], text_size[1]
                        cv2.rectangle(annotated_image,
                                     (int(xmin), int(ymin)),
                                     (int(xmin) + dim[0], int(ymin) - dim[1] - baseline),
                                     (30, 30, 30), cv2.FILLED)
                        cv2.putText(annotated_image, label,
                                   (int(xmin), int(ymin) - 7),
                                   cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 1)

                        # Update tracking trajectories with the box centroid
                        if id_ is not None:
                            centroid_x = (xmin + xmax) / 2
                            centroid_y = (ymin + ymax) / 2

                            # deque(maxlen=10) keeps only the last 10 positions
                            if int(id_) not in self.tracking_trajectories:
                                self.tracking_trajectories[int(id_)] = deque(maxlen=10)

                            self.tracking_trajectories[int(id_)].append((centroid_x, centroid_y))

            # Draw trajectories: segments thicken toward the newest position
            for id_, trajectory in self.tracking_trajectories.items():
                for i in range(1, len(trajectory)):
                    thickness = int(2 * (i / len(trajectory)) + 1)
                    cv2.line(annotated_image,
                            (int(trajectory[i-1][0]), int(trajectory[i-1][1])),
                            (int(trajectory[i][0]), int(trajectory[i][1])),
                            (255, 255, 255), thickness)

        else:
            # Process results for non-tracking mode (same drawing, no IDs or
            # trajectory bookkeeping)
            for predictions in results:
                if predictions is None:
                    continue

                if predictions.boxes is None:
                    continue

                # Process boxes
                for bbox in predictions.boxes:
                    # Extract information
                    scores = bbox.conf
                    classes = bbox.cls
                    bbox_coords = bbox.xyxy

                    # Process each detection
                    for score, class_id, bbox_coord in zip(scores, classes, bbox_coords):
                        xmin, ymin, xmax, ymax = bbox_coord.cpu().numpy()

                        # Add to detections list (object id is None here)
                        detections.append([
                            [xmin, ymin, xmax, ymax],  # bbox
                            float(score),  # confidence score
                            int(class_id),  # class id
                            None  # object id (None for no tracking)
                        ])

                        # Draw bounding box
                        # NOTE(review): same (0, 0, 225) color as above.
                        cv2.rectangle(annotated_image,
                                     (int(xmin), int(ymin)),
                                     (int(xmax), int(ymax)),
                                     (0, 0, 225), 2)

                        # Add label (class name and confidence)
                        label = f"{predictions.names[int(class_id)]} {float(score):.2f}"
                        text_size = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 1)
                        dim, baseline = text_size[0], text_size[1]
                        cv2.rectangle(annotated_image,
                                     (int(xmin), int(ymin)),
                                     (int(xmin) + dim[0], int(ymin) - dim[1] - baseline),
                                     (30, 30, 30), cv2.FILLED)
                        cv2.putText(annotated_image, label,
                                   (int(xmin), int(ymin) - 7),
                                   cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 1)

        return annotated_image, detections

    def get_class_names(self):
        """
        Get the names of the classes that the model can detect

        Returns:
            dict: Mapping of class id to class name, as exposed by the
                underlying ultralytics model's ``names`` attribute
        """
        return self.model.names
|
load_camera_params.py
ADDED
|
@@ -0,0 +1,122 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
import os
|
| 3 |
+
import json
|
| 4 |
+
import numpy as np
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
|
| 7 |
+
def load_camera_params(params_file):
    """
    Load camera parameters from a JSON file.

    Args:
        params_file (str): Path to the JSON file containing camera parameters

    Returns:
        dict: Camera parameters with 'camera_matrix', 'dist_coeffs' and
            'projection_matrix' converted to numpy arrays, or None when the
            file is missing or unreadable
    """
    if not os.path.exists(params_file):
        print(f"Warning: Camera parameters file {params_file} not found. Using default parameters.")
        return None

    try:
        with open(params_file, 'r') as f:
            params = json.load(f)

        # Matrix-valued entries arrive as nested lists; convert them once.
        for key in ('camera_matrix', 'dist_coeffs', 'projection_matrix'):
            params[key] = np.array(params[key])

        print(f"Loaded camera parameters from {params_file}")
        print(f"Camera matrix:\n{params['camera_matrix']}")
        print(f"Projection matrix:\n{params['projection_matrix']}")

        return params

    except Exception as e:
        # Any parse/key failure degrades to defaults rather than crashing.
        print(f"Error loading camera parameters: {e}")
        return None
|
| 39 |
+
|
| 40 |
+
def create_projection_matrix(camera_matrix, R=None, t=None):
    """
    Create a 3x4 projection matrix P = K [R | t].

    Args:
        camera_matrix (numpy.ndarray): Camera intrinsic matrix K (3x3)
        R (numpy.ndarray): Rotation matrix (3x3); identity if None
        t (numpy.ndarray): Translation vector (3x1); zeros if None

    Returns:
        numpy.ndarray: Projection matrix (3x4)
    """
    # Default to trivial extrinsics (camera frame == world frame)
    rotation = np.eye(3) if R is None else R
    translation = np.zeros((3, 1)) if t is None else t

    # [R | t] is the 3x4 extrinsic matrix; multiplying by K maps
    # camera-frame points to pixel coordinates.
    extrinsics = np.hstack((rotation, translation))
    return camera_matrix @ extrinsics
|
| 65 |
+
|
| 66 |
+
def apply_camera_params_to_estimator(bbox3d_estimator, params):
    """
    Apply loaded camera parameters to a 3D bounding box estimator.

    Args:
        bbox3d_estimator: BBox3DEstimator instance (updated in place)
        params (dict): Camera parameters dictionary, or None

    Returns:
        bbox3d_estimator: The same estimator instance
    """
    if params is None:
        print("Warning: No camera parameters provided. Using default parameters.")
        return bbox3d_estimator

    # Copy whichever matrices are present onto the estimator:
    # K = intrinsics, P = projection.
    for key, attr in (('camera_matrix', 'K'), ('projection_matrix', 'P')):
        if key in params:
            setattr(bbox3d_estimator, attr, params[key])

    print("Applied camera parameters to 3D bounding box estimator")
    return bbox3d_estimator
|
| 92 |
+
|
| 93 |
+
def main():
    """Example usage of the camera parameter functions."""
    # Configuration variables (modify these as needed)
    # ===============================================
    params_file = "camera_params.json"  # Path to camera parameters JSON file
    camera_height = 1.65                # Camera height above ground in meters
    # ===============================================

    # Load camera parameters; bail out quietly if loading failed.
    camera_params = load_camera_params(params_file)
    if not camera_params:
        return

    print("\nCamera Parameters:")
    print(f"Image dimensions: {camera_params['image_width']}x{camera_params['image_height']}")
    print(f"Reprojection error: {camera_params['reprojection_error']}")

    # Example: build a projection matrix for a camera mounted above the ground.
    print(f"\nExample: Creating a projection matrix with camera raised {camera_height}m above ground")
    rotation = np.eye(3)
    translation = np.array([[0], [camera_height], [0]])  # Camera above ground

    projection_matrix = create_projection_matrix(camera_params['camera_matrix'], rotation, translation)
    print(f"New projection matrix:\n{projection_matrix}")
|
| 120 |
+
|
| 121 |
+
# Run the demo only when executed as a script (not on import).
if __name__ == "__main__":
    main()
|
requirements.txt
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
torch>=2.0.0
|
| 2 |
+
torchvision>=0.15.0
|
| 3 |
+
opencv-python>=4.7.0
|
| 4 |
+
numpy>=1.22.0
|
| 5 |
+
ultralytics>=8.0.0 # For YOLOv11
|
| 6 |
+
timm>=0.9.2 # Required for Depth Anything v2
|
| 7 |
+
matplotlib>=3.7.0
|
| 8 |
+
pillow>=9.4.0
|
| 9 |
+
tqdm>=4.65.0
|
| 10 |
+
scipy>=1.10.0
|
| 11 |
+
filterpy>=1.4.5 # For Kalman filtering in tracking
|
| 12 |
+
lap>=0.4.0 # For Hungarian algorithm in tracking
|
| 13 |
+
scikit-image>=0.20.0
|
| 14 |
+
pyyaml>=6.0
|
| 15 |
+
requests>=2.28.0
|
run.py
ADDED
|
@@ -0,0 +1,333 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
import os
|
| 3 |
+
import sys
|
| 4 |
+
import time
|
| 5 |
+
import cv2
|
| 6 |
+
import numpy as np
|
| 7 |
+
import torch
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
|
| 10 |
+
# Set MPS fallback for operations not supported on Apple Silicon
|
| 11 |
+
if hasattr(torch, 'backends') and hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
|
| 12 |
+
os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1'
|
| 13 |
+
|
| 14 |
+
# Import our modules
|
| 15 |
+
from detection_model import ObjectDetector
|
| 16 |
+
from depth_model import DepthEstimator
|
| 17 |
+
from bbox3d_utils import BBox3DEstimator, BirdEyeView
|
| 18 |
+
from load_camera_params import load_camera_params, apply_camera_params_to_estimator
|
| 19 |
+
|
| 20 |
+
def main():
    """Run the real-time pseudo-3D object detection pipeline.

    Per frame:
      1. 2D object detection (YOLOv11, optionally with tracking).
      2. Monocular depth estimation (Depth Anything v2).
      3. Fuse each 2D box with a depth value into a pseudo-3D box dict.
      4. Visualize: annotated frame, depth-map inset (top-left), and
         Bird's Eye View inset (bottom-left).

    Annotated frames are written to ``output_path`` and displayed in
    OpenCV windows. Press 'q' or ESC to stop.
    """
    # Configuration variables (modify these as needed)
    # ===============================================

    # Input/Output
    source = 0                  # Path to input video file or webcam index (0 for default camera)
    output_path = "output.mp4"  # Path to output video file

    # Model settings
    yolo_model_size = "nano"    # YOLOv11 model size: "nano", "small", "medium", "large", "extra"
    depth_model_size = "small"  # Depth Anything v2 model size: "small", "base", "large"

    # Device settings
    device = 'cpu'              # Force CPU for stability

    # Detection settings
    conf_threshold = 0.25       # Confidence threshold for object detection
    iou_threshold = 0.45        # IoU threshold for NMS
    classes = None              # Filter by class ids, e.g. [0, 1, 2]; None for all classes

    # Feature toggles
    enable_tracking = True      # Enable object tracking
    enable_bev = True           # Enable Bird's Eye View visualization
    enable_pseudo_3d = True     # Enable pseudo-3D visualization

    # Camera parameters - simplified approach
    # NOTE(review): currently unused below; kept for future wiring via
    # load_camera_params / apply_camera_params_to_estimator.
    camera_params_file = None   # Path to camera parameters file (None to use default parameters)
    # ===============================================

    def quit_requested():
        """Poll the OpenCV event loop once; True if 'q' or ESC was pressed."""
        key = cv2.waitKey(1)
        # waitKey returns -1 when no key is pressed; mask to one byte so the
        # comparison also works on platforms that set high bits.
        return key != -1 and (key & 0xFF) in (ord('q'), 27)

    print(f"Using device: {device}")

    # Initialize models, falling back to CPU if the requested device fails.
    print("Initializing models...")
    try:
        detector = ObjectDetector(
            model_size=yolo_model_size,
            conf_thres=conf_threshold,
            iou_thres=iou_threshold,
            classes=classes,
            device=device
        )
    except Exception as e:
        print(f"Error initializing object detector: {e}")
        print("Falling back to CPU for object detection")
        detector = ObjectDetector(
            model_size=yolo_model_size,
            conf_thres=conf_threshold,
            iou_thres=iou_threshold,
            classes=classes,
            device='cpu'
        )

    try:
        depth_estimator = DepthEstimator(
            model_size=depth_model_size,
            device=device
        )
    except Exception as e:
        print(f"Error initializing depth estimator: {e}")
        print("Falling back to CPU for depth estimation")
        depth_estimator = DepthEstimator(
            model_size=depth_model_size,
            device='cpu'
        )

    # Initialize 3D bounding box estimator with default parameters.
    # Simplified approach: 2D detection enriched with depth information.
    bbox3d_estimator = BBox3DEstimator()

    # Initialize Bird's Eye View if enabled
    if enable_bev:
        # Scale chosen to spread objects out in the 1-5 meter range
        bev = BirdEyeView(scale=60, size=(300, 300))

    # A webcam index may arrive as a string; convert it to an int.
    # (str.isdigit never raises, so no try/except is needed here.)
    if isinstance(source, str) and source.isdigit():
        source = int(source)

    print(f"Opening video source: {source}")
    cap = cv2.VideoCapture(source)

    if not cap.isOpened():
        print(f"Error: Could not open video source {source}")
        return

    # Get video properties
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = int(cap.get(cv2.CAP_PROP_FPS))
    if fps == 0:  # Sometimes happens with webcams
        fps = 30

    # Initialize video writer
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

    # FPS-display bookkeeping
    frame_count = 0
    start_time = time.time()
    fps_display = "FPS: --"

    print("Starting processing...")

    # Main loop
    while True:
        # Check for a quit key at the beginning of each iteration
        if quit_requested():
            print("Exiting program...")
            break

        try:
            # Read frame
            ret, frame = cap.read()
            if not ret:
                break

            # Copies for the different visualizations
            original_frame = frame.copy()
            detection_frame = frame.copy()
            result_frame = frame.copy()

            # Step 1: Object Detection
            try:
                detection_frame, detections = detector.detect(detection_frame, track=enable_tracking)
            except Exception as e:
                print(f"Error during object detection: {e}")
                detections = []
                cv2.putText(detection_frame, "Detection Error", (10, 60),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 0, 255), 2)

            # Step 2: Depth Estimation
            try:
                depth_map = depth_estimator.estimate_depth(original_frame)
                depth_colored = depth_estimator.colorize_depth(depth_map)
            except Exception as e:
                print(f"Error during depth estimation: {e}")
                # Fall back to an all-zero depth map so the loop can continue
                depth_map = np.zeros((height, width), dtype=np.float32)
                depth_colored = np.zeros((height, width, 3), dtype=np.uint8)
                cv2.putText(depth_colored, "Depth Error", (10, 60),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 0, 255), 2)

            # Step 3: Fuse detections with depth into pseudo-3D boxes
            boxes_3d = []
            active_ids = []

            for detection in detections:
                try:
                    bbox, score, class_id, obj_id = detection

                    # Get class name
                    class_name = detector.get_class_names()[class_id]

                    # People and animals: sample depth at the box center;
                    # other classes: use the median depth over the region.
                    if class_name.lower() in ['person', 'cat', 'dog']:
                        center_x = int((bbox[0] + bbox[2]) / 2)
                        center_y = int((bbox[1] + bbox[3]) / 2)
                        depth_value = depth_estimator.get_depth_at_point(depth_map, center_x, center_y)
                        depth_method = 'center'
                    else:
                        depth_value = depth_estimator.get_depth_in_region(depth_map, bbox, method='median')
                        depth_method = 'median'

                    # Simplified 3D box representation
                    boxes_3d.append({
                        'bbox_2d': bbox,
                        'depth_value': depth_value,
                        'depth_method': depth_method,
                        'class_name': class_name,
                        'object_id': obj_id,
                        'score': score
                    })

                    # Keep track of active IDs for tracker cleanup
                    if obj_id is not None:
                        active_ids.append(obj_id)
                except Exception as e:
                    print(f"Error processing detection: {e}")
                    continue

            # Clean up trackers for objects that are no longer detected
            bbox3d_estimator.cleanup_trackers(active_ids)

            # Step 4: Visualization - draw boxes on the result frame
            for box_3d in boxes_3d:
                try:
                    # Color keyed on the detected class
                    class_name = box_3d['class_name'].lower()
                    if 'car' in class_name or 'vehicle' in class_name:
                        color = (0, 0, 255)  # Red
                    elif 'person' in class_name:
                        color = (0, 255, 0)  # Green
                    elif 'bicycle' in class_name or 'motorcycle' in class_name:
                        color = (255, 0, 0)  # Blue
                    elif 'potted plant' in class_name or 'plant' in class_name:
                        color = (0, 255, 255)  # Yellow
                    else:
                        color = (255, 255, 255)  # White

                    # Draw box with depth information
                    result_frame = bbox3d_estimator.draw_box_3d(result_frame, box_3d, color=color)
                except Exception as e:
                    print(f"Error drawing box: {e}")
                    continue

            # Bird's Eye View inset (bottom-left corner)
            if enable_bev:
                try:
                    # Reset BEV and draw objects
                    bev.reset()
                    for box_3d in boxes_3d:
                        bev.draw_box(box_3d)
                    bev_image = bev.get_image()

                    # Quarter-height square inset
                    bev_height = height // 4
                    bev_width = bev_height

                    if bev_height > 0 and bev_width > 0:
                        bev_resized = cv2.resize(bev_image, (bev_width, bev_height))

                        # Overlay the BEV image onto the result frame
                        result_frame[height - bev_height:height, 0:bev_width] = bev_resized

                        # Border and title around the BEV visualization
                        cv2.rectangle(result_frame,
                                      (0, height - bev_height),
                                      (bev_width, height),
                                      (255, 255, 255), 1)
                        cv2.putText(result_frame, "Bird's Eye View",
                                    (10, height - bev_height + 20),
                                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 1)
                except Exception as e:
                    print(f"Error drawing BEV: {e}")

            # Update the FPS display every 10 frames; guard against a zero
            # elapsed time (sub-resolution timer) to avoid division errors.
            frame_count += 1
            if frame_count % 10 == 0:
                elapsed_time = time.time() - start_time
                if elapsed_time > 0:
                    fps_value = frame_count / elapsed_time
                    fps_display = f"FPS: {fps_value:.1f}"

            # Add FPS and device info to the result frame
            cv2.putText(result_frame, f"{fps_display} | Device: {device}", (10, 30),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 0, 255), 2)

            # Depth-map inset (top-left corner), aspect-ratio preserved
            try:
                depth_height = height // 4
                depth_width = depth_height * width // height
                depth_resized = cv2.resize(depth_colored, (depth_width, depth_height))
                result_frame[0:depth_height, 0:depth_width] = depth_resized
            except Exception as e:
                print(f"Error adding depth map to result: {e}")

            # Write frame to output video
            out.write(result_frame)

            # Display frames
            cv2.imshow("3D Object Detection", result_frame)
            cv2.imshow("Depth Map", depth_colored)
            cv2.imshow("Object Detection", detection_frame)

            # Check for a quit key again after rendering
            if quit_requested():
                print("Exiting program...")
                break

        except Exception as e:
            print(f"Error processing frame: {e}")
            # Also allow quitting while errors are being reported
            if quit_requested():
                print("Exiting program...")
                break
            continue

    # Clean up
    print("Cleaning up resources...")
    cap.release()
    out.release()
    cv2.destroyAllWindows()

    print(f"Processing complete. Output saved to {output_path}")
|
| 326 |
+
|
| 327 |
+
# Script entry point: run the pipeline and exit cleanly on Ctrl+C.
if __name__ == "__main__":
    try:
        main()
    except KeyboardInterrupt:
        print("\nProgram interrupted by user (Ctrl+C)")
        # Clean up OpenCV windows
        cv2.destroyAllWindows()
|