# Copyright (c) Kangan Qian. All rights reserved.
# Authors: Kangan Qian (Tsinghua University, Xiaomi Corporation)
# Description: 3D location estimation from 2D images using depth estimation
import os
import cv2
import numpy as np
import torch
import matplotlib.pyplot as plt
from PIL import Image
from third_party.DAM.depth_anything_v2.dpt import DepthAnythingV2
from third_party.yoloworld_demo import get_2dloc_open_vocabulary_detector
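# Pipeline overview: an open-vocabulary detector (YOLO-World) provides a 2D
# pixel location per queried object, DepthAnythingV2 provides a monocular
# depth estimate, and a pinhole back-projection converts the pair into a 3D
# point in the camera frame.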
def calculate_average_depth(depth_map: np.ndarray, center_pixel: tuple, neighborhood_size: int = 5) -> float:
"""
Calculate the average depth value around a specified pixel
Args:
depth_map (np.ndarray): HxW depth map
center_pixel (tuple): (x, y) coordinates of the center pixel
        neighborhood_size (int): Side length of the square window in pixels (default: 5, i.e. a 5x5 window)
    Returns:
        float: Average of the non-zero depth values in the window, or 0.0 if none are valid
"""
half_size = neighborhood_size // 2
x, y = center_pixel
# Ensure boundaries are within image dimensions
x_start = max(0, x - half_size)
x_end = min(depth_map.shape[1], x + half_size + 1)
y_start = max(0, y - half_size)
y_end = min(depth_map.shape[0], y + half_size + 1)
# Extract neighborhood depths
neighborhood_depths = depth_map[y_start:y_end, x_start:x_end]
    # Average the non-zero depths; zeros are treated as invalid measurements
    valid_depths = neighborhood_depths[neighborhood_depths != 0]
    if valid_depths.size == 0:
        return 0.0
    return float(np.mean(valid_depths))
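# Worked example (hypothetical values): with the default 5x5 window centered
# at pixel (x=100, y=50), the function inspects depth_map[48:53, 98:103] and
# averages whatever entries are non-zero.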
def pixel_to_3d_coordinates(
depth_map: np.ndarray,
camera_intrinsic: np.ndarray,
pixel: tuple
) -> tuple:
"""
Convert 2D pixel coordinates to 3D world coordinates using depth map and camera intrinsics
Args:
        depth_map (np.ndarray): HxW depth map (units follow the depth model; metric only if the model outputs metric depth)
        camera_intrinsic (np.ndarray): Camera intrinsic matrix (3x3 or 4x4; only fx, fy, u0, v0 are read)
pixel (tuple): Pixel coordinates (x, y)
Returns:
tuple: 3D coordinates (X, Y, Z) in camera coordinate system
"""
    x, y = pixel
    # Use the neighborhood average for robustness; fall back to the single
    # pixel's depth when the whole window is zero/invalid
    avg_depth = calculate_average_depth(depth_map, pixel)
    D = avg_depth if avg_depth != 0 else depth_map[y, x]
# Extract camera intrinsic parameters
fx = camera_intrinsic[0, 0]
fy = camera_intrinsic[1, 1]
u0 = camera_intrinsic[0, 2]
v0 = camera_intrinsic[1, 2]
# Calculate 3D coordinates in camera frame
X_cam = (x - u0) * D / fx
Y_cam = (y - v0) * D / fy
Z_cam = D
return X_cam, Y_cam, Z_cam
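# Worked example with the default intrinsics defined in get_3d_location below
# (fx = fy ≈ 1252.81, u0 ≈ 826.59, v0 ≈ 469.98) and a hypothetical depth
# D = 10 at pixel (1164, 627):
#   X = (1164 - 826.59) * 10 / 1252.81 ≈ 2.69
#   Y = (627 - 469.98) * 10 / 1252.81 ≈ 1.25
#   Z = 10.00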
def get_3d_location(
text: list = ['car'],
image_path: str = None,
debug: bool = False,
model_path: str = "./pretrained_model/depth_anything_v2_vitb.pth",
encoder_type: str = "vitb",
camera_intrinsic: np.ndarray = None
) -> tuple:
"""
Estimate 3D locations of objects in an image
Args:
text (list): List of object names to locate
image_path (str): Path to input image
debug (bool): Whether to save debug visualization
model_path (str): Path to depth estimation model
encoder_type (str): Encoder type for depth model
camera_intrinsic (np.ndarray): Camera intrinsic matrix
Returns:
tuple:
- prompt (str): Description of 3D locations
- spatial_location (list): List of 3D coordinates for each object
"""
# Default camera intrinsic matrix if not provided
if camera_intrinsic is None:
camera_intrinsic = np.array([
[1.25281310e+03, 0.00000000e+00, 8.26588115e+02, 0.00000000e+00],
[0.00000000e+00, 1.25281310e+03, 4.69984663e+02, 0.00000000e+00],
[0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 0.00000000e+00],
[0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.00000000e+00]
])
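        # The defaults above follow the standard pinhole layout (fx, fy on
        # the diagonal, principal point (u0, v0) in the last column); they
        # resemble a nuScenes-style front-camera calibration and should be
        # replaced with the actual camera's intrinsics when known.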
# Configure depth model based on encoder type
model_configs = {
'vits': {'encoder': 'vits', 'features': 64, 'out_channels': [48, 96, 192, 384]},
'vitb': {'encoder': 'vitb', 'features': 128, 'out_channels': [96, 192, 384, 768]},
'vitl': {'encoder': 'vitl', 'features': 256, 'out_channels': [256, 512, 1024, 1024]},
'vitg': {'encoder': 'vitg', 'features': 384, 'out_channels': [1536, 1536, 1536, 1536]}
}
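    # Note: the checkpoint at model_path must match encoder_type (the default
    # depth_anything_v2_vitb.pth weights pair with 'vitb').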
    # Initialize the depth model and load the matching checkpoint
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model = DepthAnythingV2(**model_configs[encoder_type]).to(device)
    model.load_state_dict(torch.load(model_path, map_location='cpu'))
    model.eval()
    # Load the image and predict a depth map
    raw_img = cv2.imread(image_path)
    if raw_img is None:
        raise FileNotFoundError(f"Could not read image: {image_path}")
    depth_map = model.infer_image(raw_img)  # HxW depth map
    # DepthAnythingV2's relative checkpoints assign larger values to closer
    # pixels, so invert the map to get a depth-like convention where larger
    # values mean farther away
    max_depth = np.max(depth_map)
    depth_map = max_depth - depth_map
    # Save a debug visualization if requested
    if debug:
        os.makedirs("./debug", exist_ok=True)
        plt.figure()
        plt.imshow(depth_map, cmap='jet')
        plt.colorbar()
        plt.title("Predicted Depth Map")
        plt.savefig("./debug/debug_depth.png")
        plt.close()
# Process each object
prompt = ""
spatial_location = []
for obj in text:
# Get 2D location using open-vocabulary detector
loc2d_prompt, location_2d = get_2dloc_open_vocabulary_detector(
text=[obj],
image_path=image_path
)
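        # location_2d is assumed to be the (x, y) pixel center of the detected
        # object, or None when the detector finds nothing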
prompt += loc2d_prompt
# Handle case where 2D location not found
if location_2d is None:
prompt += f"\nFailed to estimate 3D location for {obj}. You must infer or identify it yourself."
continue
# Convert to integer pixel coordinates
pixel = [int(round(coord)) for coord in location_2d]
# Calculate 3D coordinates
X, Y, Z = pixel_to_3d_coordinates(depth_map, camera_intrinsic, pixel)
spatial_location.append([X, Y, Z])
prompt += f"\nEstimated 3D location(x,y,z) for {obj} in camera coordinates: [{X:.2f}, {Y:.2f}, {Z:.2f}], z={Z:.2f}"
prompt += f"""(Note: The Z coordinate represents depth, with smaller Z-values indicating closer proximity to the camera/front.)"""
return prompt, spatial_location
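# Example call (assumes the default checkpoint is present and uses the sample
# image from __main__ below; 'pedestrian' is an illustrative query class):
#   prompt, locs = get_3d_location(
#       text=['car', 'pedestrian'],
#       image_path='./third_party/nuscenes_CAM_FRONT_5978.webp',
#       debug=True,
#   )
#   # locs is a list of [X, Y, Z] triples, one per located object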
if __name__ == '__main__':
# Example usage
image_path = "./third_party/nuscenes_CAM_FRONT_5978.webp"
model_path = "./pretrained_model/depth_anything_v2_vitb.pth"
# Camera intrinsic matrix
camera_intrinsic = np.array([
[1.25281310e+03, 0.00000000e+00, 8.26588115e+02, 0.00000000e+00],
[0.00000000e+00, 1.25281310e+03, 4.69984663e+02, 0.00000000e+00],
[0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 0.00000000e+00],
[0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.00000000e+00]
])
# Initialize depth model
model = DepthAnythingV2(
encoder='vitb',
features=128,
out_channels=[96, 192, 384, 768]
).to('cuda')
    model.load_state_dict(torch.load(model_path, map_location='cpu'))
model.eval()
    # Load the image and estimate depth (note: unlike get_3d_location, this
    # demo back-projects the raw model output without the max-minus-depth
    # inversion applied there)
    raw_img = cv2.imread(image_path)
    depth_map = model.infer_image(raw_img)
    # Back-project a specific pixel to camera-frame coordinates
    pixel = (1164, 627)
    X, Y, Z = pixel_to_3d_coordinates(depth_map, camera_intrinsic, pixel)
    print(f"3D coordinates in camera frame: X = {X:.3f}, Y = {Y:.3f}, Z = {Z:.3f}")