injected_thinking / third_party /depth_demo.py
BechusRantus's picture
Upload folder using huggingface_hub
7134ce7 verified
# Copyright (c) Kangan Qian. All rights reserved.
# Authors: Kangan Qian (Tsinghua University, Xiaomi Corporation)
# Description: 3D location estimation from 2D images using depth estimation
import cv2
import numpy as np
import torch
import matplotlib.pyplot as plt
from PIL import Image
from third_party.DAM.depth_anything_v2.dpt import DepthAnythingV2
from third_party.yoloworld_demo import get_2dloc_open_vocabulary_detector
def calculate_average_depth(depth_map: np.ndarray, center_pixel: tuple, neighborhood_size: int = 5) -> float:
    """
    Calculate the average depth value around a specified pixel

    Zero-valued entries are treated as missing and are excluded from the mean.

    Args:
        depth_map (np.ndarray): HxW depth map
        center_pixel (tuple): (x, y) integer coordinates of the center pixel
        neighborhood_size (int): Side length of the square window (default: 5, i.e. 5x5)

    Returns:
        float: Mean of the non-zero depths inside the window, or 0.0 when the
            window holds no non-zero value or lies completely outside the image
    """
    half_size = neighborhood_size // 2
    x, y = center_pixel
    height = depth_map.shape[0]
    width = depth_map.shape[1]

    # Clamp the window to the image. The end bounds must also be clamped at 0:
    # a negative slice end would wrap around and select an unrelated region
    # when the pixel is far to the left of / above the image.
    x_start = max(0, x - half_size)
    x_end = min(width, max(0, x + half_size + 1))
    y_start = max(0, y - half_size)
    y_end = min(height, max(0, y + half_size + 1))

    # Extract neighborhood depths (empty when the window misses the image)
    neighborhood_depths = depth_map[y_start:y_end, x_start:x_end]

    # Calculate average depth ignoring zero values
    valid_depths = neighborhood_depths[neighborhood_depths != 0]
    if valid_depths.size == 0:
        return 0.0

    # Convert the NumPy scalar so both branches return a plain Python float
    return float(np.mean(valid_depths))
def pixel_to_3d_coordinates(
    depth_map: np.ndarray,
    camera_intrinsic: np.ndarray,
    pixel: tuple
) -> tuple:
    """
    Back-project a 2D pixel into the camera frame using a depth map and camera intrinsics

    Args:
        depth_map (np.ndarray): Depth map indexed as [row, column]
        camera_intrinsic (np.ndarray): Intrinsic matrix; only the entries
            [0, 0] (fx), [1, 1] (fy), [0, 2] (u0) and [1, 2] (v0) are read
        pixel (tuple): Pixel coordinates (x, y)

    Returns:
        tuple: (X, Y, Z) in the camera coordinate system, where Z is the depth
            used for the pixel
    """
    u, v = pixel

    # Prefer the neighborhood average for robustness; when it comes back as 0
    # fall back to the raw depth stored at the pixel itself.
    depth = calculate_average_depth(depth_map, pixel)
    if depth == 0:
        depth = depth_map[v, u]

    # Focal lengths and principal point from the intrinsic matrix
    focal_x, focal_y = camera_intrinsic[0, 0], camera_intrinsic[1, 1]
    center_x, center_y = camera_intrinsic[0, 2], camera_intrinsic[1, 2]

    # Pinhole back-projection into the camera frame
    return (
        (u - center_x) * depth / focal_x,
        (v - center_y) * depth / focal_y,
        depth,
    )
def get_3d_location(
    text: list = ['car'],
    image_path: str = None,
    debug: bool = False,
    model_path: str = "./pretrained_model/depth_anything_v2_vitb.pth",
    encoder_type: str = "vitb",
    camera_intrinsic: np.ndarray = None
) -> tuple:
    """
    Estimate 3D locations of objects in an image

    For every entry of ``text`` the open-vocabulary detector is queried for a
    2D location, which is then back-projected with the predicted depth map.

    Args:
        text (list): List of object names to locate (read only, never modified)
        image_path (str): Path to input image
        debug (bool): Whether to save a depth visualization to ./debug/debug_depth.png
        model_path (str): Path to depth estimation model weights
        encoder_type (str): Encoder type for depth model ('vits', 'vitb', 'vitl' or 'vitg')
        camera_intrinsic (np.ndarray): Camera intrinsic matrix; a built-in
            default matrix is used when omitted

    Returns:
        tuple:
            - prompt (str): Description of 3D locations
            - spatial_location (list): List of [X, Y, Z] for each object that
              was located (objects without a 2D location are skipped)

    Raises:
        ValueError: If image_path is not provided
        FileNotFoundError: If the image cannot be read from image_path
        KeyError: If encoder_type is not one of the configured encoders
    """
    import os  # local import: only needed to create the debug output directory

    # Default camera intrinsic matrix if not provided
    if camera_intrinsic is None:
        camera_intrinsic = np.array([
            [1.25281310e+03, 0.00000000e+00, 8.26588115e+02, 0.00000000e+00],
            [0.00000000e+00, 1.25281310e+03, 4.69984663e+02, 0.00000000e+00],
            [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 0.00000000e+00],
            [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.00000000e+00]
        ])

    # Configure depth model based on encoder type
    model_configs = {
        'vits': {'encoder': 'vits', 'features': 64, 'out_channels': [48, 96, 192, 384]},
        'vitb': {'encoder': 'vitb', 'features': 128, 'out_channels': [96, 192, 384, 768]},
        'vitl': {'encoder': 'vitl', 'features': 256, 'out_channels': [256, 512, 1024, 1024]},
        'vitg': {'encoder': 'vitg', 'features': 384, 'out_channels': [1536, 1536, 1536, 1536]}
    }

    # Load the image before the (expensive) model so a bad path fails fast.
    # cv2.imread signals failure by returning None instead of raising.
    if image_path is None:
        raise ValueError("image_path must be provided")
    raw_img = cv2.imread(image_path)
    if raw_img is None:
        raise FileNotFoundError(f"Could not read image: {image_path}")

    # Initialize and load depth estimation model. Fall back to CPU when CUDA is
    # unavailable, and deserialize the checkpoint on CPU so weights saved from
    # a GPU run still load on a CPU-only host.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model = DepthAnythingV2(**model_configs[encoder_type]).to(device)
    model.load_state_dict(torch.load(model_path, map_location='cpu'))
    model.eval()

    depth_map = model.infer_image(raw_img)  # HxW depth map

    # The model output is flipped unconditionally as (max - value).
    # NOTE(review): presumably the model returns an inverse-depth-like map, so
    # the result is a relative ordering rather than metric depth - confirm.
    max_depth = np.max(depth_map)
    depth_map = max_depth - depth_map

    # Save debug visualization if requested
    if debug:
        os.makedirs("./debug", exist_ok=True)
        # Use a dedicated figure and close it so repeated calls neither stack
        # images/colorbars on the current figure nor leak figures.
        fig = plt.figure()
        plt.imshow(depth_map, cmap='jet')
        plt.colorbar()
        plt.title("Predicted Depth Map")
        plt.savefig("./debug/debug_depth.png")
        plt.close(fig)

    # Process each object
    prompt = ""
    spatial_location = []
    for obj in text:
        # Get 2D location using open-vocabulary detector
        loc2d_prompt, location_2d = get_2dloc_open_vocabulary_detector(
            text=[obj],
            image_path=image_path
        )
        prompt += loc2d_prompt

        # Handle case where 2D location not found
        if location_2d is None:
            prompt += f"\nFailed to estimate 3D location for {obj}. You must infer or identify it yourself."
            continue

        # Convert to integer pixel coordinates
        pixel = [int(round(coord)) for coord in location_2d]

        # Calculate 3D coordinates
        X, Y, Z = pixel_to_3d_coordinates(depth_map, camera_intrinsic, pixel)
        spatial_location.append([X, Y, Z])
        prompt += f"\nEstimated 3D location(x,y,z) for {obj} in camera coordinates: [{X:.2f}, {Y:.2f}, {Z:.2f}], z={Z:.2f}"

    prompt += f"""(Note: The Z coordinate represents depth, with smaller Z-values indicating closer proximity to the camera/front.)"""
    return prompt, spatial_location
if __name__ == '__main__':
    # Example usage: estimate depth for one image and back-project one pixel
    image_path = "./third_party/nuscenes_CAM_FRONT_5978.webp"
    model_path = "./pretrained_model/depth_anything_v2_vitb.pth"

    # Camera intrinsic matrix
    camera_intrinsic = np.array([
        [1.25281310e+03, 0.00000000e+00, 8.26588115e+02, 0.00000000e+00],
        [0.00000000e+00, 1.25281310e+03, 4.69984663e+02, 0.00000000e+00],
        [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 0.00000000e+00],
        [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.00000000e+00]
    ])

    # Initialize depth model. Fall back to CPU when CUDA is unavailable, and
    # deserialize the checkpoint on CPU so GPU-saved weights load anywhere.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model = DepthAnythingV2(
        encoder='vitb',
        features=128,
        out_channels=[96, 192, 384, 768]
    ).to(device)
    model.load_state_dict(torch.load(model_path, map_location='cpu'))
    model.eval()

    # Load image and estimate depth. cv2.imread returns None on failure, so
    # check explicitly instead of failing later inside the model.
    raw_img = cv2.imread(image_path)
    if raw_img is None:
        raise FileNotFoundError(f"Could not read image: {image_path}")
    # NOTE(review): unlike get_3d_location, the raw model output is used here
    # without the (max - value) inversion - confirm which one is intended.
    depth_map = model.infer_image(raw_img)

    # Test conversion for a specific pixel
    pixel = (1164, 627)
    X, Y, Z = pixel_to_3d_coordinates(depth_map, camera_intrinsic, pixel)
    print(f"3D coordinates in camera frame: X = {X:.3f} m, Y = {Y:.3f} m, Z = {Z:.3f} m")