import os

import cv2
import matplotlib.pyplot as plt
import numpy as np
import torch

from third_party.DAM.depth_anything_v2.dpt import DepthAnythingV2
from third_party.yoloworld_demo import get_2dloc_open_vocabulary_detector

def calculate_average_depth(depth_map: np.ndarray, center_pixel: tuple, neighborhood_size: int = 5) -> float:
    """
    Calculate the average depth value around a specified pixel.

    Args:
        depth_map (np.ndarray): HxW depth map
        center_pixel (tuple): (x, y) coordinates of the center pixel
        neighborhood_size (int): Side length of the square neighborhood (default: 5, i.e. a 5x5 window)

    Returns:
        float: Average depth over the valid (non-zero) pixels in the neighborhood,
            or 0.0 if none are valid
    """
    half_size = neighborhood_size // 2
    x, y = center_pixel

    # Clip the window to the image bounds.
    x_start = max(0, x - half_size)
    x_end = min(depth_map.shape[1], x + half_size + 1)
    y_start = max(0, y - half_size)
    y_end = min(depth_map.shape[0], y + half_size + 1)

    neighborhood_depths = depth_map[y_start:y_end, x_start:x_end]

    # Zero marks invalid depth; exclude such pixels from the average.
    valid_depths = neighborhood_depths[neighborhood_depths != 0]
    if valid_depths.size == 0:
        return 0.0

    return float(np.mean(valid_depths))
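
# A quick illustration of the helper above (toy values, illustrative only):
# in a 5x5 window centered at (2, 2) where only two pixels are non-zero,
# the zeros are excluded, so the mean is (4.0 + 6.0) / 2 = 5.0, not 10 / 25.
#
#     toy = np.zeros((5, 5), dtype=np.float32)
#     toy[2, 2], toy[2, 3] = 4.0, 6.0
#     assert calculate_average_depth(toy, (2, 2)) == 5.0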


def pixel_to_3d_coordinates(
    depth_map: np.ndarray,
    camera_intrinsic: np.ndarray,
    pixel: tuple
) -> tuple:
    """
    Convert 2D pixel coordinates to 3D coordinates using a depth map and camera intrinsics.

    Args:
        depth_map (np.ndarray): Depth map in meters (with a relative-depth model,
            the result is only correct up to an unknown scale)
        camera_intrinsic (np.ndarray): 4x4 camera intrinsic matrix in homogeneous form
        pixel (tuple): Pixel coordinates (x, y)

    Returns:
        tuple: 3D coordinates (X, Y, Z) in the camera coordinate system
    """
    x, y = pixel

    # Average the depth over a small neighborhood to suppress noise; fall back
    # to the single-pixel value if the whole neighborhood is invalid.
    avg_depth = calculate_average_depth(depth_map, pixel)
    D = avg_depth if avg_depth != 0 else depth_map[y, x]

    fx = camera_intrinsic[0, 0]
    fy = camera_intrinsic[1, 1]
    u0 = camera_intrinsic[0, 2]
    v0 = camera_intrinsic[1, 2]

    # Standard pinhole back-projection.
    X_cam = (x - u0) * D / fx
    Y_cam = (y - v0) * D / fy
    Z_cam = D

    return X_cam, Y_cam, Z_cam
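
# Worked example of the pinhole back-projection (illustrative numbers, not
# taken from this script): with fx = fy = 1000, (u0, v0) = (800, 450),
# pixel (900, 500), and D = 10 m:
#     X = (900 - 800) * 10 / 1000 = 1.0 m
#     Y = (500 - 450) * 10 / 1000 = 0.5 m
#     Z = 10 m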


def get_3d_location(
    text: list = ['car'],
    image_path: str = None,
    debug: bool = False,
    model_path: str = "./pretrained_model/depth_anything_v2_vitb.pth",
    encoder_type: str = "vitb",
    camera_intrinsic: np.ndarray = None
) -> tuple:
    """
    Estimate 3D locations of objects in an image.

    Args:
        text (list): List of object names to locate
        image_path (str): Path to the input image
        debug (bool): Whether to save a debug visualization of the depth map
        model_path (str): Path to the depth estimation model checkpoint
        encoder_type (str): Encoder type for the depth model ('vits', 'vitb', 'vitl', or 'vitg')
        camera_intrinsic (np.ndarray): 4x4 camera intrinsic matrix

    Returns:
        tuple:
            - prompt (str): Textual description of the estimated 3D locations
            - spatial_location (list): List of 3D coordinates, one per located object
    """
    if camera_intrinsic is None:
        camera_intrinsic = np.array([
            [1.25281310e+03, 0.00000000e+00, 8.26588115e+02, 0.00000000e+00],
            [0.00000000e+00, 1.25281310e+03, 4.69984663e+02, 0.00000000e+00],
            [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 0.00000000e+00],
            [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.00000000e+00]
        ])

    # DPT decoder configurations for the Depth Anything V2 encoder variants.
    model_configs = {
        'vits': {'encoder': 'vits', 'features': 64, 'out_channels': [48, 96, 192, 384]},
        'vitb': {'encoder': 'vitb', 'features': 128, 'out_channels': [96, 192, 384, 768]},
        'vitl': {'encoder': 'vitl', 'features': 256, 'out_channels': [256, 512, 1024, 1024]},
        'vitg': {'encoder': 'vitg', 'features': 384, 'out_channels': [1536, 1536, 1536, 1536]}
    }

    model = DepthAnythingV2(**model_configs[encoder_type]).to('cuda')
    model.load_state_dict(torch.load(model_path, map_location='cpu'))
    model.eval()
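
    # Optional robustness sketch (an assumption, not in the original): to run
    # on machines without a GPU, the device could be chosen dynamically:
    #     device = 'cuda' if torch.cuda.is_available() else 'cpu'
    #     model = DepthAnythingV2(**model_configs[encoder_type]).to(device)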

    raw_img = cv2.imread(image_path)
    depth_map = model.infer_image(raw_img)

    # Depth Anything V2 predicts (relative) inverse depth, where larger values
    # mean closer; flip it so that larger values mean farther from the camera.
    max_depth = np.max(depth_map)
    depth_map = max_depth - depth_map

    if debug:
        os.makedirs("./debug", exist_ok=True)
        plt.imshow(depth_map, cmap='jet')
        plt.colorbar()
        plt.title("Predicted Depth Map")
        plt.savefig("./debug/debug_depth.png")
        plt.close()

    prompt = ""
    spatial_location = []
    for obj in text:
        # Locate the object in 2D with the open-vocabulary detector.
        loc2d_prompt, location_2d = get_2dloc_open_vocabulary_detector(
            text=[obj],
            image_path=image_path
        )
        prompt += loc2d_prompt

        if location_2d is None:
            prompt += f"\nFailed to estimate 3D location for {obj}. You must infer or identify it yourself."
            continue

        # Round the detected (x, y) center to integer pixel coordinates.
        pixel = [int(round(coord)) for coord in location_2d]

        X, Y, Z = pixel_to_3d_coordinates(depth_map, camera_intrinsic, pixel)
        spatial_location.append([X, Y, Z])

        prompt += f"\nEstimated 3D location (x, y, z) for {obj} in camera coordinates: [{X:.2f}, {Y:.2f}, {Z:.2f}]"

    prompt += "\n(Note: The Z coordinate represents depth; smaller Z values indicate closer proximity to the camera/front.)"
    return prompt, spatial_location
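
# Example call (object names are illustrative; the detector and checkpoint
# must be available locally):
#     prompt, locs = get_3d_location(text=['car', 'pedestrian'],
#                                    image_path='./third_party/nuscenes_CAM_FRONT_5978.webp',
#                                    debug=True)
#     print(prompt)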


if __name__ == '__main__':
    image_path = "./third_party/nuscenes_CAM_FRONT_5978.webp"
    model_path = "./pretrained_model/depth_anything_v2_vitb.pth"

    camera_intrinsic = np.array([
        [1.25281310e+03, 0.00000000e+00, 8.26588115e+02, 0.00000000e+00],
        [0.00000000e+00, 1.25281310e+03, 4.69984663e+02, 0.00000000e+00],
        [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 0.00000000e+00],
        [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.00000000e+00]
    ])

    # Build the ViT-B variant directly (matches model_configs['vitb'] above).
    model = DepthAnythingV2(
        encoder='vitb',
        features=128,
        out_channels=[96, 192, 384, 768]
    ).to('cuda')
    model.load_state_dict(torch.load(model_path, map_location='cpu'))
    model.eval()

    raw_img = cv2.imread(image_path)
    depth_map = model.infer_image(raw_img)

    # Flip the inverse-depth prediction so larger values mean farther,
    # mirroring the preprocessing in get_3d_location.
    depth_map = np.max(depth_map) - depth_map

    pixel = (1164, 627)
    X, Y, Z = pixel_to_3d_coordinates(depth_map, camera_intrinsic, pixel)

    print(f"3D coordinates in camera frame: X = {X:.3f} m, Y = {Y:.3f} m, Z = {Z:.3f} m")