File size: 7,540 Bytes
7134ce7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
# Copyright (c) Kangan Qian. All rights reserved.
# Authors: Kangan Qian (Tsinghua University, Xiaomi Corporation)
# Description: 3D location estimation from 2D images using depth estimation

import cv2
import numpy as np
import torch
import matplotlib.pyplot as plt
from PIL import Image
from third_party.DAM.depth_anything_v2.dpt import DepthAnythingV2
from third_party.yoloworld_demo import get_2dloc_open_vocabulary_detector


def calculate_average_depth(depth_map: np.ndarray, center_pixel: tuple, neighborhood_size: int = 5) -> float:
    """Calculate the average depth value around a specified pixel.

    Zero-valued depths (pixels with no valid estimate) are excluded from
    the average, and the neighborhood window is clipped at image borders.

    Args:
        depth_map (np.ndarray): HxW depth map.
        center_pixel (tuple): (x, y) coordinates of the center pixel.
        neighborhood_size (int): Side length of the square neighborhood
            (default: 5, i.e. a 5x5 window).

    Returns:
        float: Average non-zero depth in the neighborhood, or 0.0 when the
            whole window contains no valid (non-zero) depth values.
    """
    half_size = neighborhood_size // 2
    x, y = center_pixel

    # Clip the window to the image bounds (x indexes columns, y rows).
    x_start = max(0, x - half_size)
    x_end = min(depth_map.shape[1], x + half_size + 1)
    y_start = max(0, y - half_size)
    y_end = min(depth_map.shape[0], y + half_size + 1)

    # Extract neighborhood depths
    neighborhood_depths = depth_map[y_start:y_end, x_start:x_end]

    # Calculate average depth ignoring zero values (invalid measurements)
    valid_depths = neighborhood_depths[neighborhood_depths != 0]
    if valid_depths.size == 0:
        return 0.0

    # Cast so the function returns a plain Python float as annotated,
    # not a numpy scalar.
    return float(np.mean(valid_depths))


def pixel_to_3d_coordinates(
        depth_map: np.ndarray,
        camera_intrinsic: np.ndarray,
        pixel: tuple
    ) -> tuple:
    """Back-project a 2D pixel into 3D camera coordinates.

    Uses the pinhole camera model: the depth at the pixel — averaged over
    a small neighborhood for robustness — is combined with the focal
    lengths and principal point taken from the intrinsic matrix.

    Args:
        depth_map (np.ndarray): Depth map in meters.
        camera_intrinsic (np.ndarray): 4x4 camera intrinsic matrix.
        pixel (tuple): Pixel coordinates (x, y).

    Returns:
        tuple: 3D coordinates (X, Y, Z) in the camera coordinate system.
    """
    u, v = pixel

    # Prefer the neighborhood-averaged depth; fall back to the raw value
    # at the pixel when the entire neighborhood is invalid (all zeros).
    depth = calculate_average_depth(depth_map, pixel)
    if depth == 0:
        depth = depth_map[v, u]

    # Pinhole parameters: focal lengths and principal point.
    fx, fy = camera_intrinsic[0, 0], camera_intrinsic[1, 1]
    cx, cy = camera_intrinsic[0, 2], camera_intrinsic[1, 2]

    # Back-project: X = (u - cx) * Z / fx, Y = (v - cy) * Z / fy, Z = depth.
    x_cam = (u - cx) * depth / fx
    y_cam = (v - cy) * depth / fy
    return x_cam, y_cam, depth


def get_3d_location(
        text: list = None,
        image_path: str = None,
        debug: bool = False,
        model_path: str = "./pretrained_model/depth_anything_v2_vitb.pth",
        encoder_type: str = "vitb",
        camera_intrinsic: np.ndarray = None
    ) -> tuple:
    """Estimate 3D locations of objects in an image.

    Pipeline: run monocular depth estimation (Depth Anything V2) on the
    image, detect each requested object in 2D with the open-vocabulary
    detector, then back-project the detected pixel into 3D camera
    coordinates using the intrinsic matrix.

    Args:
        text (list): List of object names to locate (default: ['car']).
        image_path (str): Path to input image.
        debug (bool): Whether to save a depth-map visualization to
            ./debug/debug_depth.png.
        model_path (str): Path to the depth estimation model checkpoint.
        encoder_type (str): Encoder type for the depth model
            ('vits' | 'vitb' | 'vitl' | 'vitg').
        camera_intrinsic (np.ndarray): 4x4 camera intrinsic matrix.
            Defaults to a fixed front-camera calibration if not provided.

    Returns:
        tuple:
            - prompt (str): Description of 3D locations.
            - spatial_location (list): List of [X, Y, Z] for each object.

    Raises:
        FileNotFoundError: If the image cannot be read.
    """
    # Avoid a mutable default argument; behaves exactly as text=['car'].
    if text is None:
        text = ['car']

    # Default camera intrinsic matrix if not provided
    if camera_intrinsic is None:
        camera_intrinsic = np.array([
            [1.25281310e+03, 0.00000000e+00, 8.26588115e+02, 0.00000000e+00],
            [0.00000000e+00, 1.25281310e+03, 4.69984663e+02, 0.00000000e+00],
            [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 0.00000000e+00],
            [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.00000000e+00]
        ])

    # Configure depth model based on encoder type
    model_configs = {
        'vits': {'encoder': 'vits', 'features': 64, 'out_channels': [48, 96, 192, 384]},
        'vitb': {'encoder': 'vitb', 'features': 128, 'out_channels': [96, 192, 384, 768]},
        'vitl': {'encoder': 'vitl', 'features': 256, 'out_channels': [256, 512, 1024, 1024]},
        'vitg': {'encoder': 'vitg', 'features': 384, 'out_channels': [1536, 1536, 1536, 1536]}
    }

    # Initialize and load depth estimation model.
    model = DepthAnythingV2(**model_configs[encoder_type]).to('cuda')
    # map_location prevents the checkpoint from being deserialized onto
    # whichever device it was saved from before being moved.
    model.load_state_dict(torch.load(model_path, map_location='cuda'))
    model.eval()

    # Load and process image; cv2.imread returns None on failure instead
    # of raising, which would otherwise crash deep inside the model.
    raw_img = cv2.imread(image_path)
    if raw_img is None:
        raise FileNotFoundError(f"Could not read image: {image_path}")
    depth_map = model.infer_image(raw_img)  # HxW depth map

    # Invert the map so larger values mean farther from the camera.
    # NOTE(review): this produces relative, not metric, depth — confirm
    # the model variant's output convention.
    max_depth = np.max(depth_map)
    depth_map = max_depth - depth_map

    # Save debug visualization if requested
    if debug:
        plt.figure()  # fresh figure so repeated calls don't stack colorbars
        plt.imshow(depth_map, cmap='jet')
        plt.colorbar()
        plt.title("Predicted Depth Map")
        plt.savefig("./debug/debug_depth.png")
        plt.close()  # release the figure to avoid a memory leak

    # Process each requested object
    prompt = ""
    spatial_location = []
    for obj in text:
        # Get 2D location using open-vocabulary detector
        loc2d_prompt, location_2d = get_2dloc_open_vocabulary_detector(
            text=[obj],
            image_path=image_path
        )

        prompt += loc2d_prompt

        # Detector failure: report it in the prompt and move on.
        if location_2d is None:
            prompt += f"\nFailed to estimate 3D location for {obj}. You must infer or identify it yourself."
            continue

        # Convert to integer pixel coordinates
        pixel = [int(round(coord)) for coord in location_2d]

        # Back-project the detection pixel to 3D camera coordinates.
        X, Y, Z = pixel_to_3d_coordinates(depth_map, camera_intrinsic, pixel)
        spatial_location.append([X, Y, Z])

        prompt += f"\nEstimated 3D location(x,y,z) for {obj} in camera coordinates: [{X:.2f}, {Y:.2f}, {Z:.2f}], z={Z:.2f}"

    prompt += f"""(Note: The Z coordinate represents depth, with smaller Z-values indicating closer proximity to the camera/front.)"""
    return prompt, spatial_location


def _demo() -> None:
    """Run a single-pixel 3D back-projection demo on a sample image."""
    img_file = "./third_party/nuscenes_CAM_FRONT_5978.webp"
    ckpt_file = "./pretrained_model/depth_anything_v2_vitb.pth"

    # Front-camera intrinsic matrix (4x4).
    intrinsics = np.array([
        [1.25281310e+03, 0.00000000e+00, 8.26588115e+02, 0.00000000e+00],
        [0.00000000e+00, 1.25281310e+03, 4.69984663e+02, 0.00000000e+00],
        [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 0.00000000e+00],
        [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.00000000e+00]
    ])

    # Build the 'vitb' variant of the depth model and load its weights.
    net = DepthAnythingV2(
        encoder='vitb',
        features=128,
        out_channels=[96, 192, 384, 768]
    ).to('cuda')
    net.load_state_dict(torch.load(ckpt_file))
    net.eval()

    # Estimate a dense depth map for the sample frame.
    frame = cv2.imread(img_file)
    depth = net.infer_image(frame)

    # Back-project a hand-picked pixel and report its camera-frame position.
    px = (1164, 627)
    X, Y, Z = pixel_to_3d_coordinates(depth, intrinsics, px)
    print(f"3D coordinates in camera frame: X = {X:.3f} m, Y = {Y:.3f} m, Z = {Z:.3f} m")


if __name__ == '__main__':
    _demo()