Upload folder using huggingface_hub

7134ce7 verified about 2 months ago

7.54 kB

	# Copyright (c) Kangan Qian. All rights reserved.
	# Authors: Kangan Qian (Tsinghua University, Xiaomi Corporation)
	# Description: 3D location estimation from 2D images using depth estimation

	import cv2
	import numpy as np
	import torch
	import matplotlib.pyplot as plt
	from PIL import Image
	from third_party.DAM.depth_anything_v2.dpt import DepthAnythingV2
	from third_party.yoloworld_demo import get_2dloc_open_vocabulary_detector


	def calculate_average_depth(depth_map: np.ndarray, center_pixel: tuple, neighborhood_size: int = 5) -> float:
	"""
	Calculate the average depth value around a specified pixel

	Args:
	depth_map (np.ndarray): HxW depth map
	center_pixel (tuple): (x, y) coordinates of the center pixel
	neighborhood_size (int): Size of the neighborhood area (default: 5x5)

	Returns:
	float: Average depth value in the neighborhood
	"""
	half_size = neighborhood_size // 2
	x, y = center_pixel

	# Ensure boundaries are within image dimensions
	x_start = max(0, x - half_size)
	x_end = min(depth_map.shape[1], x + half_size + 1)
	y_start = max(0, y - half_size)
	y_end = min(depth_map.shape[0], y + half_size + 1)

	# Extract neighborhood depths
	neighborhood_depths = depth_map[y_start:y_end, x_start:x_end]

	# Calculate average depth ignoring zero values
	valid_depths = neighborhood_depths[neighborhood_depths != 0]
	if valid_depths.size == 0:
	return 0.0

	return np.mean(valid_depths)


	def pixel_to_3d_coordinates(
	depth_map: np.ndarray,
	camera_intrinsic: np.ndarray,
	pixel: tuple
	) -> tuple:
	"""
	Convert 2D pixel coordinates to 3D world coordinates using depth map and camera intrinsics

	Args:
	depth_map (np.ndarray): Depth map in meters
	camera_intrinsic (np.ndarray): 4x4 camera intrinsic matrix
	pixel (tuple): Pixel coordinates (x, y)

	Returns:
	tuple: 3D coordinates (X, Y, Z) in camera coordinate system
	"""
	x, y = pixel

	# Calculate average depth in neighborhood for robustness
	avg_depth = calculate_average_depth(depth_map, pixel)
	if avg_depth != 0:
	D = avg_depth
	else:
	D = depth_map[y, x]

	# Extract camera intrinsic parameters
	fx = camera_intrinsic[0, 0]
	fy = camera_intrinsic[1, 1]
	u0 = camera_intrinsic[0, 2]
	v0 = camera_intrinsic[1, 2]

	# Calculate 3D coordinates in camera frame
	X_cam = (x - u0) * D / fx
	Y_cam = (y - v0) * D / fy
	Z_cam = D

	return X_cam, Y_cam, Z_cam


	def get_3d_location(
	text: list = ['car'],
	image_path: str = None,
	debug: bool = False,
	model_path: str = "./pretrained_model/depth_anything_v2_vitb.pth",
	encoder_type: str = "vitb",
	camera_intrinsic: np.ndarray = None
	) -> tuple:
	"""
	Estimate 3D locations of objects in an image

	Args:
	text (list): List of object names to locate
	image_path (str): Path to input image
	debug (bool): Whether to save debug visualization
	model_path (str): Path to depth estimation model
	encoder_type (str): Encoder type for depth model
	camera_intrinsic (np.ndarray): Camera intrinsic matrix

	Returns:
	tuple:
	- prompt (str): Description of 3D locations
	- spatial_location (list): List of 3D coordinates for each object
	"""
	# Default camera intrinsic matrix if not provided
	if camera_intrinsic is None:
	camera_intrinsic = np.array([
	[1.25281310e+03, 0.00000000e+00, 8.26588115e+02, 0.00000000e+00],
	[0.00000000e+00, 1.25281310e+03, 4.69984663e+02, 0.00000000e+00],
	[0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 0.00000000e+00],
	[0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.00000000e+00]
	])

	# Configure depth model based on encoder type
	model_configs = {
	'vits': {'encoder': 'vits', 'features': 64, 'out_channels': [48, 96, 192, 384]},
	'vitb': {'encoder': 'vitb', 'features': 128, 'out_channels': [96, 192, 384, 768]},
	'vitl': {'encoder': 'vitl', 'features': 256, 'out_channels': [256, 512, 1024, 1024]},
	'vitg': {'encoder': 'vitg', 'features': 384, 'out_channels': [1536, 1536, 1536, 1536]}
	}

	# Initialize and load depth estimation model
	model = DepthAnythingV2(**model_configs[encoder_type]).to('cuda')
	model.load_state_dict(torch.load(model_path))
	model.eval()

	# Load and process image
	raw_img = cv2.imread(image_path)
	depth_map = model.infer_image(raw_img) # HxW depth map

	# Invert depth map if needed (depends on model output)
	max_depth = np.max(depth_map)
	depth_map = max_depth - depth_map

	# Save debug visualization if requested
	if debug:
	plt.imshow(depth_map, cmap='jet')
	plt.colorbar()
	plt.title("Predicted Depth Map")
	plt.savefig("./debug/debug_depth.png")

	# Process each object
	prompt = ""
	spatial_location = []
	for obj in text:
	# Get 2D location using open-vocabulary detector
	loc2d_prompt, location_2d = get_2dloc_open_vocabulary_detector(
	text=[obj],
	image_path=image_path
	)

	prompt += loc2d_prompt

	# Handle case where 2D location not found
	if location_2d is None:
	prompt += f"\nFailed to estimate 3D location for {obj}. You must infer or identify it yourself."
	continue

	# Convert to integer pixel coordinates
	pixel = [int(round(coord)) for coord in location_2d]

	# Calculate 3D coordinates
	X, Y, Z = pixel_to_3d_coordinates(depth_map, camera_intrinsic, pixel)
	spatial_location.append([X, Y, Z])

	prompt += f"\nEstimated 3D location(x,y,z) for {obj} in camera coordinates: [{X:.2f}, {Y:.2f}, {Z:.2f}], z={Z:.2f}"

	prompt += f"""(Note: The Z coordinate represents depth, with smaller Z-values indicating closer proximity to the camera/front.)"""
	return prompt, spatial_location


	if __name__ == '__main__':
	# Example usage
	image_path = "./third_party/nuscenes_CAM_FRONT_5978.webp"
	model_path = "./pretrained_model/depth_anything_v2_vitb.pth"

	# Camera intrinsic matrix
	camera_intrinsic = np.array([
	[1.25281310e+03, 0.00000000e+00, 8.26588115e+02, 0.00000000e+00],
	[0.00000000e+00, 1.25281310e+03, 4.69984663e+02, 0.00000000e+00],
	[0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 0.00000000e+00],
	[0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.00000000e+00]
	])

	# Initialize depth model
	model = DepthAnythingV2(
	encoder='vitb',
	features=128,
	out_channels=[96, 192, 384, 768]
	).to('cuda')
	model.load_state_dict(torch.load(model_path))
	model.eval()

	# Load image and estimate depth
	raw_img = cv2.imread(image_path)
	depth_map = model.infer_image(raw_img)

	# Test conversion for a specific pixel
	pixel = (1164, 627)
	X, Y, Z = pixel_to_3d_coordinates(depth_map, camera_intrinsic, pixel)

	print(f"3D coordinates in camera frame: X = {X:.3f} m, Y = {Y:.3f} m, Z = {Z:.3f} m")