File size: 2,169 Bytes
af758d1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from enum import Enum

class DataField(str, Enum):
    """Names of the data fields, usable as plain strings.

    The class mixes ``str`` into ``Enum``, so every member is a ``str``
    instance that compares equal to its value (for example
    ``DataField.CAPTION == "caption"``).

    The shape / dtype note above each member is carried over from the
    original field annotations in this file.
    """

    # RGB image with values in [0, 1].
    #   shape: [B, C, H, W], dtype: float32
    IMAGE_RGB = "image_rgb"

    # Camera-to-world transformation matrix.
    #   shape: [B, 4, 4], dtype: float32
    CAMERA_C2W_TRANSFORM = "camera_c2w_transform"

    # OpenCV pinhole intrinsics, stored as [fx, fy, cx, cy].
    #   shape: [B, 4], dtype: float32
    CAMERA_INTRINSICS = "camera_intrinsics"

    # Captions, given as a list of size B.
    CAPTION = "caption"

    # Depth map in metric scale.
    #   shape: [B, H, W], dtype: float32
    METRIC_DEPTH = "metric_depth"

    # Instance mask; 0 marks the background.
    #   shape: [B, H, W], dtype: uint8
    DYNAMIC_INSTANCE_MASK = "dynamic_instance_mask"

    # Backward flow, from this frame to the previous frame.
    #   shape: [B, H, W], dtype: float32
    BACKWARD_FLOW = "backward_flow"

    # Ray direction (assumes no motion / rolling shutter).
    #   shape: [B, H, W, 3], dtype: float32
    RAY_DIRECTION = "ray_direction"

    # TODO: add a description for this field.
    OBJECT_BBOX = "object_bbox"

    # A list of float32 point clouds.
    # TODO: add a fuller description for this field.
    POINT_CLOUD = "point_cloud"

    # N future positions.
    #   shape: [B, N, (3 + 3x3)]
    # Along the last dimension the first 3 entries are the xyz location
    # and the last 9 are the rotation. B corresponds to the number of
    # timestamps for the base camera type.
    TRAJECTORY = "trajectory"

    # Dictionary of meta data.
    #   shape: [V,]
    META_DATA = "meta_data"

    # Language embedding; N is variable for different V.
    #   shape: [V, N, C], dtype: float32
    LANGUAGE_EMBEDDING = "language_embedding"

    # Latent image.
    #   shape: [B, C, T, H, W], dtype: float32
    LATENT_RGB = "latent_rgb"