{ "data": { "augmentations": { "affine_p": 0.0, "blur_p": 0.1, "cut_p": 0.0, "flip_p": 0.5, "flipt_p": 0.3, "gamma_p": 0.5, "grayscale_p": 0.1, "invert_p": 0.0, "jitter_p": 0.5, "noise_pad": 1.0, "only_zoom": true, "random_blur": 2.0, "random_gamma": 0.2, "random_jitter": 0.1, "random_rotation": 0.0, "random_scale": 2.0, "random_shear": 0.0, "random_translate_x": 0.04, "random_translate_y": 0.01, "rotation_p": 0.0, "scale_p": 0.0, "shape_constraints": { "height_min": 15, "pixels_max": 600000.0, "pixels_min": 200000.0, "ratio_bounds": [ 0.5, 2.5 ], "sample": true, "shape_mult": 14, "width_min": 15 }, "shape_mult": 14, "test_context": 1.0, "translate_p": 0.0 }, "crop": "garg", "data_root": "datasets", "flow": "of", "image_shape": [ 518, 518 ], "keepGT": 0, "mini": 1.0, "normalization": "imagenet", "num_frames": 2, "pair": 1, "resize_method": "contextcrop", "sampling": {}, "shape_constraints": { "height_min": 15, "pixels_max": 600000.0, "pixels_min": 200000.0, "ratio_bounds": [ 0.5, 2.5 ], "sample": true, "shape_mult": 14, "width_min": 15 }, "train_datasets": [], "val_datasets": [ "ScanNetVid", "VKITTI", "Bonn", "TUM", "Sintel" ] }, "eps": 1e-06, "generic": { "deterministic": true, "name_page": "velodepth", "seed": 42 }, "model": { "expansion": 4, "flow_encoder": { "embed_dims": [ 80, 160 ], "frozen_stages": -1, "name": "convnextv2_nano", "num_levels": 2, "pretrained": "timm" }, "layer_scale": 1.0, "name": "VeloDepth", "num_heads": 8, "pixel_decoder": { "depths": [ 2, 2, 2 ], "dropout": 0.0, "hidden_dim": 512, "kernel_size": 3, "name": "Decoder", "num_fusion_block": 1, "num_prompt_blocks": 1, "out_dim": 64 }, "pixel_encoder": { "cls_token_embed_dims": [ 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024 ], "depths": [ 6, 12, 18, 24 ], "embed_dim": 1024, "embed_dims": [ 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024 ], "freeze_norm": true, "frozen_stages": 0, "lr": 3e-06, "name": "dinov2_vitl14", "num_register_tokens": 0, "output_idx": [ 6, 12, 18, 24 ], "patch_size": 14, "pretrained": null, "stacking_fn": "last", "use_norm": true, "wd": 0.1 }, "residual_encoder": { "embed_dim": 96, "embed_dims": [ 96, 192, 384, 768 ], "frozen_stages": 0, "lr": 0.0001, "name": "convnextv2_tiny", "num_levels": 1, "pretrained": "timm", "wd": 0.01 } }, "training": { "f16": "f16", "losses": { "camera": { "name": "Dummy", "weight": 1.0 }, "depth": { "name": "Dummy", "weight": 1.0 }, "edge": { "name": "Dummy", "weight": 1.0 }, "features": { "name": "Dummy", "weight": 1.0 }, "flow": { "name": "Dummy", "weight": 1.0 }, "self": { "name": "Dummy", "weight": 1.0 } } } }