velodepth / config.json
lpiccinelli's picture
Push model using huggingface_hub.
2b2ee9f verified
{
"data": {
"augmentations": {
"affine_p": 0.0,
"blur_p": 0.1,
"cut_p": 0.0,
"flip_p": 0.5,
"flipt_p": 0.3,
"gamma_p": 0.5,
"grayscale_p": 0.1,
"invert_p": 0.0,
"jitter_p": 0.5,
"noise_pad": 1.0,
"only_zoom": true,
"random_blur": 2.0,
"random_gamma": 0.2,
"random_jitter": 0.1,
"random_rotation": 0.0,
"random_scale": 2.0,
"random_shear": 0.0,
"random_translate_x": 0.04,
"random_translate_y": 0.01,
"rotation_p": 0.0,
"scale_p": 0.0,
"shape_constraints": {
"height_min": 15,
"pixels_max": 600000.0,
"pixels_min": 200000.0,
"ratio_bounds": [
0.5,
2.5
],
"sample": true,
"shape_mult": 14,
"width_min": 15
},
"shape_mult": 14,
"test_context": 1.0,
"translate_p": 0.0
},
"crop": "garg",
"data_root": "datasets",
"flow": "of",
"image_shape": [
518,
518
],
"keepGT": 0,
"mini": 1.0,
"normalization": "imagenet",
"num_frames": 2,
"pair": 1,
"resize_method": "contextcrop",
"sampling": {},
"shape_constraints": {
"height_min": 15,
"pixels_max": 600000.0,
"pixels_min": 200000.0,
"ratio_bounds": [
0.5,
2.5
],
"sample": true,
"shape_mult": 14,
"width_min": 15
},
"train_datasets": [],
"val_datasets": [
"ScanNetVid",
"VKITTI",
"Bonn",
"TUM",
"Sintel"
]
},
"eps": 1e-06,
"generic": {
"deterministic": true,
"name_page": "velodepth",
"seed": 42
},
"model": {
"expansion": 4,
"flow_encoder": {
"embed_dims": [
80,
160
],
"frozen_stages": -1,
"name": "convnextv2_nano",
"num_levels": 2,
"pretrained": "timm"
},
"layer_scale": 1.0,
"name": "VeloDepth",
"num_heads": 8,
"pixel_decoder": {
"depths": [
2,
2,
2
],
"dropout": 0.0,
"hidden_dim": 512,
"kernel_size": 3,
"name": "Decoder",
"num_fusion_block": 1,
"num_prompt_blocks": 1,
"out_dim": 64
},
"pixel_encoder": {
"cls_token_embed_dims": [
1024,
1024,
1024,
1024,
1024,
1024,
1024,
1024,
1024,
1024,
1024,
1024,
1024,
1024,
1024,
1024,
1024,
1024,
1024,
1024,
1024,
1024,
1024,
1024
],
"depths": [
6,
12,
18,
24
],
"embed_dim": 1024,
"embed_dims": [
1024,
1024,
1024,
1024,
1024,
1024,
1024,
1024,
1024,
1024,
1024,
1024,
1024,
1024,
1024,
1024,
1024,
1024,
1024,
1024,
1024,
1024,
1024,
1024
],
"freeze_norm": true,
"frozen_stages": 0,
"lr": 3e-06,
"name": "dinov2_vitl14",
"num_register_tokens": 0,
"output_idx": [
6,
12,
18,
24
],
"patch_size": 14,
"pretrained": null,
"stacking_fn": "last",
"use_norm": true,
"wd": 0.1
},
"residual_encoder": {
"embed_dim": 96,
"embed_dims": [
96,
192,
384,
768
],
"frozen_stages": 0,
"lr": 0.0001,
"name": "convnextv2_tiny",
"num_levels": 1,
"pretrained": "timm",
"wd": 0.01
}
},
"training": {
"f16": "f16",
"losses": {
"camera": {
"name": "Dummy",
"weight": 1.0
},
"depth": {
"name": "Dummy",
"weight": 1.0
},
"edge": {
"name": "Dummy",
"weight": 1.0
},
"features": {
"name": "Dummy",
"weight": 1.0
},
"flow": {
"name": "Dummy",
"weight": 1.0
},
"self": {
"name": "Dummy",
"weight": 1.0
}
}
}
}