lpiccinelli
/

velodepth

model_hub_mixin

monocular-metric-3D-estimation

pytorch_model_hub_mixin

Model card Files Files and versions

velodepth / config.json

lpiccinelli's picture

Push model using huggingface_hub.

2b2ee9f verified about 1 month ago

history blame contribute delete

4.23 kB

	{
	"data": {
	"augmentations": {
	"affine_p": 0.0,
	"blur_p": 0.1,
	"cut_p": 0.0,
	"flip_p": 0.5,
	"flipt_p": 0.3,
	"gamma_p": 0.5,
	"grayscale_p": 0.1,
	"invert_p": 0.0,
	"jitter_p": 0.5,
	"noise_pad": 1.0,
	"only_zoom": true,
	"random_blur": 2.0,
	"random_gamma": 0.2,
	"random_jitter": 0.1,
	"random_rotation": 0.0,
	"random_scale": 2.0,
	"random_shear": 0.0,
	"random_translate_x": 0.04,
	"random_translate_y": 0.01,
	"rotation_p": 0.0,
	"scale_p": 0.0,
	"shape_constraints": {
	"height_min": 15,
	"pixels_max": 600000.0,
	"pixels_min": 200000.0,
	"ratio_bounds": [
	0.5,
	2.5
	],
	"sample": true,
	"shape_mult": 14,
	"width_min": 15
	},
	"shape_mult": 14,
	"test_context": 1.0,
	"translate_p": 0.0
	},
	"crop": "garg",
	"data_root": "datasets",
	"flow": "of",
	"image_shape": [
	518,
	518
	],
	"keepGT": 0,
	"mini": 1.0,
	"normalization": "imagenet",
	"num_frames": 2,
	"pair": 1,
	"resize_method": "contextcrop",
	"sampling": {},
	"shape_constraints": {
	"height_min": 15,
	"pixels_max": 600000.0,
	"pixels_min": 200000.0,
	"ratio_bounds": [
	0.5,
	2.5
	],
	"sample": true,
	"shape_mult": 14,
	"width_min": 15
	},
	"train_datasets": [],
	"val_datasets": [
	"ScanNetVid",
	"VKITTI",
	"Bonn",
	"TUM",
	"Sintel"
	]
	},
	"eps": 1e-06,
	"generic": {
	"deterministic": true,
	"name_page": "velodepth",
	"seed": 42
	},
	"model": {
	"expansion": 4,
	"flow_encoder": {
	"embed_dims": [
	80,
	160
	],
	"frozen_stages": -1,
	"name": "convnextv2_nano",
	"num_levels": 2,
	"pretrained": "timm"
	},
	"layer_scale": 1.0,
	"name": "VeloDepth",
	"num_heads": 8,
	"pixel_decoder": {
	"depths": [
	2,
	2,
	2
	],
	"dropout": 0.0,
	"hidden_dim": 512,
	"kernel_size": 3,
	"name": "Decoder",
	"num_fusion_block": 1,
	"num_prompt_blocks": 1,
	"out_dim": 64
	},
	"pixel_encoder": {
	"cls_token_embed_dims": [
	1024,
	1024,
	1024,
	1024,
	1024,
	1024,
	1024,
	1024,
	1024,
	1024,
	1024,
	1024,
	1024,
	1024,
	1024,
	1024,
	1024,
	1024,
	1024,
	1024,
	1024,
	1024,
	1024,
	1024
	],
	"depths": [
	6,
	12,
	18,
	24
	],
	"embed_dim": 1024,
	"embed_dims": [
	1024,
	1024,
	1024,
	1024,
	1024,
	1024,
	1024,
	1024,
	1024,
	1024,
	1024,
	1024,
	1024,
	1024,
	1024,
	1024,
	1024,
	1024,
	1024,
	1024,
	1024,
	1024,
	1024,
	1024
	],
	"freeze_norm": true,
	"frozen_stages": 0,
	"lr": 3e-06,
	"name": "dinov2_vitl14",
	"num_register_tokens": 0,
	"output_idx": [
	6,
	12,
	18,
	24
	],
	"patch_size": 14,
	"pretrained": null,
	"stacking_fn": "last",
	"use_norm": true,
	"wd": 0.1
	},
	"residual_encoder": {
	"embed_dim": 96,
	"embed_dims": [
	96,
	192,
	384,
	768
	],
	"frozen_stages": 0,
	"lr": 0.0001,
	"name": "convnextv2_tiny",
	"num_levels": 1,
	"pretrained": "timm",
	"wd": 0.01
	}
	},
	"training": {
	"f16": "f16",
	"losses": {
	"camera": {
	"name": "Dummy",
	"weight": 1.0
	},
	"depth": {
	"name": "Dummy",
	"weight": 1.0
	},
	"edge": {
	"name": "Dummy",
	"weight": 1.0
	},
	"features": {
	"name": "Dummy",
	"weight": 1.0
	},
	"flow": {
	"name": "Dummy",
	"weight": 1.0
	},
	"self": {
	"name": "Dummy",
	"weight": 1.0
	}
	}
	}
	}