File size: 2,813 Bytes
a2fbe0c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
{
  "architectures": [
    "VFMMultiFrameTransformer"
  ],
  "chosen_layers": [
    4,
    11,
    17,
    23
  ],
  "ffn_layer": "mlp",
  "geometry_aggregator": false,
  "geometry_aggregator_layer": 6,
  "grounding_ratio": 0.5,
  "hidden_act": "gelu",
  "hidden_size": 1024,
  "image_aggregator": false,
  "image_aggregator_layer": 6,
  "image_size": 224,
  "image_ssl": {
    "compute_precision": {
      "sharding_strategy": "SHARD_GRAD_OP"
    },
    "crops": {
      "local_crops_number": 2
    },
    "dino": {
      "force_weight_norm": false,
      "global_ignore_diagonal": true,
      "head_bottleneck_dim": 256,
      "head_hidden_dim": 2048,
      "head_n_prototypes": 65536,
      "head_nlayers": 3,
      "head_norm_last_layer": false,
      "koleo_distributed_replicas": 0,
      "koleo_loss_distributed": false,
      "koleo_loss_weight": 0.1,
      "koleo_topk": 1,
      "local_loss_weight_schedule": {
        "end": 0.5,
        "peak": 0.5,
        "start": 0.5,
        "warmup_epochs": 0
      },
      "loss_weight": 1.0,
      "reweight_dino_local_loss": false
    },
    "distillation": {
      "checkpoint_path": "",
      "enabled": false,
      "full_cfg_path": ""
    },
    "gram": {
      "ckpt": null,
      "compute_stats": false,
      "ema_teacher": false,
      "global_teacher_resize_antialias": false,
      "global_teacher_resize_method": "bicubic",
      "img_level": true,
      "it_first_update": 0,
      "it_load_ema_teacher": -1,
      "loss_weight": 1.0,
      "loss_weight_schedule": null,
      "max_updates": null,
      "normalized": true,
      "remove_neg": false,
      "remove_only_teacher_neg": false,
      "rep_update": true,
      "tokens_used": "all",
      "update_frequency": 50000,
      "use_loss": true
    },
    "ibot": {
      "force_masking_even_with_zero_weight": false,
      "head_bottleneck_dim": 256,
      "head_hidden_dim": 2048,
      "head_n_prototypes": 65536,
      "head_nlayers": 3,
      "head_norm_last_layer": false,
      "loss_weight": 1.0,
      "mask_random_circular_shift": false,
      "mask_ratio_min_max": [
        0.1,
        0.5
      ],
      "mask_sample_probability": 0.5,
      "separate_head": true
    },
    "multidistillation": {
      "enabled": false
    },
    "train": {
      "centering": "sinkhorn_knopp"
    }
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-06,
  "mlp_ratio": 4.0,
  "mm_projector_type": "mlp2x_gelu",
  "model_type": "vfm",
  "num_attention_heads": 16,
  "num_channels": 3,
  "num_experts": 8,
  "num_frames": 16,
  "patch_embed_name": "dinov3_vitl16_torch",
  "patch_size": 16,
  "top_k": 2,
  "torch_dtype": "float32",
  "transformers_version": "4.52.3",
  "upcycle_to_moe": false,
  "video_aggregator": true,
  "video_aggregator_layer": 24
}