---
# Training configuration: per-module precision/sharding, DINO and iBOT
# settings, data/training options, student and teacher settings, optimizer
# schedule, crop geometry, and evaluation cadence.
#
# NOTE(review): the nesting in this file was reconstructed from key names
# and ordering (the source carried no indentation) — confirm it against the
# schema expected by the code that loads this file.

MODEL:
  WEIGHTS: ''

# Sharding strategy and mixed-precision dtypes, listed per module
# (backbone, dino_head, ibot_head) for both teacher and student.
compute_precision:
  grad_scaler: true
  teacher:
    backbone:
      sharding_strategy: SHARD_GRAD_OP
      mixed_precision:
        param_dtype: fp16
        reduce_dtype: fp16
        buffer_dtype: fp32
    dino_head:
      sharding_strategy: SHARD_GRAD_OP
      mixed_precision:
        param_dtype: fp16
        reduce_dtype: fp16
        buffer_dtype: fp32
    ibot_head:
      sharding_strategy: SHARD_GRAD_OP
      mixed_precision:
        param_dtype: fp16
        reduce_dtype: fp16
        buffer_dtype: fp32
  student:
    backbone:
      sharding_strategy: SHARD_GRAD_OP
      mixed_precision:
        param_dtype: fp16
        reduce_dtype: fp16
        buffer_dtype: fp32
    # The two student heads reduce in fp32, unlike every other module here
    # (fp16). NOTE(review): presumably intentional — confirm before unifying.
    dino_head:
      sharding_strategy: SHARD_GRAD_OP
      mixed_precision:
        param_dtype: fp16
        reduce_dtype: fp32
        buffer_dtype: fp32
    ibot_head:
      sharding_strategy: SHARD_GRAD_OP
      mixed_precision:
        param_dtype: fp16
        reduce_dtype: fp32
        buffer_dtype: fp32

# DINO loss weight and head dimensions.
dino:
  loss_weight: 1.0
  head_n_prototypes: 65536
  head_bottleneck_dim: 256
  head_nlayers: 3
  head_hidden_dim: 2048
  koleo_loss_weight: 0.1

# iBOT loss weight, masking parameters and head dimensions. The head_* values
# match those under `dino`.
ibot:
  loss_weight: 1.0
  mask_sample_probability: 0.5
  # Two-element list; by the key name, [min, max] of the mask ratio.
  mask_ratio_min_max:
    - 0.1
    - 0.5
  separate_head: false
  head_n_prototypes: 65536
  head_bottleneck_dim: 256
  head_nlayers: 3
  head_hidden_dim: 2048

train:
  batch_size_per_gpu: 64
  # Quoted so the embedded ':' can never be read as a mapping separator.
  dataset_path: 'ImageNet:split=TRAIN'
  output_dir: '.'
  saveckp_freq: 20
  seed: 0
  num_workers: 10
  OFFICIAL_EPOCH_LENGTH: 1250
  cache_dataset: true
  centering: 'centering'

# Student settings (architecture, bias flags, interpolation options).
student:
  arch: vit_large
  patch_size: 16
  drop_path_rate: 0.3
  layerscale: 1.0e-05
  drop_path_uniform: true
  pretrained_weights: ''
  ffn_layer: 'mlp'
  block_chunks: 0
  qkv_bias: true
  proj_bias: true
  ffn_bias: true
  num_register_tokens: 0
  interpolate_antialias: false
  interpolate_offset: 0.1

# Teacher momentum and temperature settings.
teacher:
  momentum_teacher: 0.992
  final_momentum_teacher: 1
  warmup_teacher_temp: 0.04
  teacher_temp: 0.07
  warmup_teacher_temp_epochs: 30

optim:
  epochs: 100
  weight_decay: 0.04
  weight_decay_end: 0.4
  base_lr: 0.004
  # NOTE(review): presumably a placeholder that is derived at runtime from
  # base_lr and scaling_rule — confirm against the training code.
  lr: 0.0
  warmup_epochs: 10
  min_lr: 1.0e-06
  clip_grad: 3.0
  freeze_last_layer_epochs: 1
  scaling_rule: sqrt_wrt_1024
  patch_embed_lr_mult: 0.2
  layerwise_decay: 0.9
  adamw_beta1: 0.9
  adamw_beta2: 0.999

# Crop settings. The upper bound of local_crops_scale equals the lower bound
# of global_crops_scale (0.32).
crops:
  global_crops_scale:
    - 0.32
    - 1.0
  local_crops_number: 8
  local_crops_scale:
    - 0.05
    - 0.32
  global_crops_size: 224
  local_crops_size: 96

evaluation:
  # 12500 = 10 x train.OFFICIAL_EPOCH_LENGTH (1250).
  eval_period_iterations: 12500