2025-04-29,17:42:33 | INFO | No latest resume checkpoint found in /mnt/personal/zhudongy/datacomp_results/medium/low_inter_only/checkpoints.
2025-04-29,17:42:35 | INFO | Running in distributed mode with multiple processes. Device: cuda:0. Process (global: 0, local: 0), total 2.
2025-04-29,17:42:35 | INFO | Loaded ViT-B-32 model config.
2025-04-29,17:42:36 | INFO | Model:
2025-04-29,17:42:36 | INFO | CLIP(
  (visual): VisionTransformer(
    (patchnorm_pre_ln): Identity()
    (conv1): Conv2d(3, 768, kernel_size=(32, 32), stride=(32, 32), bias=False)
    (patch_dropout): Identity()
    (ln_pre): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (transformer): Transformer(
      (resblocks): ModuleList(
        (0-11): 12 x ResidualAttentionBlock(
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          )
          (ls_1): Identity()
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
            (gelu): GELU(approximate='none')
            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          )
          (ls_2): Identity()
        )
      )
    )
    (ln_post): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (transformer): Transformer(
    (resblocks): ModuleList(
      (0-11): 12 x ResidualAttentionBlock(
        (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
        )
        (ls_1): Identity()
        (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=512, out_features=2048, bias=True)
          (gelu): GELU(approximate='none')
          (c_proj): Linear(in_features=2048, out_features=512, bias=True)
        )
        (ls_2): Identity()
      )
    )
  )
  (token_embedding): Embedding(49408, 512)
  (ln_final): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
)
2025-04-29,17:42:36 | INFO | Params:
2025-04-29,17:42:36 | INFO | accum_freq: 1
2025-04-29,17:42:36 | INFO | aug_cfg: {}
2025-04-29,17:42:36 | INFO | batch_size: 2048
2025-04-29,17:42:36 | INFO | beta1: 0.9
2025-04-29,17:42:36 | INFO | beta2: 0.98
2025-04-29,17:42:36 | INFO | checkpoint_path: /mnt/personal/zhudongy/datacomp_results/medium/low_inter_only/checkpoints
2025-04-29,17:42:36 | INFO | coca_caption_loss_weight: 2.0
2025-04-29,17:42:36 | INFO | coca_contrastive_loss_weight: 1.0
2025-04-29,17:42:36 | INFO | copy_codebase: False
2025-04-29,17:42:36 | INFO | csv_caption_key: title
2025-04-29,17:42:36 | INFO | csv_img_key: filepath
2025-04-29,17:42:36 | INFO | csv_separator:
2025-04-29,17:42:36 | INFO | dataset_resampled: True
2025-04-29,17:42:36 | INFO | dataset_type: webdataset
2025-04-29,17:42:36 | INFO | ddp_static_graph: True
2025-04-29,17:42:36 | INFO | debug: False
2025-04-29,17:42:36 | INFO | delete_previous_checkpoint: False
2025-04-29,17:42:36 | INFO | device: cuda:0
2025-04-29,17:42:36 | INFO | dist_backend: nccl
2025-04-29,17:42:36 | INFO | dist_url: env://
2025-04-29,17:42:36 | INFO | distill: False
2025-04-29,17:42:36 | INFO | distill_model: None
2025-04-29,17:42:36 | INFO | distill_pretrained: None
2025-04-29,17:42:36 | INFO | distributed: True
2025-04-29,17:42:36 | INFO | epochs: 8
2025-04-29,17:42:36 | INFO | epochs_cooldown: None
2025-04-29,17:42:36 | INFO | eps: 1e-06
2025-04-29,17:42:36 | INFO | force_custom_text: False
2025-04-29,17:42:36 | INFO | force_image_size: None
2025-04-29,17:42:36 | INFO | force_patch_dropout: None
2025-04-29,17:42:36 | INFO | force_quick_gelu: False
2025-04-29,17:42:36 | INFO | gather_with_grad: True
2025-04-29,17:42:36 | INFO | grad_checkpointing: True
2025-04-29,17:42:36 | INFO | grad_clip_norm: None
2025-04-29,17:42:36 | INFO | horovod: False
2025-04-29,17:42:36 | INFO | image_mean: None
2025-04-29,17:42:36 | INFO | image_std: None
2025-04-29,17:42:36 | INFO | imagenet_v2: None
2025-04-29,17:42:36 | INFO | imagenet_val: None
2025-04-29,17:42:36 | INFO | local_loss: True
2025-04-29,17:42:36 | INFO | local_rank: 0
2025-04-29,17:42:36 | INFO | lock_image: False
2025-04-29,17:42:36 | INFO | lock_image_freeze_bn_stats: False
2025-04-29,17:42:36 | INFO | lock_image_unlocked_groups: 0
2025-04-29,17:42:36 | INFO | lock_text: False
2025-04-29,17:42:36 | INFO | lock_text_freeze_layer_norm: False
2025-04-29,17:42:36 | INFO | lock_text_unlocked_layers: 0
2025-04-29,17:42:36 | INFO | log_every_n_steps: 100
2025-04-29,17:42:36 | INFO | log_level: 20
2025-04-29,17:42:36 | INFO | log_local: False
2025-04-29,17:42:36 | INFO | log_path: /mnt/personal/zhudongy/datacomp_results/medium/low_inter_only/out.log
2025-04-29,17:42:36 | INFO | logs: /mnt/personal/zhudongy/datacomp_results/medium
2025-04-29,17:42:36 | INFO | lr: 0.0005
2025-04-29,17:42:36 | INFO | lr_cooldown_end: 0.0
2025-04-29,17:42:36 | INFO | lr_cooldown_power: 1.0
2025-04-29,17:42:36 | INFO | lr_scheduler: cosine
2025-04-29,17:42:36 | INFO | model: ViT-B-32
2025-04-29,17:42:36 | INFO | name: low_inter_only
2025-04-29,17:42:36 | INFO | no_set_device_rank: False
2025-04-29,17:42:36 | INFO | precision: amp_bfloat16
2025-04-29,17:42:36 | INFO | pretrained:
2025-04-29,17:42:36 | INFO | pretrained_image: False
2025-04-29,17:42:36 | INFO | rank: 0
2025-04-29,17:42:36 | INFO | remote_sync: None
2025-04-29,17:42:36 | INFO | remote_sync_frequency: 300
2025-04-29,17:42:36 | INFO | remote_sync_protocol: s3
2025-04-29,17:42:36 | INFO | report_to:
2025-04-29,17:42:36 | INFO | resume: None
2025-04-29,17:42:36 | INFO | save_frequency: 0
2025-04-29,17:42:36 | INFO | save_most_recent: True
2025-04-29,17:42:36 | INFO | seed: 0
2025-04-29,17:42:36 | INFO | skip_scheduler: False
2025-04-29,17:42:36 | INFO | tensorboard: False
2025-04-29,17:42:36 | INFO | tensorboard_path:
2025-04-29,17:42:36 | INFO | torchscript: False
2025-04-29,17:42:36 | INFO | trace: False
2025-04-29,17:42:36 | INFO | train_data: /mnt/personal/zhudongy/datacomp-medium/shards/0000{0000..6126}.tar
2025-04-29,17:42:36 | INFO | train_data_upsampling_factors: None
2025-04-29,17:42:36 | INFO | train_num_samples: 16000000
2025-04-29,17:42:36 | INFO | use_bn_sync: False
2025-04-29,17:42:36 | INFO | val_data: None
2025-04-29,17:42:36 | INFO | val_frequency: 1
2025-04-29,17:42:36 | INFO | val_num_samples: None
2025-04-29,17:42:36 | INFO | wandb: False
2025-04-29,17:42:36 | INFO | wandb_notes:
2025-04-29,17:42:36 | INFO | wandb_project_name: open-clip
2025-04-29,17:42:36 | INFO | warmup: 500
2025-04-29,17:42:36 | INFO | wd: 0.2
2025-04-29,17:42:36 | INFO | workers: 16
2025-04-29,17:42:36 | INFO | world_size: 2
2025-04-29,17:42:36 | INFO | zeroshot_frequency: 2
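(Editor's note, not part of the original log: the LR values logged below follow from the parameters above — lr 0.0005, warmup 500, lr_scheduler cosine, and 8 * 16,000,000 / 4096 = 31,250 total optimizer steps. A minimal sketch of open_clip-style linear warmup plus cosine decay, with the `cosine_lr` helper name being my own, reproduces them:)

```python
import math

def cosine_lr(step, base_lr=5e-4, warmup=500, total_steps=8 * 16_000_000 // 4096):
    """Linear warmup for `warmup` steps, then cosine decay to zero."""
    if step < warmup:
        # warmup counts from 1, so step 100 gives 101/500 of the base LR
        return base_lr * (step + 1) / warmup
    progress = (step - warmup) / max(1, total_steps - warmup)
    return 0.5 * base_lr * (1 + math.cos(math.pi * progress))

print(f"{cosine_lr(100):.6f}")  # 0.000101, matching the second logged step below
```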
2025-04-29,17:42:36 | INFO | Start epoch 0
2025-04-29,17:43:01 | INFO | Train Epoch: 0 [ 4096/16056320 (0%)] Data (t): 21.925 Batch (t): 24.947, 164.187/s, 82.0936/s/gpu LR: 0.000001 Logit Scale: 14.286 Contrastive_loss: 8.3837 (8.3837) Loss: 8.3837 (8.3837)
2025-04-29,17:43:04 | INFO | Reducer buckets have been rebuilt in this iteration.
2025-04-29,17:47:03 | INFO | Train Epoch: 0 [ 413696/16056320 (3%)] Data (t): 0.367 Batch (t): 2.418, 1701.28/s, 850.642/s/gpu LR: 0.000101 Logit Scale: 14.261 Contrastive_loss: 8.1890 (8.2863) Loss: 8.1890 (8.2863)
2025-04-29,17:51:07 | INFO | Train Epoch: 0 [ 823296/16056320 (5%)] Data (t): 0.400 Batch (t): 2.442, 1697.43/s, 848.716/s/gpu LR: 0.000201 Logit Scale: 14.237 Contrastive_loss: 8.0558 (8.2095) Loss: 8.0558 (8.2095)
2025-04-29,17:55:15 | INFO | Train Epoch: 0 [ 1232896/16056320 (8%)] Data (t): 0.436 Batch (t): 2.479, 1693.55/s, 846.774/s/gpu LR: 0.000301 Logit Scale: 14.210 Contrastive_loss: 7.9360 (8.1411) Loss: 7.9360 (8.1411)
2025-04-29,17:59:16 | INFO | Train Epoch: 0 [ 1642496/16056320 (10%)] Data (t): 0.372 Batch (t): 2.411, 1693.62/s, 846.809/s/gpu LR: 0.000401 Logit Scale: 14.185 Contrastive_loss: 7.8394 (8.0808) Loss: 7.8394 (8.0808)
2025-04-29,18:03:17 | INFO | Train Epoch: 0 [ 2052096/16056320 (13%)] Data (t): 0.364 Batch (t): 2.407, 1698.74/s, 849.369/s/gpu LR: 0.000500 Logit Scale: 14.182 Contrastive_loss: 7.7843 (8.0314) Loss: 7.7843 (8.0314)
2025-04-29,18:07:18 | INFO | Train Epoch: 0 [ 2461696/16056320 (15%)] Data (t): 0.369 Batch (t): 2.413, 1692.95/s, 846.474/s/gpu LR: 0.000500 Logit Scale: 14.201 Contrastive_loss: 7.6211 (7.9728) Loss: 7.6211 (7.9728)
2025-04-29,18:11:19 | INFO | Train Epoch: 0 [ 2871296/16056320 (18%)] Data (t): 0.365 Batch (t): 2.407, 1707.98/s, 853.988/s/gpu LR: 0.000500 Logit Scale: 14.260 Contrastive_loss: 7.5299 (7.9174) Loss: 7.5299 (7.9174)
2025-04-29,18:15:20 | INFO | Train Epoch: 0 [ 3280896/16056320 (20%)] Data (t): 0.369 Batch (t): 2.410, 1690.42/s, 845.211/s/gpu LR: 0.000500 Logit Scale: 14.340 Contrastive_loss: 7.4407 (7.8644) Loss: 7.4407 (7.8644)
2025-04-29,18:19:24 | INFO | Train Epoch: 0 [ 3690496/16056320 (23%)] Data (t): 0.393 Batch (t): 2.436, 1701.12/s, 850.559/s/gpu LR: 0.000500 Logit Scale: 14.441 Contrastive_loss: 7.4649 (7.8245) Loss: 7.4649 (7.8245)
2025-04-29,18:23:25 | INFO | Train Epoch: 0 [ 4100096/16056320 (26%)] Data (t): 0.378 Batch (t): 2.417, 1712.22/s, 856.110/s/gpu LR: 0.000500 Logit Scale: 14.582 Contrastive_loss: 7.3469 (7.7811) Loss: 7.3469 (7.7811)
2025-04-29,18:27:27 | INFO | Train Epoch: 0 [ 4509696/16056320 (28%)] Data (t): 0.372 Batch (t): 2.413, 1699.89/s, 849.943/s/gpu LR: 0.000500 Logit Scale: 14.727 Contrastive_loss: 7.2976 (7.7408) Loss: 7.2976 (7.7408)
2025-04-29,18:31:29 | INFO | Train Epoch: 0 [ 4919296/16056320 (31%)] Data (t): 0.378 Batch (t): 2.418, 1680.76/s, 840.382/s/gpu LR: 0.000499 Logit Scale: 14.942 Contrastive_loss: 7.2187 (7.7006) Loss: 7.2187 (7.7006)
2025-04-29,18:35:30 | INFO | Train Epoch: 0 [ 5328896/16056320 (33%)] Data (t): 0.375 Batch (t): 2.417, 1677.33/s, 838.665/s/gpu LR: 0.000499 Logit Scale: 15.128 Contrastive_loss: 7.1773 (7.6632) Loss: 7.1773 (7.6632)
2025-04-29,18:39:32 | INFO | Train Epoch: 0 [ 5738496/16056320 (36%)] Data (t): 0.373 Batch (t): 2.416, 1698.13/s, 849.067/s/gpu LR: 0.000499 Logit Scale: 15.344 Contrastive_loss: 7.0983 (7.6256) Loss: 7.0983 (7.6256)
2025-04-29,18:43:35 | INFO | Train Epoch: 0 [ 6148096/16056320 (38%)] Data (t): 0.390 Batch (t): 2.433, 1681.07/s, 840.537/s/gpu LR: 0.000499 Logit Scale: 15.595 Contrastive_loss: 7.0212 (7.5878) Loss: 7.0212 (7.5878)
2025-04-29,18:47:38 | INFO | Train Epoch: 0 [ 6557696/16056320 (41%)] Data (t): 0.390 Batch (t): 2.430, 1683.60/s, 841.802/s/gpu LR: 0.000498 Logit Scale: 15.826 Contrastive_loss: 6.9587 (7.5508) Loss: 6.9587 (7.5508)
2025-04-29,18:51:40 | INFO | Train Epoch: 0 [ 6967296/16056320 (43%)] Data (t): 0.378 Batch (t): 2.418, 1698.91/s, 849.455/s/gpu LR: 0.000498 Logit Scale: 16.115 Contrastive_loss: 6.8671 (7.5128) Loss: 6.8671 (7.5128)
2025-04-29,18:55:42 | INFO | Train Epoch: 0 [ 7376896/16056320 (46%)] Data (t): 0.380 Batch (t): 2.423, 1679.70/s, 839.848/s/gpu LR: 0.000498 Logit Scale: 16.413 Contrastive_loss: 6.9072 (7.4809) Loss: 6.9072 (7.4809)
2025-04-29,18:59:46 | INFO | Train Epoch: 0 [ 7786496/16056320 (48%)] Data (t): 0.390 Batch (t): 2.432, 1697.74/s, 848.868/s/gpu LR: 0.000497 Logit Scale: 16.734 Contrastive_loss: 6.8222 (7.4480) Loss: 6.8222 (7.4480)
2025-04-29,19:03:50 | INFO | Train Epoch: 0 [ 8196096/16056320 (51%)] Data (t): 0.392 Batch (t): 2.440, 1684.68/s, 842.340/s/gpu LR: 0.000497 Logit Scale: 17.017 Contrastive_loss: 6.7487 (7.4147) Loss: 6.7487 (7.4147)
2025-04-29,19:07:55 | INFO | Train Epoch: 0 [ 8605696/16056320 (54%)] Data (t): 0.414 Batch (t): 2.455, 1699.34/s, 849.669/s/gpu LR: 0.000497 Logit Scale: 17.283 Contrastive_loss: 6.6749 (7.3811) Loss: 6.6749 (7.3811)
2025-04-29,19:11:58 | INFO | Train Epoch: 0 [ 9015296/16056320 (56%)] Data (t): 0.384 Batch (t): 2.425, 1704.23/s, 852.116/s/gpu LR: 0.000496 Logit Scale: 17.559 Contrastive_loss: 6.5998 (7.3471) Loss: 6.5998 (7.3471)
2025-04-29,19:16:01 | INFO | Train Epoch: 0 [ 9424896/16056320 (59%)] Data (t): 0.387 Batch (t): 2.438, 1666.65/s, 833.323/s/gpu LR: 0.000496 Logit Scale: 17.832 Contrastive_loss: 6.5839 (7.3153) Loss: 6.5839 (7.3153)
2025-04-29,19:20:05 | INFO | Train Epoch: 0 [ 9834496/16056320 (61%)] Data (t): 0.393 Batch (t): 2.432, 1699.78/s, 849.890/s/gpu LR: 0.000495 Logit Scale: 18.053 Contrastive_loss: 6.4805 (7.2819) Loss: 6.4805 (7.2819)
2025-04-29,19:24:07 | INFO | Train Epoch: 0 [10244096/16056320 (64%)] Data (t): 0.382 Batch (t): 2.425, 1685.49/s, 842.747/s/gpu LR: 0.000495 Logit Scale: 18.360 Contrastive_loss: 6.5788 (7.2549) Loss: 6.5788 (7.2549)
2025-04-29,19:28:10 | INFO | Train Epoch: 0 [10653696/16056320 (66%)] Data (t): 0.391 Batch (t): 2.431, 1693.68/s, 846.839/s/gpu LR: 0.000494 Logit Scale: 18.648 Contrastive_loss: 6.4210 (7.2240) Loss: 6.4210 (7.2240)
2025-04-29,19:32:14 | INFO | Train Epoch: 0 [11063296/16056320 (69%)] Data (t): 0.396 Batch (t): 2.438, 1692.32/s, 846.162/s/gpu LR: 0.000494 Logit Scale: 18.913 Contrastive_loss: 6.4979 (7.1981) Loss: 6.4979 (7.1981)
2025-04-29,19:36:17 | INFO | Train Epoch: 0 [11472896/16056320 (71%)] Data (t): 0.390 Batch (t): 2.429, 1678.49/s, 839.246/s/gpu LR: 0.000493 Logit Scale: 19.224 Contrastive_loss: 6.4005 (7.1706) Loss: 6.4005 (7.1706)
2025-04-29,19:40:20 | INFO | Train Epoch: 0 [11882496/16056320 (74%)] Data (t): 0.387 Batch (t): 2.430, 1678.26/s, 839.130/s/gpu LR: 0.000493 Logit Scale: 19.531 Contrastive_loss: 6.3682 (7.1438) Loss: 6.3682 (7.1438)
2025-04-29,19:44:23 | INFO | Train Epoch: 0 [12292096/16056320 (77%)] Data (t): 0.396 Batch (t): 2.436, 1677.17/s, 838.587/s/gpu LR: 0.000492 Logit Scale: 19.819 Contrastive_loss: 6.1071 (7.1104) Loss: 6.1071 (7.1104)
2025-04-29,19:48:27 | INFO | Train Epoch: 0 [12701696/16056320 (79%)] Data (t): 0.398 Batch (t): 2.438, 1696.85/s, 848.426/s/gpu LR: 0.000491 Logit Scale: 20.042 Contrastive_loss: 6.3618 (7.0870) Loss: 6.3618 (7.0870)
2025-04-29,19:52:31 | INFO | Train Epoch: 0 [13111296/16056320 (82%)] Data (t): 0.398 Batch (t): 2.439, 1606.99/s, 803.494/s/gpu LR: 0.000491 Logit Scale: 20.290 Contrastive_loss: 6.2285 (7.0610) Loss: 6.2285 (7.0610)
2025-04-29,19:56:35 | INFO | Train Epoch: 0 [13520896/16056320 (84%)] Data (t): 0.402 Batch (t): 2.443, 1668.95/s, 834.473/s/gpu LR: 0.000490 Logit Scale: 20.522 Contrastive_loss: 6.0085 (7.0300) Loss: 6.0085 (7.0300)
2025-04-29,20:00:38 | INFO | Train Epoch: 0 [13930496/16056320 (87%)] Data (t): 0.389 Batch (t): 2.429, 1705.41/s, 852.704/s/gpu LR: 0.000489 Logit Scale: 20.775 Contrastive_loss: 6.2236 (7.0070) Loss: 6.2236 (7.0070)
2025-04-29,20:04:42 | INFO | Train Epoch: 0 [14340096/16056320 (89%)] Data (t): 0.397 Batch (t): 2.439, 1616.17/s, 808.085/s/gpu LR: 0.000488 Logit Scale: 21.015 Contrastive_loss: 6.1521 (6.9832) Loss: 6.1521 (6.9832)
2025-04-29,20:08:46 | INFO | Train Epoch: 0 [14749696/16056320 (92%)] Data (t): 0.396 Batch (t): 2.438, 1700.78/s, 850.392/s/gpu LR: 0.000488 Logit Scale: 21.230 Contrastive_loss: 6.1918 (6.9618) Loss: 6.1918 (6.9618)
2025-04-29,20:12:49 | INFO | Train Epoch: 0 [15159296/16056320 (94%)] Data (t): 0.390 Batch (t): 2.432, 1695.14/s, 847.571/s/gpu LR: 0.000487 Logit Scale: 21.468 Contrastive_loss: 6.1201 (6.9397) Loss: 6.1201 (6.9397)
2025-04-29,20:16:52 | INFO | Train Epoch: 0 [15568896/16056320 (97%)] Data (t): 0.390 Batch (t): 2.432, 1704.81/s, 852.404/s/gpu LR: 0.000486 Logit Scale: 21.638 Contrastive_loss: 6.0887 (6.9179) Loss: 6.0887 (6.9179)
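(Editor's note, not part of the original log: the first logged values are a useful sanity check. A random-initialized InfoNCE loss should sit near ln(N) for a global batch of N pairs, and CLIP initializes its logit scale to 1/0.07; a sketch under those assumptions:)

```python
import math

global_batch = 2048 * 2           # batch_size x world_size from the params above
init_loss = math.log(global_batch)  # chance-level contrastive loss ~ ln(N)
init_scale = 1 / 0.07               # CLIP's standard logit-scale (temperature) init
print(round(init_loss, 3), round(init_scale, 3))  # 8.318 14.286
```

This lines up with the run: the first logged Contrastive_loss is 8.3837 (just above ln(4096) ≈ 8.318, as expected at initialization) and the first Logit Scale is exactly 14.286.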