added a couple of models and their checkpoints (commit 3e7991a)
2024-09-07,04:53:20 | INFO | No latest resume checkpoint found in /home/breaking_0.5_trained/50_most_difficult/checkpoints.
2024-09-07,04:53:22 | INFO | Running in distributed mode with multiple processes. Device: cuda:0. Process (global: 0, local: 0), total 2.
2024-09-07,04:53:22 | INFO | Loaded ViT-B-32 model config.
2024-09-07,04:53:23 | INFO | Model:
2024-09-07,04:53:23 | INFO | CLIP(
(visual): VisionTransformer(
(patchnorm_pre_ln): Identity()
(conv1): Conv2d(3, 768, kernel_size=(32, 32), stride=(32, 32), bias=False)
(patch_dropout): Identity()
(ln_pre): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(transformer): Transformer(
(resblocks): ModuleList(
(0): ResidualAttentionBlock(
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
)
(ls_1): Identity()
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
(gelu): GELU(approximate='none')
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
)
(ls_2): Identity()
)
(1): ResidualAttentionBlock(
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
)
(ls_1): Identity()
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
(gelu): GELU(approximate='none')
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
)
(ls_2): Identity()
)
(2): ResidualAttentionBlock(
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
)
(ls_1): Identity()
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
(gelu): GELU(approximate='none')
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
)
(ls_2): Identity()
)
(3): ResidualAttentionBlock(
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
)
(ls_1): Identity()
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
(gelu): GELU(approximate='none')
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
)
(ls_2): Identity()
)
(4): ResidualAttentionBlock(
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
)
(ls_1): Identity()
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
(gelu): GELU(approximate='none')
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
)
(ls_2): Identity()
)
(5): ResidualAttentionBlock(
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
)
(ls_1): Identity()
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
(gelu): GELU(approximate='none')
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
)
(ls_2): Identity()
)
(6): ResidualAttentionBlock(
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
)
(ls_1): Identity()
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
(gelu): GELU(approximate='none')
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
)
(ls_2): Identity()
)
(7): ResidualAttentionBlock(
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
)
(ls_1): Identity()
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
(gelu): GELU(approximate='none')
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
)
(ls_2): Identity()
)
(8): ResidualAttentionBlock(
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
)
(ls_1): Identity()
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
(gelu): GELU(approximate='none')
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
)
(ls_2): Identity()
)
(9): ResidualAttentionBlock(
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
)
(ls_1): Identity()
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
(gelu): GELU(approximate='none')
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
)
(ls_2): Identity()
)
(10): ResidualAttentionBlock(
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
)
(ls_1): Identity()
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
(gelu): GELU(approximate='none')
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
)
(ls_2): Identity()
)
(11): ResidualAttentionBlock(
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
)
(ls_1): Identity()
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
(gelu): GELU(approximate='none')
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
)
(ls_2): Identity()
)
)
)
(ln_post): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
(transformer): Transformer(
(resblocks): ModuleList(
(0): ResidualAttentionBlock(
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
)
(ls_1): Identity()
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=512, out_features=2048, bias=True)
(gelu): GELU(approximate='none')
(c_proj): Linear(in_features=2048, out_features=512, bias=True)
)
(ls_2): Identity()
)
(1): ResidualAttentionBlock(
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
)
(ls_1): Identity()
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=512, out_features=2048, bias=True)
(gelu): GELU(approximate='none')
(c_proj): Linear(in_features=2048, out_features=512, bias=True)
)
(ls_2): Identity()
)
(2): ResidualAttentionBlock(
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
)
(ls_1): Identity()
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=512, out_features=2048, bias=True)
(gelu): GELU(approximate='none')
(c_proj): Linear(in_features=2048, out_features=512, bias=True)
)
(ls_2): Identity()
)
(3): ResidualAttentionBlock(
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
)
(ls_1): Identity()
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=512, out_features=2048, bias=True)
(gelu): GELU(approximate='none')
(c_proj): Linear(in_features=2048, out_features=512, bias=True)
)
(ls_2): Identity()
)
(4): ResidualAttentionBlock(
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
)
(ls_1): Identity()
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=512, out_features=2048, bias=True)
(gelu): GELU(approximate='none')
(c_proj): Linear(in_features=2048, out_features=512, bias=True)
)
(ls_2): Identity()
)
(5): ResidualAttentionBlock(
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
)
(ls_1): Identity()
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=512, out_features=2048, bias=True)
(gelu): GELU(approximate='none')
(c_proj): Linear(in_features=2048, out_features=512, bias=True)
)
(ls_2): Identity()
)
(6): ResidualAttentionBlock(
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
)
(ls_1): Identity()
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=512, out_features=2048, bias=True)
(gelu): GELU(approximate='none')
(c_proj): Linear(in_features=2048, out_features=512, bias=True)
)
(ls_2): Identity()
)
(7): ResidualAttentionBlock(
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
)
(ls_1): Identity()
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=512, out_features=2048, bias=True)
(gelu): GELU(approximate='none')
(c_proj): Linear(in_features=2048, out_features=512, bias=True)
)
(ls_2): Identity()
)
(8): ResidualAttentionBlock(
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
)
(ls_1): Identity()
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=512, out_features=2048, bias=True)
(gelu): GELU(approximate='none')
(c_proj): Linear(in_features=2048, out_features=512, bias=True)
)
(ls_2): Identity()
)
(9): ResidualAttentionBlock(
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
)
(ls_1): Identity()
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=512, out_features=2048, bias=True)
(gelu): GELU(approximate='none')
(c_proj): Linear(in_features=2048, out_features=512, bias=True)
)
(ls_2): Identity()
)
(10): ResidualAttentionBlock(
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
)
(ls_1): Identity()
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=512, out_features=2048, bias=True)
(gelu): GELU(approximate='none')
(c_proj): Linear(in_features=2048, out_features=512, bias=True)
)
(ls_2): Identity()
)
(11): ResidualAttentionBlock(
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
)
(ls_1): Identity()
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=512, out_features=2048, bias=True)
(gelu): GELU(approximate='none')
(c_proj): Linear(in_features=2048, out_features=512, bias=True)
)
(ls_2): Identity()
)
)
)
(token_embedding): Embedding(49408, 512)
(ln_final): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
)
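As a quick sanity check on the module shapes printed above, the MLP in each visual ResidualAttentionBlock expands 768 → 3072 → 768, and its parameter count follows directly from the Linear shapes (a small illustrative helper, not part of the training code):

```python
def linear_params(in_features, out_features, bias=True):
    # weight matrix entries plus the optional bias vector
    return in_features * out_features + (out_features if bias else 0)

# c_fc: 768 -> 3072, c_proj: 3072 -> 768, both with bias=True as printed above
visual_mlp_params = linear_params(768, 3072) + linear_params(3072, 768)
print(visual_mlp_params)  # 4722432 parameters per visual block's MLP
```

The text tower's blocks follow the same pattern at width 512 (512 → 2048 → 512).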
2024-09-07,04:53:23 | INFO | Params:
2024-09-07,04:53:23 | INFO | accum_freq: 1
2024-09-07,04:53:23 | INFO | aug_cfg: {}
2024-09-07,04:53:23 | INFO | batch_size: 2048
2024-09-07,04:53:23 | INFO | beta1: 0.9
2024-09-07,04:53:23 | INFO | beta2: 0.98
2024-09-07,04:53:23 | INFO | checkpoint_path: /home/breaking_0.5_trained/50_most_difficult/checkpoints
2024-09-07,04:53:23 | INFO | coca_caption_loss_weight: 2.0
2024-09-07,04:53:23 | INFO | coca_contrastive_loss_weight: 1.0
2024-09-07,04:53:23 | INFO | copy_codebase: False
2024-09-07,04:53:23 | INFO | csv_caption_key: title
2024-09-07,04:53:23 | INFO | csv_img_key: filepath
2024-09-07,04:53:23 | INFO | csv_separator:
2024-09-07,04:53:23 | INFO | dataset_resampled: True
2024-09-07,04:53:23 | INFO | dataset_type: webdataset
2024-09-07,04:53:23 | INFO | ddp_static_graph: True
2024-09-07,04:53:23 | INFO | debug: False
2024-09-07,04:53:23 | INFO | delete_previous_checkpoint: False
2024-09-07,04:53:23 | INFO | device: cuda:0
2024-09-07,04:53:23 | INFO | dist_backend: nccl
2024-09-07,04:53:23 | INFO | dist_url: env://
2024-09-07,04:53:23 | INFO | distill: False
2024-09-07,04:53:23 | INFO | distill_model: None
2024-09-07,04:53:23 | INFO | distill_pretrained: None
2024-09-07,04:53:23 | INFO | distributed: True
2024-09-07,04:53:23 | INFO | epochs: 5
2024-09-07,04:53:23 | INFO | epochs_cooldown: None
2024-09-07,04:53:23 | INFO | eps: 1e-06
2024-09-07,04:53:23 | INFO | force_custom_text: False
2024-09-07,04:53:23 | INFO | force_image_size: None
2024-09-07,04:53:23 | INFO | force_patch_dropout: None
2024-09-07,04:53:23 | INFO | force_quick_gelu: False
2024-09-07,04:53:23 | INFO | gather_with_grad: True
2024-09-07,04:53:23 | INFO | grad_checkpointing: True
2024-09-07,04:53:23 | INFO | grad_clip_norm: None
2024-09-07,04:53:23 | INFO | horovod: False
2024-09-07,04:53:23 | INFO | image_mean: None
2024-09-07,04:53:23 | INFO | image_std: None
2024-09-07,04:53:23 | INFO | imagenet_v2: None
2024-09-07,04:53:23 | INFO | imagenet_val: None
2024-09-07,04:53:23 | INFO | local_loss: True
2024-09-07,04:53:23 | INFO | local_rank: 0
2024-09-07,04:53:23 | INFO | lock_image: False
2024-09-07,04:53:23 | INFO | lock_image_freeze_bn_stats: False
2024-09-07,04:53:23 | INFO | lock_image_unlocked_groups: 0
2024-09-07,04:53:23 | INFO | lock_text: False
2024-09-07,04:53:23 | INFO | lock_text_freeze_layer_norm: False
2024-09-07,04:53:23 | INFO | lock_text_unlocked_layers: 0
2024-09-07,04:53:23 | INFO | log_every_n_steps: 100
2024-09-07,04:53:23 | INFO | log_level: 20
2024-09-07,04:53:23 | INFO | log_local: False
2024-09-07,04:53:23 | INFO | log_path: /home/breaking_0.5_trained/50_most_difficult/out.log
2024-09-07,04:53:23 | INFO | logs: /home/breaking_0.5_trained
2024-09-07,04:53:23 | INFO | lr: 0.0005
2024-09-07,04:53:23 | INFO | lr_cooldown_end: 0.0
2024-09-07,04:53:23 | INFO | lr_cooldown_power: 1.0
2024-09-07,04:53:23 | INFO | lr_scheduler: cosine
2024-09-07,04:53:23 | INFO | model: ViT-B-32
2024-09-07,04:53:23 | INFO | name: 50_most_difficult
2024-09-07,04:53:23 | INFO | no_set_device_rank: False
2024-09-07,04:53:23 | INFO | precision: amp
2024-09-07,04:53:23 | INFO | pretrained:
2024-09-07,04:53:23 | INFO | pretrained_image: False
2024-09-07,04:53:23 | INFO | rank: 0
2024-09-07,04:53:23 | INFO | remote_sync: None
2024-09-07,04:53:23 | INFO | remote_sync_frequency: 300
2024-09-07,04:53:23 | INFO | remote_sync_protocol: s3
2024-09-07,04:53:23 | INFO | report_to: wandb
2024-09-07,04:53:23 | INFO | resume: None
2024-09-07,04:53:23 | INFO | save_frequency: 0
2024-09-07,04:53:23 | INFO | save_most_recent: True
2024-09-07,04:53:23 | INFO | seed: 0
2024-09-07,04:53:23 | INFO | skip_scheduler: False
2024-09-07,04:53:23 | INFO | tensorboard: False
2024-09-07,04:53:23 | INFO | tensorboard_path:
2024-09-07,04:53:23 | INFO | torchscript: False
2024-09-07,04:53:23 | INFO | trace: False
2024-09-07,04:53:23 | INFO | train_data: /home/breaking_0.5/{00000000..00000531}.tar
2024-09-07,04:53:23 | INFO | train_data_upsampling_factors: None
2024-09-07,04:53:23 | INFO | train_num_samples: 2560000
2024-09-07,04:53:23 | INFO | use_bn_sync: False
2024-09-07,04:53:23 | INFO | val_data: None
2024-09-07,04:53:23 | INFO | val_frequency: 1
2024-09-07,04:53:23 | INFO | val_num_samples: None
2024-09-07,04:53:23 | INFO | wandb: True
2024-09-07,04:53:23 | INFO | wandb_notes:
2024-09-07,04:53:23 | INFO | wandb_project_name: clip_text_hq_clusters
2024-09-07,04:53:23 | INFO | warmup: 500
2024-09-07,04:53:23 | INFO | wd: 0.2
2024-09-07,04:53:23 | INFO | workers: 4
2024-09-07,04:53:23 | INFO | world_size: 2
2024-09-07,04:53:23 | INFO | zeroshot_frequency: 2
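The `lr: 0.0005`, `warmup: 500`, and `lr_scheduler: cosine` settings above imply a linear-warmup-then-cosine schedule. A minimal sketch (assuming open_clip's convention of scaling by `(step + 1) / warmup` during warmup, and a total of 5 epochs × 2572288 samples / 4096 global batch ≈ 3140 steps) reproduces the LR column in the training lines below, e.g. 0.000001 at step 0 and 0.000500 right after warmup:

```python
import math

def cosine_lr(step, base_lr=0.0005, warmup=500, total_steps=3140):
    """Linear warmup to base_lr, then cosine decay to 0 over the remaining steps."""
    if step < warmup:
        return base_lr * (step + 1) / warmup
    progress = (step - warmup) / (total_steps - warmup)
    return 0.5 * (1 + math.cos(math.pi * progress)) * base_lr

print(cosine_lr(0))    # ~0.000001, matching the first logged LR
print(cosine_lr(500))  # 0.000500, peak LR at the end of warmup
```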
2024-09-07,04:53:30 | INFO | Start epoch 0
2024-09-07,04:53:47 | INFO | Train Epoch: 0 [ 4096/2572288 (0%)] Data (t): 12.292 Batch (t): 17.044, 240.320/s, 120.160/s/gpu LR: 0.000001 Logit Scale: 14.286 Contrastive_loss: 8.3793 (8.3793) Loss: 8.3793 (8.3793)
2024-09-07,04:53:50 | INFO | Reducer buckets have been rebuilt in this iteration.
2024-09-07,04:58:08 | INFO | Train Epoch: 0 [ 413696/2572288 (16%)] Data (t): 0.555 Batch (t): 2.608, 1570.24/s, 785.122/s/gpu LR: 0.000101 Logit Scale: 14.263 Contrastive_loss: 8.2098 (8.2945) Loss: 8.2098 (8.2945)
2024-09-07,05:02:30 | INFO | Train Epoch: 0 [ 823296/2572288 (32%)] Data (t): 0.566 Batch (t): 2.619, 1567.59/s, 783.795/s/gpu LR: 0.000201 Logit Scale: 14.236 Contrastive_loss: 8.1342 (8.2411) Loss: 8.1342 (8.2411)
2024-09-07,05:06:52 | INFO | Train Epoch: 0 [1232896/2572288 (48%)] Data (t): 0.570 Batch (t): 2.621, 1561.56/s, 780.780/s/gpu LR: 0.000301 Logit Scale: 14.197 Contrastive_loss: 7.9712 (8.1736) Loss: 7.9712 (8.1736)
2024-09-07,05:11:14 | INFO | Train Epoch: 0 [1642496/2572288 (64%)] Data (t): 0.572 Batch (t): 2.623, 1564.54/s, 782.270/s/gpu LR: 0.000401 Logit Scale: 14.159 Contrastive_loss: 7.8644 (8.1118) Loss: 7.8644 (8.1118)
2024-09-07,05:15:36 | INFO | Train Epoch: 0 [2052096/2572288 (80%)] Data (t): 0.569 Batch (t): 2.619, 1566.43/s, 783.215/s/gpu LR: 0.000500 Logit Scale: 14.119 Contrastive_loss: 7.8428 (8.0670) Loss: 7.8428 (8.0670)
2024-09-07,05:19:58 | INFO | Train Epoch: 0 [2461696/2572288 (96%)] Data (t): 0.568 Batch (t): 2.620, 1557.15/s, 778.573/s/gpu LR: 0.000498 Logit Scale: 14.100 Contrastive_loss: 7.7296 (8.0188) Loss: 7.7296 (8.0188)
2024-09-07,05:21:08 | INFO | Train Epoch: 0 [2572288/2572288 (100%)] Data (t): 0.565 Batch (t): 2.617, 1576.44/s, 788.218/s/gpu LR: 0.000497 Logit Scale: 14.099 Contrastive_loss: 7.7862 (7.9897) Loss: 7.7862 (7.9897)
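Two values in the first logged step can be cross-checked from first principles (assuming the standard CLIP logit-scale init of ln(1/0.07), and an effective contrastive batch of `batch_size × world_size` given `local_loss: True` with `gather_with_grad: True`):

```python
import math

# CLIP initializes logit_scale to ln(1/0.07); the log prints its exponential.
init_logit_scale = math.exp(math.log(1 / 0.07))  # ~14.286, as in the first step

# A randomly initialized model should start near chance-level InfoNCE loss,
# i.e. ln(effective batch size); the first logged loss of 8.3793 sits just above it.
chance_loss = math.log(2048 * 2)  # ln(4096) ~ 8.318
```

The loss column then decaying from ~8.3 toward ~6 over five epochs is the expected signature of learning beyond chance.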
2024-09-07,05:21:11 | INFO | Start epoch 1
2024-09-07,05:21:23 | INFO | Train Epoch: 1 [ 4096/2572288 (0%)] Data (t): 9.800 Batch (t): 11.848, 345.708/s, 172.854/s/gpu LR: 0.000497 Logit Scale: 14.099 Contrastive_loss: 7.7746 (7.7746) Loss: 7.7746 (7.7746)
2024-09-07,05:25:43 | INFO | Train Epoch: 1 [ 413696/2572288 (16%)] Data (t): 0.548 Batch (t): 2.607, 1561.69/s, 780.843/s/gpu LR: 0.000491 Logit Scale: 14.097 Contrastive_loss: 7.6379 (7.7063) Loss: 7.6379 (7.7063)
2024-09-07,05:30:05 | INFO | Train Epoch: 1 [ 823296/2572288 (32%)] Data (t): 0.566 Batch (t): 2.617, 1566.22/s, 783.108/s/gpu LR: 0.000481 Logit Scale: 14.123 Contrastive_loss: 7.5494 (7.6540) Loss: 7.5494 (7.6540)
2024-09-07,05:34:27 | INFO | Train Epoch: 1 [1232896/2572288 (48%)] Data (t): 0.564 Batch (t): 2.618, 1569.73/s, 784.866/s/gpu LR: 0.000468 Logit Scale: 14.166 Contrastive_loss: 7.4297 (7.5979) Loss: 7.4297 (7.5979)
2024-09-07,05:38:49 | INFO | Train Epoch: 1 [1642496/2572288 (64%)] Data (t): 0.564 Batch (t): 2.616, 1566.22/s, 783.112/s/gpu LR: 0.000452 Logit Scale: 14.227 Contrastive_loss: 7.2636 (7.5310) Loss: 7.2636 (7.5310)
2024-09-07,05:43:10 | INFO | Train Epoch: 1 [2052096/2572288 (80%)] Data (t): 0.565 Batch (t): 2.617, 1560.62/s, 780.309/s/gpu LR: 0.000433 Logit Scale: 14.298 Contrastive_loss: 7.1316 (7.4645) Loss: 7.1316 (7.4645)
2024-09-07,05:47:32 | INFO | Train Epoch: 1 [2461696/2572288 (96%)] Data (t): 0.571 Batch (t): 2.622, 1567.17/s, 783.585/s/gpu LR: 0.000412 Logit Scale: 14.398 Contrastive_loss: 7.2612 (7.4354) Loss: 7.2612 (7.4354)
2024-09-07,05:48:43 | INFO | Train Epoch: 1 [2572288/2572288 (100%)] Data (t): 0.560 Batch (t): 2.611, 1575.74/s, 787.872/s/gpu LR: 0.000406 Logit Scale: 14.425 Contrastive_loss: 7.1283 (7.3970) Loss: 7.1283 (7.3970)
2024-09-07,05:48:46 | INFO | Start epoch 2
2024-09-07,05:48:57 | INFO | Train Epoch: 2 [ 4096/2572288 (0%)] Data (t): 9.681 Batch (t): 11.731, 349.173/s, 174.586/s/gpu LR: 0.000405 Logit Scale: 14.426 Contrastive_loss: 7.0068 (7.0068) Loss: 7.0068 (7.0068)
2024-09-07,05:53:18 | INFO | Train Epoch: 2 [ 413696/2572288 (16%)] Data (t): 0.548 Batch (t): 2.607, 1567.72/s, 783.858/s/gpu LR: 0.000381 Logit Scale: 14.555 Contrastive_loss: 7.1138 (7.0603) Loss: 7.1138 (7.0603)
2024-09-07,05:57:40 | INFO | Train Epoch: 2 [ 823296/2572288 (32%)] Data (t): 0.565 Batch (t): 2.618, 1552.55/s, 776.277/s/gpu LR: 0.000355 Logit Scale: 14.689 Contrastive_loss: 6.8805 (7.0004) Loss: 6.8805 (7.0004)
2024-09-07,06:02:02 | INFO | Train Epoch: 2 [1232896/2572288 (48%)] Data (t): 0.564 Batch (t): 2.616, 1568.97/s, 784.485/s/gpu LR: 0.000327 Logit Scale: 14.818 Contrastive_loss: 6.9158 (6.9792) Loss: 6.9158 (6.9792)
2024-09-07,06:06:23 | INFO | Train Epoch: 2 [1642496/2572288 (64%)] Data (t): 0.565 Batch (t): 2.618, 1568.84/s, 784.420/s/gpu LR: 0.000298 Logit Scale: 14.968 Contrastive_loss: 6.8714 (6.9577) Loss: 6.8714 (6.9577)
2024-09-07,06:10:45 | INFO | Train Epoch: 2 [2052096/2572288 (80%)] Data (t): 0.566 Batch (t): 2.617, 1560.16/s, 780.081/s/gpu LR: 0.000269 Logit Scale: 15.097 Contrastive_loss: 6.7470 (6.9226) Loss: 6.7470 (6.9226)
2024-09-07,06:15:06 | INFO | Train Epoch: 2 [2461696/2572288 (96%)] Data (t): 0.563 Batch (t): 2.615, 1572.55/s, 786.275/s/gpu LR: 0.000239 Logit Scale: 15.244 Contrastive_loss: 6.7248 (6.8943) Loss: 6.7248 (6.8943)
2024-09-07,06:16:17 | INFO | Train Epoch: 2 [2572288/2572288 (100%)] Data (t): 0.557 Batch (t): 2.609, 1580.67/s, 790.337/s/gpu LR: 0.000231 Logit Scale: 15.290 Contrastive_loss: 6.6894 (6.8687) Loss: 6.6894 (6.8687)
2024-09-07,06:16:20 | INFO | Start epoch 3
2024-09-07,06:16:31 | INFO | Train Epoch: 3 [ 4096/2572288 (0%)] Data (t): 9.568 Batch (t): 11.615, 352.634/s, 176.317/s/gpu LR: 0.000231 Logit Scale: 15.292 Contrastive_loss: 6.5913 (6.5913) Loss: 6.5913 (6.5913)
2024-09-07,06:20:52 | INFO | Train Epoch: 3 [ 413696/2572288 (16%)] Data (t): 0.551 Batch (t): 2.609, 1564.82/s, 782.409/s/gpu LR: 0.000202 Logit Scale: 15.435 Contrastive_loss: 6.6123 (6.6018) Loss: 6.6123 (6.6018)
2024-09-07,06:25:14 | INFO | Train Epoch: 3 [ 823296/2572288 (32%)] Data (t): 0.559 Batch (t): 2.612, 1570.74/s, 785.370/s/gpu LR: 0.000173 Logit Scale: 15.541 Contrastive_loss: 6.5566 (6.5867) Loss: 6.5566 (6.5867)
2024-09-07,06:29:35 | INFO | Train Epoch: 3 [1232896/2572288 (48%)] Data (t): 0.561 Batch (t): 2.614, 1565.41/s, 782.705/s/gpu LR: 0.000145 Logit Scale: 15.648 Contrastive_loss: 6.1246 (6.4712) Loss: 6.1246 (6.4712)
2024-09-07,06:33:57 | INFO | Train Epoch: 3 [1642496/2572288 (64%)] Data (t): 0.564 Batch (t): 2.618, 1561.09/s, 780.547/s/gpu LR: 0.000119 Logit Scale: 15.762 Contrastive_loss: 6.1003 (6.3970) Loss: 6.1003 (6.3970)
2024-09-07,06:38:19 | INFO | Train Epoch: 3 [2052096/2572288 (80%)] Data (t): 0.563 Batch (t): 2.618, 1567.95/s, 783.976/s/gpu LR: 0.000095 Logit Scale: 15.849 Contrastive_loss: 6.1111 (6.3494) Loss: 6.1111 (6.3494)
2024-09-07,06:42:40 | INFO | Train Epoch: 3 [2461696/2572288 (96%)] Data (t): 0.564 Batch (t): 2.617, 1561.15/s, 780.577/s/gpu LR: 0.000072 Logit Scale: 15.918 Contrastive_loss: 6.1647 (6.3230) Loss: 6.1647 (6.3230)
2024-09-07,06:43:51 | INFO | Train Epoch: 3 [2572288/2572288 (100%)] Data (t): 0.564 Batch (t): 2.617, 1572.59/s, 786.296/s/gpu LR: 0.000067 Logit Scale: 15.938 Contrastive_loss: 6.2637 (6.3156) Loss: 6.2637 (6.3156)
2024-09-07,06:43:54 | INFO | Start epoch 4
2024-09-07,06:44:05 | INFO | Train Epoch: 4 [ 4096/2572288 (0%)] Data (t): 9.638 Batch (t): 11.688, 350.454/s, 175.227/s/gpu LR: 0.000067 Logit Scale: 15.939 Contrastive_loss: 6.2286 (6.2286) Loss: 6.2286 (6.2286)
2024-09-07,06:48:26 | INFO | Train Epoch: 4 [ 413696/2572288 (16%)] Data (t): 0.548 Batch (t): 2.610, 1567.01/s, 783.507/s/gpu LR: 0.000048 Logit Scale: 15.991 Contrastive_loss: 6.1032 (6.1659) Loss: 6.1032 (6.1659)
2024-09-07,06:52:48 | INFO | Train Epoch: 4 [ 823296/2572288 (32%)] Data (t): 0.565 Batch (t): 2.618, 1562.44/s, 781.222/s/gpu LR: 0.000032 Logit Scale: 16.023 Contrastive_loss: 6.1207 (6.1508) Loss: 6.1207 (6.1508)
2024-09-07,06:57:10 | INFO | Train Epoch: 4 [1232896/2572288 (48%)] Data (t): 0.562 Batch (t): 2.616, 1566.88/s, 783.440/s/gpu LR: 0.000019 Logit Scale: 16.048 Contrastive_loss: 5.5887 (6.0103) Loss: 5.5887 (6.0103)
2024-09-07,07:01:31 | INFO | Train Epoch: 4 [1642496/2572288 (64%)] Data (t): 0.564 Batch (t): 2.617, 1570.10/s, 785.052/s/gpu LR: 0.000009 Logit Scale: 16.061 Contrastive_loss: 5.9363 (5.9955) Loss: 5.9363 (5.9955)
2024-09-07,07:05:53 | INFO | Train Epoch: 4 [2052096/2572288 (80%)] Data (t): 0.558 Batch (t): 2.611, 1569.54/s, 784.770/s/gpu LR: 0.000003 Logit Scale: 16.067 Contrastive_loss: 5.9174 (5.9825) Loss: 5.9174 (5.9825)
2024-09-07,07:10:14 | INFO | Train Epoch: 4 [2461696/2572288 (96%)] Data (t): 0.559 Batch (t): 2.611, 1568.58/s, 784.289/s/gpu LR: 0.000000 Logit Scale: 16.068 Contrastive_loss: 6.1942 (6.0127) Loss: 6.1942 (6.0127)
2024-09-07,07:11:24 | INFO | Train Epoch: 4 [2572288/2572288 (100%)] Data (t): 0.549 Batch (t): 2.604, 1583.04/s, 791.520/s/gpu LR: 0.000000 Logit Scale: 16.068 Contrastive_loss: 5.7131 (5.9753) Loss: 5.7131 (5.9753)
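The steady-state throughput column is consistent with the batch timings: each optimizer step processes the global batch of `batch_size × world_size = 4096` samples. A rough check against the final log line (the printed `Batch (t)` appears to be a running average while the rate uses the most recent batch, so agreement is only approximate):

```python
global_batch = 2048 * 2        # batch_size x world_size from the params dump
batch_seconds = 2.604          # Batch (t) from the final training line
samples_per_sec = global_batch / batch_seconds  # ~1573/s vs. the logged 1583.04/s
per_gpu = samples_per_sec / 2                   # ~786/s/gpu
```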