2024-09-07,07:15:56 | INFO | No latest resume checkpoint found in /home/breaking_0.3_trained/30_most_difficult/checkpoints.
2024-09-07,07:15:57 | INFO | Running in distributed mode with multiple processes. Device: cuda:0. Process (global: 0, local: 0), total: 2.
2024-09-07,07:15:57 | INFO | Loaded ViT-B-32 model config.
2024-09-07,07:15:58 | INFO | Model:
2024-09-07,07:15:58 | INFO | CLIP(
  (visual): VisionTransformer(
    (patchnorm_pre_ln): Identity()
    (conv1): Conv2d(3, 768, kernel_size=(32, 32), stride=(32, 32), bias=False)
    (patch_dropout): Identity()
    (ln_pre): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (transformer): Transformer(
      (resblocks): ModuleList(
        (0-11): 12 x ResidualAttentionBlock(
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          )
          (ls_1): Identity()
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
            (gelu): GELU(approximate='none')
            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          )
          (ls_2): Identity()
        )
      )
    )
    (ln_post): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (transformer): Transformer(
    (resblocks): ModuleList(
      (0-11): 12 x ResidualAttentionBlock(
        (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
        )
        (ls_1): Identity()
        (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=512, out_features=2048, bias=True)
          (gelu): GELU(approximate='none')
          (c_proj): Linear(in_features=2048, out_features=512, bias=True)
        )
        (ls_2): Identity()
      )
    )
  )
  (token_embedding): Embedding(49408, 512)
  (ln_final): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
)
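
The module tree above is the standard ViT-B-32 CLIP configuration: a 12-layer, width-768 vision tower over 32x32 patches and a 12-layer, width-512 text tower with a 49,408-token vocabulary. A minimal sketch of reproducing this dump, assuming the open-clip-torch package that produced this log:

import open_clip

# Build the same ViT-B-32 CLIP model from scratch (no pretrained weights,
# matching the empty `pretrained:` parameter below) and print its module
# tree, which should match the CLIP(...) dump logged above.
model, _, preprocess_val = open_clip.create_model_and_transforms("ViT-B-32")
print(model)
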
2024-09-07,07:15:58 | INFO | Params:
2024-09-07,07:15:58 | INFO | accum_freq: 1
2024-09-07,07:15:58 | INFO | aug_cfg: {}
2024-09-07,07:15:58 | INFO | batch_size: 2048
2024-09-07,07:15:58 | INFO | beta1: 0.9
2024-09-07,07:15:58 | INFO | beta2: 0.98
2024-09-07,07:15:58 | INFO | checkpoint_path: /home/breaking_0.3_trained/30_most_difficult/checkpoints
2024-09-07,07:15:58 | INFO | coca_caption_loss_weight: 2.0
2024-09-07,07:15:58 | INFO | coca_contrastive_loss_weight: 1.0
2024-09-07,07:15:58 | INFO | copy_codebase: False
2024-09-07,07:15:58 | INFO | csv_caption_key: title
2024-09-07,07:15:58 | INFO | csv_img_key: filepath
2024-09-07,07:15:58 | INFO | csv_separator:
2024-09-07,07:15:58 | INFO | dataset_resampled: True
2024-09-07,07:15:58 | INFO | dataset_type: webdataset
2024-09-07,07:15:58 | INFO | ddp_static_graph: True
2024-09-07,07:15:58 | INFO | debug: False
2024-09-07,07:15:58 | INFO | delete_previous_checkpoint: False
2024-09-07,07:15:58 | INFO | device: cuda:0
2024-09-07,07:15:58 | INFO | dist_backend: nccl
2024-09-07,07:15:58 | INFO | dist_url: env://
2024-09-07,07:15:58 | INFO | distill: False
2024-09-07,07:15:58 | INFO | distill_model: None
2024-09-07,07:15:58 | INFO | distill_pretrained: None
2024-09-07,07:15:58 | INFO | distributed: True
2024-09-07,07:15:58 | INFO | epochs: 5
2024-09-07,07:15:58 | INFO | epochs_cooldown: None
2024-09-07,07:15:58 | INFO | eps: 1e-06
2024-09-07,07:15:58 | INFO | force_custom_text: False
2024-09-07,07:15:58 | INFO | force_image_size: None
2024-09-07,07:15:58 | INFO | force_patch_dropout: None
2024-09-07,07:15:58 | INFO | force_quick_gelu: False
2024-09-07,07:15:58 | INFO | gather_with_grad: True
2024-09-07,07:15:58 | INFO | grad_checkpointing: True
2024-09-07,07:15:58 | INFO | grad_clip_norm: None
2024-09-07,07:15:58 | INFO | horovod: False
2024-09-07,07:15:58 | INFO | image_mean: None
2024-09-07,07:15:58 | INFO | image_std: None
2024-09-07,07:15:58 | INFO | imagenet_v2: None
2024-09-07,07:15:58 | INFO | imagenet_val: None
2024-09-07,07:15:58 | INFO | local_loss: True
2024-09-07,07:15:58 | INFO | local_rank: 0
2024-09-07,07:15:58 | INFO | lock_image: False
2024-09-07,07:15:58 | INFO | lock_image_freeze_bn_stats: False
2024-09-07,07:15:58 | INFO | lock_image_unlocked_groups: 0
2024-09-07,07:15:58 | INFO | lock_text: False
2024-09-07,07:15:58 | INFO | lock_text_freeze_layer_norm: False
2024-09-07,07:15:58 | INFO | lock_text_unlocked_layers: 0
2024-09-07,07:15:58 | INFO | log_every_n_steps: 100
2024-09-07,07:15:58 | INFO | log_level: 20
2024-09-07,07:15:58 | INFO | log_local: False
2024-09-07,07:15:58 | INFO | log_path: /home/breaking_0.3_trained/30_most_difficult/out.log
2024-09-07,07:15:58 | INFO | logs: /home/breaking_0.3_trained
2024-09-07,07:15:58 | INFO | lr: 0.0005
2024-09-07,07:15:58 | INFO | lr_cooldown_end: 0.0
2024-09-07,07:15:58 | INFO | lr_cooldown_power: 1.0
2024-09-07,07:15:58 | INFO | lr_scheduler: cosine
2024-09-07,07:15:58 | INFO | model: ViT-B-32
2024-09-07,07:15:58 | INFO | name: 30_most_difficult
2024-09-07,07:15:58 | INFO | no_set_device_rank: False
2024-09-07,07:15:58 | INFO | precision: amp
2024-09-07,07:15:58 | INFO | pretrained:
2024-09-07,07:15:58 | INFO | pretrained_image: False
2024-09-07,07:15:58 | INFO | rank: 0
2024-09-07,07:15:58 | INFO | remote_sync: None
2024-09-07,07:15:58 | INFO | remote_sync_frequency: 300
2024-09-07,07:15:58 | INFO | remote_sync_protocol: s3
2024-09-07,07:15:58 | INFO | report_to: wandb
2024-09-07,07:15:58 | INFO | resume: None
2024-09-07,07:15:58 | INFO | save_frequency: 0
2024-09-07,07:15:58 | INFO | save_most_recent: True
2024-09-07,07:15:58 | INFO | seed: 0
2024-09-07,07:15:58 | INFO | skip_scheduler: False
2024-09-07,07:15:58 | INFO | tensorboard: False
2024-09-07,07:15:58 | INFO | tensorboard_path:
2024-09-07,07:15:58 | INFO | torchscript: False
2024-09-07,07:15:58 | INFO | trace: False
2024-09-07,07:15:58 | INFO | train_data: /home/breaking_0.3/{00000000..00000335}.tar
2024-09-07,07:15:58 | INFO | train_data_upsampling_factors: None
2024-09-07,07:15:58 | INFO | train_num_samples: 2560000
2024-09-07,07:15:58 | INFO | use_bn_sync: False
2024-09-07,07:15:58 | INFO | val_data: None
2024-09-07,07:15:58 | INFO | val_frequency: 1
2024-09-07,07:15:58 | INFO | val_num_samples: None
2024-09-07,07:15:58 | INFO | wandb: True
2024-09-07,07:15:58 | INFO | wandb_notes:
2024-09-07,07:15:58 | INFO | wandb_project_name: clip_text_hq_clusters
2024-09-07,07:15:58 | INFO | warmup: 500
2024-09-07,07:15:58 | INFO | wd: 0.2
2024-09-07,07:15:58 | INFO | workers: 4
2024-09-07,07:15:58 | INFO | world_size: 2
2024-09-07,07:15:58 | INFO | zeroshot_frequency: 2
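
The learning-rate values in the training lines below follow from lr: 0.0005, warmup: 500, and lr_scheduler: cosine above. A sketch of that schedule, assuming 628 optimizer steps per epoch (2,572,288 samples per epoch at a 4,096-sample global batch) over 5 epochs; the helper name lr_at is hypothetical:

import math

BASE_LR, WARMUP = 5e-4, 500
TOTAL_STEPS = 628 * 5  # assumed: 2,572,288 samples/epoch / 4,096 global batch

def lr_at(step):
    # Linear warmup to BASE_LR, then cosine decay to zero.
    if step < WARMUP:
        return BASE_LR * (step + 1) / WARMUP
    frac = (step - WARMUP) / (TOTAL_STEPS - WARMUP)
    return 0.5 * (1 + math.cos(math.pi * frac)) * BASE_LR

print(f"{lr_at(100):.6f}")  # ~0.000101, cf. the 16% line of epoch 0
print(f"{lr_at(600):.6f}")  # ~0.000498, cf. the 96% line of epoch 0
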
2024-09-07,07:16:05 | INFO | Start epoch 0
2024-09-07,07:16:21 | INFO | Train Epoch: 0 [ 4096/2572288 (0%)] Data (t): 11.587 Batch (t): 16.138, 253.817/s, 126.909/s/gpu LR: 0.000001 Logit Scale: 14.286 Contrastive_loss: 8.3770 (8.3770) Loss: 8.3770 (8.3770)
2024-09-07,07:16:24 | INFO | Reducer buckets have been rebuilt in this iteration.
2024-09-07,07:20:41 | INFO | Train Epoch: 0 [ 413696/2572288 (16%)] Data (t): 0.546 Batch (t): 2.597, 1566.76/s, 783.379/s/gpu LR: 0.000101 Logit Scale: 14.267 Contrastive_loss: 8.2295 (8.3032) Loss: 8.2295 (8.3032)
2024-09-07,07:25:02 | INFO | Train Epoch: 0 [ 823296/2572288 (32%)] Data (t): 0.564 Batch (t): 2.611, 1568.76/s, 784.381/s/gpu LR: 0.000201 Logit Scale: 14.229 Contrastive_loss: 8.0991 (8.2352) Loss: 8.0991 (8.2352)
2024-09-07,07:29:23 | INFO | Train Epoch: 0 [1232896/2572288 (48%)] Data (t): 0.567 Batch (t): 2.615, 1561.64/s, 780.819/s/gpu LR: 0.000301 Logit Scale: 14.203 Contrastive_loss: 8.0503 (8.1890) Loss: 8.0503 (8.1890)
2024-09-07,07:33:45 | INFO | Train Epoch: 0 [1642496/2572288 (64%)] Data (t): 0.570 Batch (t): 2.618, 1564.16/s, 782.079/s/gpu LR: 0.000401 Logit Scale: 14.176 Contrastive_loss: 7.9354 (8.1382) Loss: 7.9354 (8.1382)
2024-09-07,07:38:07 | INFO | Train Epoch: 0 [2052096/2572288 (80%)] Data (t): 0.568 Batch (t): 2.615, 1569.69/s, 784.843/s/gpu LR: 0.000500 Logit Scale: 14.147 Contrastive_loss: 7.8547 (8.0910) Loss: 7.8547 (8.0910)
2024-09-07,07:42:28 | INFO | Train Epoch: 0 [2461696/2572288 (96%)] Data (t): 0.567 Batch (t): 2.616, 1570.76/s, 785.381/s/gpu LR: 0.000498 Logit Scale: 14.128 Contrastive_loss: 7.7547 (8.0430) Loss: 7.7547 (8.0430)
2024-09-07,07:43:39 | INFO | Train Epoch: 0 [2572288/2572288 (100%)] Data (t): 0.565 Batch (t): 2.613, 1578.30/s, 789.150/s/gpu LR: 0.000497 Logit Scale: 14.121 Contrastive_loss: 7.7028 (8.0004) Loss: 7.7028 (8.0004)
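
Two sanity checks on the epoch-0 numbers above: the Logit Scale column starts at 14.286, i.e. 1/0.07, the standard CLIP initialization of the exponentiated learnable temperature; and the first Contrastive_loss of 8.3770 sits near ln(4096) ≈ 8.32, the expected value for random features at this global batch size. A minimal sketch of the reported loss, assuming L2-normalized features (the function name is illustrative):

import torch
import torch.nn.functional as F

def clip_contrastive_loss(img_feats, txt_feats, logit_scale):
    # Symmetric InfoNCE over the image-text similarity matrix.
    # img_feats, txt_feats: (batch, dim), assumed L2-normalized.
    logits = logit_scale * img_feats @ txt_feats.t()
    labels = torch.arange(logits.shape[0], device=logits.device)
    return 0.5 * (F.cross_entropy(logits, labels)
                  + F.cross_entropy(logits.t(), labels))
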
2024-09-07,07:43:41 | INFO | Start epoch 1
2024-09-07,07:43:53 | INFO | Train Epoch: 1 [ 4096/2572288 (0%)] Data (t): 9.696 Batch (t): 11.740, 348.903/s, 174.452/s/gpu LR: 0.000497 Logit Scale: 14.122 Contrastive_loss: 7.7750 (7.7750) Loss: 7.7750 (7.7750)
2024-09-07,07:48:12 | INFO | Train Epoch: 1 [ 413696/2572288 (16%)] Data (t): 0.538 Batch (t): 2.594, 1571.17/s, 785.586/s/gpu LR: 0.000491 Logit Scale: 14.130 Contrastive_loss: 7.5770 (7.6760) Loss: 7.5770 (7.6760)
2024-09-07,07:52:34 | INFO | Train Epoch: 1 [ 823296/2572288 (32%)] Data (t): 0.564 Batch (t): 2.613, 1567.68/s, 783.840/s/gpu LR: 0.000481 Logit Scale: 14.150 Contrastive_loss: 7.4687 (7.6069) Loss: 7.4687 (7.6069)
2024-09-07,07:56:55 | INFO | Train Epoch: 1 [1232896/2572288 (48%)] Data (t): 0.567 Batch (t): 2.616, 1565.50/s, 782.748/s/gpu LR: 0.000468 Logit Scale: 14.191 Contrastive_loss: 7.3471 (7.5420) Loss: 7.3471 (7.5420)
2024-09-07,08:01:17 | INFO | Train Epoch: 1 [1642496/2572288 (64%)] Data (t): 0.566 Batch (t): 2.615, 1568.25/s, 784.125/s/gpu LR: 0.000452 Logit Scale: 14.263 Contrastive_loss: 7.2163 (7.4768) Loss: 7.2163 (7.4768)
2024-09-07,08:05:39 | INFO | Train Epoch: 1 [2052096/2572288 (80%)] Data (t): 0.570 Batch (t): 2.618, 1569.50/s, 784.748/s/gpu LR: 0.000433 Logit Scale: 14.350 Contrastive_loss: 7.1304 (7.4191) Loss: 7.1304 (7.4191)
2024-09-07,08:10:00 | INFO | Train Epoch: 1 [2461696/2572288 (96%)] Data (t): 0.568 Batch (t): 2.617, 1565.34/s, 782.672/s/gpu LR: 0.000412 Logit Scale: 14.430 Contrastive_loss: 7.1217 (7.3766) Loss: 7.1217 (7.3766)
2024-09-07,08:11:11 | INFO | Train Epoch: 1 [2572288/2572288 (100%)] Data (t): 0.565 Batch (t): 2.614, 1562.51/s, 781.255/s/gpu LR: 0.000406 Logit Scale: 14.452 Contrastive_loss: 7.2079 (7.3555) Loss: 7.2079 (7.3555)
2024-09-07,08:11:14 | INFO | Start epoch 2
2024-09-07,08:11:25 | INFO | Train Epoch: 2 [ 4096/2572288 (0%)] Data (t): 9.554 Batch (t): 11.600, 353.101/s, 176.550/s/gpu LR: 0.000405 Logit Scale: 14.452 Contrastive_loss: 6.6379 (6.6379) Loss: 6.6379 (6.6379)
2024-09-07,08:15:46 | INFO | Train Epoch: 2 [ 413696/2572288 (16%)] Data (t): 0.555 Batch (t): 2.611, 1572.32/s, 786.161/s/gpu LR: 0.000381 Logit Scale: 14.566 Contrastive_loss: 7.0270 (6.8325) Loss: 7.0270 (6.8325)
2024-09-07,08:20:08 | INFO | Train Epoch: 2 [ 823296/2572288 (32%)] Data (t): 0.565 Batch (t): 2.615, 1554.74/s, 777.370/s/gpu LR: 0.000355 Logit Scale: 14.681 Contrastive_loss: 7.1672 (6.9440) Loss: 7.1672 (6.9440)
2024-09-07,08:24:29 | INFO | Train Epoch: 2 [1232896/2572288 (48%)] Data (t): 0.566 Batch (t): 2.615, 1570.16/s, 785.078/s/gpu LR: 0.000327 Logit Scale: 14.790 Contrastive_loss: 6.8110 (6.9108) Loss: 6.8110 (6.9108)
2024-09-07,08:28:51 | INFO | Train Epoch: 2 [1642496/2572288 (64%)] Data (t): 0.566 Batch (t): 2.614, 1569.48/s, 784.739/s/gpu LR: 0.000298 Logit Scale: 14.901 Contrastive_loss: 6.6025 (6.8491) Loss: 6.6025 (6.8491)
2024-09-07,08:33:12 | INFO | Train Epoch: 2 [2052096/2572288 (80%)] Data (t): 0.566 Batch (t): 2.615, 1565.61/s, 782.805/s/gpu LR: 0.000269 Logit Scale: 15.018 Contrastive_loss: 6.7172 (6.8271) Loss: 6.7172 (6.8271)
2024-09-07,08:37:34 | INFO | Train Epoch: 2 [2461696/2572288 (96%)] Data (t): 0.565 Batch (t): 2.615, 1566.20/s, 783.102/s/gpu LR: 0.000239 Logit Scale: 15.130 Contrastive_loss: 6.4575 (6.7743) Loss: 6.4575 (6.7743)
2024-09-07,08:38:44 | INFO | Train Epoch: 2 [2572288/2572288 (100%)] Data (t): 0.563 Batch (t): 2.612, 1576.61/s, 788.306/s/gpu LR: 0.000231 Logit Scale: 15.173 Contrastive_loss: 6.3482 (6.7211) Loss: 6.3482 (6.7211)
2024-09-07,08:38:47 | INFO | Start epoch 3
2024-09-07,08:38:59 | INFO | Train Epoch: 3 [ 4096/2572288 (0%)] Data (t): 9.551 Batch (t): 11.600, 353.118/s, 176.559/s/gpu LR: 0.000231 Logit Scale: 15.175 Contrastive_loss: 6.4882 (6.4882) Loss: 6.4882 (6.4882)
2024-09-07,08:43:19 | INFO | Train Epoch: 3 [ 413696/2572288 (16%)] Data (t): 0.543 Batch (t): 2.601, 1571.86/s, 785.932/s/gpu LR: 0.000202 Logit Scale: 15.290 Contrastive_loss: 6.4287 (6.4584) Loss: 6.4287 (6.4584)
2024-09-07,08:47:40 | INFO | Train Epoch: 3 [ 823296/2572288 (32%)] Data (t): 0.562 Batch (t): 2.613, 1568.66/s, 784.328/s/gpu LR: 0.000173 Logit Scale: 15.426 Contrastive_loss: 6.5816 (6.4995) Loss: 6.5816 (6.4995)
2024-09-07,08:52:02 | INFO | Train Epoch: 3 [1232896/2572288 (48%)] Data (t): 0.567 Batch (t): 2.617, 1558.61/s, 779.304/s/gpu LR: 0.000145 Logit Scale: 15.514 Contrastive_loss: 6.4312 (6.4824) Loss: 6.4312 (6.4824)
2024-09-07,08:56:24 | INFO | Train Epoch: 3 [1642496/2572288 (64%)] Data (t): 0.566 Batch (t): 2.618, 1567.58/s, 783.790/s/gpu LR: 0.000119 Logit Scale: 15.599 Contrastive_loss: 5.8314 (6.3522) Loss: 5.8314 (6.3522)
2024-09-07,09:00:46 | INFO | Train Epoch: 3 [2052096/2572288 (80%)] Data (t): 0.570 Batch (t): 2.621, 1558.78/s, 779.390/s/gpu LR: 0.000095 Logit Scale: 15.691 Contrastive_loss: 5.5672 (6.2214) Loss: 5.5672 (6.2214)
2024-09-07,09:05:08 | INFO | Train Epoch: 3 [2461696/2572288 (96%)] Data (t): 0.566 Batch (t): 2.618, 1566.44/s, 783.219/s/gpu LR: 0.000072 Logit Scale: 15.787 Contrastive_loss: 5.9069 (6.1765) Loss: 5.9069 (6.1765)
2024-09-07,09:06:18 | INFO | Train Epoch: 3 [2572288/2572288 (100%)] Data (t): 0.562 Batch (t): 2.614, 1575.77/s, 787.886/s/gpu LR: 0.000067 Logit Scale: 15.805 Contrastive_loss: 5.5870 (6.1028) Loss: 5.5870 (6.1028)
2024-09-07,09:06:21 | INFO | Start epoch 4
2024-09-07,09:06:33 | INFO | Train Epoch: 4 [ 4096/2572288 (0%)] Data (t): 9.714 Batch (t): 11.763, 348.210/s, 174.105/s/gpu LR: 0.000067 Logit Scale: 15.806 Contrastive_loss: 5.4202 (5.4202) Loss: 5.4202 (5.4202)
2024-09-07,09:10:53 | INFO | Train Epoch: 4 [ 413696/2572288 (16%)] Data (t): 0.549 Batch (t): 2.607, 1561.80/s, 780.900/s/gpu LR: 0.000048 Logit Scale: 15.859 Contrastive_loss: 6.5509 (5.9855) Loss: 6.5509 (5.9855)
2024-09-07,09:15:15 | INFO | Train Epoch: 4 [ 823296/2572288 (32%)] Data (t): 0.564 Batch (t): 2.615, 1561.51/s, 780.753/s/gpu LR: 0.000032 Logit Scale: 15.902 Contrastive_loss: 5.7273 (5.8995) Loss: 5.7273 (5.8995)
2024-09-07,09:19:37 | INFO | Train Epoch: 4 [1232896/2572288 (48%)] Data (t): 0.568 Batch (t): 2.618, 1571.58/s, 785.789/s/gpu LR: 0.000019 Logit Scale: 15.925 Contrastive_loss: 6.0029 (5.9253) Loss: 6.0029 (5.9253)
2024-09-07,09:23:58 | INFO | Train Epoch: 4 [1642496/2572288 (64%)] Data (t): 0.564 Batch (t): 2.615, 1567.93/s, 783.965/s/gpu LR: 0.000009 Logit Scale: 15.935 Contrastive_loss: 5.0497 (5.7502) Loss: 5.0497 (5.7502)
2024-09-07,09:28:20 | INFO | Train Epoch: 4 [2052096/2572288 (80%)] Data (t): 0.565 Batch (t): 2.616, 1563.19/s, 781.593/s/gpu LR: 0.000003 Logit Scale: 15.940 Contrastive_loss: 5.5695 (5.7201) Loss: 5.5695 (5.7201)
2024-09-07,09:32:42 | INFO | Train Epoch: 4 [2461696/2572288 (96%)] Data (t): 0.565 Batch (t): 2.616, 1563.88/s, 781.939/s/gpu LR: 0.000000 Logit Scale: 15.942 Contrastive_loss: 5.5382 (5.6941) Loss: 5.5382 (5.6941)
2024-09-07,09:33:52 | INFO | Train Epoch: 4 [2572288/2572288 (100%)] Data (t): 0.558 Batch (t): 2.610, 1579.28/s, 789.638/s/gpu LR: 0.000000 Logit Scale: 15.942 Contrastive_loss: 5.7904 (5.7061) Loss: 5.7904 (5.7061)
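
A final consistency check on the logged throughput and step counts, using values taken from the lines above:

# All inputs below are read off the log; nothing here is measured anew.
world_size, per_gpu_batch = 2, 2048
global_batch = world_size * per_gpu_batch      # 4096 samples per step
batch_time = 2.615                             # typical Batch (t), seconds
print(global_batch / batch_time)               # ~1566 samples/s overall
print(global_batch / batch_time / world_size)  # ~783 samples/s/gpu
print(2_572_288 // global_batch)               # 628 optimizer steps per epoch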