2024-09-07,12:35:40 | INFO | No latest resume checkpoint found in /home/breaking_0.9_trained/90_most_difficult/checkpoints.
2024-09-07,12:35:41 | INFO | Running in distributed mode with multiple processes. Device: cuda:0. Process (global: 0, local: 0), total: 2.
2024-09-07,12:35:41 | INFO | Loaded ViT-B-32 model config.
2024-09-07,12:35:42 | INFO | Model:
2024-09-07,12:35:42 | INFO | CLIP(
  (visual): VisionTransformer(
    (patchnorm_pre_ln): Identity()
    (conv1): Conv2d(3, 768, kernel_size=(32, 32), stride=(32, 32), bias=False)
    (patch_dropout): Identity()
    (ln_pre): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (transformer): Transformer(
      (resblocks): ModuleList(
        (0): ResidualAttentionBlock(
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          )
          (ls_1): Identity()
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
            (gelu): GELU(approximate='none')
            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          )
          (ls_2): Identity()
        )
        (1): ResidualAttentionBlock(
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          )
          (ls_1): Identity()
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
            (gelu): GELU(approximate='none')
            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          )
          (ls_2): Identity()
        )
        (2): ResidualAttentionBlock(
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          )
          (ls_1): Identity()
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
            (gelu): GELU(approximate='none')
            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          )
          (ls_2): Identity()
        )
        (3): ResidualAttentionBlock(
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          )
          (ls_1): Identity()
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
            (gelu): GELU(approximate='none')
            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          )
          (ls_2): Identity()
        )
        (4): ResidualAttentionBlock(
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          )
          (ls_1): Identity()
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
            (gelu): GELU(approximate='none')
            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          )
          (ls_2): Identity()
        )
        (5): ResidualAttentionBlock(
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          )
          (ls_1): Identity()
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
            (gelu): GELU(approximate='none')
            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          )
          (ls_2): Identity()
        )
        (6): ResidualAttentionBlock(
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          )
          (ls_1): Identity()
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
            (gelu): GELU(approximate='none')
            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          )
          (ls_2): Identity()
        )
        (7): ResidualAttentionBlock(
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          )
          (ls_1): Identity()
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
            (gelu): GELU(approximate='none')
            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          )
          (ls_2): Identity()
        )
        (8): ResidualAttentionBlock(
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          )
          (ls_1): Identity()
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
            (gelu): GELU(approximate='none')
            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          )
          (ls_2): Identity()
        )
        (9): ResidualAttentionBlock(
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          )
          (ls_1): Identity()
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
            (gelu): GELU(approximate='none')
            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          )
          (ls_2): Identity()
        )
        (10): ResidualAttentionBlock(
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          )
          (ls_1): Identity()
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
            (gelu): GELU(approximate='none')
            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          )
          (ls_2): Identity()
        )
        (11): ResidualAttentionBlock(
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          )
          (ls_1): Identity()
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
            (gelu): GELU(approximate='none')
            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          )
          (ls_2): Identity()
        )
      )
    )
    (ln_post): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (transformer): Transformer(
    (resblocks): ModuleList(
      (0): ResidualAttentionBlock(
        (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
        )
        (ls_1): Identity()
        (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=512, out_features=2048, bias=True)
          (gelu): GELU(approximate='none')
          (c_proj): Linear(in_features=2048, out_features=512, bias=True)
        )
        (ls_2): Identity()
      )
      (1): ResidualAttentionBlock(
        (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
        )
        (ls_1): Identity()
        (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=512, out_features=2048, bias=True)
          (gelu): GELU(approximate='none')
          (c_proj): Linear(in_features=2048, out_features=512, bias=True)
        )
        (ls_2): Identity()
      )
      (2): ResidualAttentionBlock(
        (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
        )
        (ls_1): Identity()
        (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=512, out_features=2048, bias=True)
          (gelu): GELU(approximate='none')
          (c_proj): Linear(in_features=2048, out_features=512, bias=True)
        )
        (ls_2): Identity()
      )
      (3): ResidualAttentionBlock(
        (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
        )
        (ls_1): Identity()
        (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=512, out_features=2048, bias=True)
          (gelu): GELU(approximate='none')
          (c_proj): Linear(in_features=2048, out_features=512, bias=True)
        )
        (ls_2): Identity()
      )
      (4): ResidualAttentionBlock(
        (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
        )
        (ls_1): Identity()
        (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=512, out_features=2048, bias=True)
          (gelu): GELU(approximate='none')
          (c_proj): Linear(in_features=2048, out_features=512, bias=True)
        )
        (ls_2): Identity()
      )
      (5): ResidualAttentionBlock(
        (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
        )
        (ls_1): Identity()
        (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=512, out_features=2048, bias=True)
          (gelu): GELU(approximate='none')
          (c_proj): Linear(in_features=2048, out_features=512, bias=True)
        )
        (ls_2): Identity()
      )
      (6): ResidualAttentionBlock(
        (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
        )
        (ls_1): Identity()
        (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=512, out_features=2048, bias=True)
          (gelu): GELU(approximate='none')
          (c_proj): Linear(in_features=2048, out_features=512, bias=True)
        )
        (ls_2): Identity()
      )
      (7): ResidualAttentionBlock(
        (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
        )
        (ls_1): Identity()
        (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=512, out_features=2048, bias=True)
          (gelu): GELU(approximate='none')
          (c_proj): Linear(in_features=2048, out_features=512, bias=True)
        )
        (ls_2): Identity()
      )
      (8): ResidualAttentionBlock(
        (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
        )
        (ls_1): Identity()
        (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=512, out_features=2048, bias=True)
          (gelu): GELU(approximate='none')
          (c_proj): Linear(in_features=2048, out_features=512, bias=True)
        )
        (ls_2): Identity()
      )
      (9): ResidualAttentionBlock(
        (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
        )
        (ls_1): Identity()
        (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=512, out_features=2048, bias=True)
          (gelu): GELU(approximate='none')
          (c_proj): Linear(in_features=2048, out_features=512, bias=True)
        )
        (ls_2): Identity()
      )
      (10): ResidualAttentionBlock(
        (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
        )
        (ls_1): Identity()
        (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=512, out_features=2048, bias=True)
          (gelu): GELU(approximate='none')
          (c_proj): Linear(in_features=2048, out_features=512, bias=True)
        )
        (ls_2): Identity()
      )
      (11): ResidualAttentionBlock(
        (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
        )
        (ls_1): Identity()
        (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=512, out_features=2048, bias=True)
          (gelu): GELU(approximate='none')
          (c_proj): Linear(in_features=2048, out_features=512, bias=True)
        )
        (ls_2): Identity()
      )
    )
  )
  (token_embedding): Embedding(49408, 512)
  (ln_final): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
)
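The repr above repeats the same ResidualAttentionBlock twelve times per tower, at width 768 for the image tower and 512 for the text tower. As a sanity check, the per-block parameter count can be derived from the printed dimensions alone; this is a sketch covering only the transformer blocks (embeddings, conv1, and output projections excluded):

```python
def resblock_params(width: int) -> int:
    """Parameters in one ResidualAttentionBlock as printed above."""
    ln = 2 * width                       # LayerNorm: weight + bias
    attn = 4 * (width * width + width)   # q, k, v, out projections, each with bias
    # MLP: c_fc expands to 4*width (768 -> 3072, 512 -> 2048), c_proj contracts back
    mlp = (width * 4 * width + 4 * width) + (4 * width * width + width)
    return 2 * ln + attn + mlp

visual = 12 * resblock_params(768)   # image tower blocks: 85,054,464 (~85.1M)
text = 12 * resblock_params(512)     # text tower blocks: 37,828,608 (~37.8M)
```

Those totals are consistent with the usual ViT-B/32 CLIP sizing: a ViT-B image transformer of roughly 85M parameters paired with a 512-wide, 12-layer text transformer of roughly 38M.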
2024-09-07,12:35:42 | INFO | Params:
2024-09-07,12:35:42 | INFO | accum_freq: 1
2024-09-07,12:35:42 | INFO | aug_cfg: {}
2024-09-07,12:35:42 | INFO | batch_size: 2048
2024-09-07,12:35:42 | INFO | beta1: 0.9
2024-09-07,12:35:42 | INFO | beta2: 0.98
2024-09-07,12:35:42 | INFO | checkpoint_path: /home/breaking_0.9_trained/90_most_difficult/checkpoints
2024-09-07,12:35:42 | INFO | coca_caption_loss_weight: 2.0
2024-09-07,12:35:42 | INFO | coca_contrastive_loss_weight: 1.0
2024-09-07,12:35:42 | INFO | copy_codebase: False
2024-09-07,12:35:42 | INFO | csv_caption_key: title
2024-09-07,12:35:42 | INFO | csv_img_key: filepath
2024-09-07,12:35:42 | INFO | csv_separator:
2024-09-07,12:35:42 | INFO | dataset_resampled: True
2024-09-07,12:35:42 | INFO | dataset_type: webdataset
2024-09-07,12:35:42 | INFO | ddp_static_graph: True
2024-09-07,12:35:42 | INFO | debug: False
2024-09-07,12:35:42 | INFO | delete_previous_checkpoint: False
2024-09-07,12:35:42 | INFO | device: cuda:0
2024-09-07,12:35:42 | INFO | dist_backend: nccl
2024-09-07,12:35:42 | INFO | dist_url: env://
2024-09-07,12:35:42 | INFO | distill: False
2024-09-07,12:35:42 | INFO | distill_model: None
2024-09-07,12:35:42 | INFO | distill_pretrained: None
2024-09-07,12:35:42 | INFO | distributed: True
2024-09-07,12:35:42 | INFO | epochs: 5
2024-09-07,12:35:42 | INFO | epochs_cooldown: None
2024-09-07,12:35:42 | INFO | eps: 1e-06
2024-09-07,12:35:42 | INFO | force_custom_text: False
2024-09-07,12:35:42 | INFO | force_image_size: None
2024-09-07,12:35:42 | INFO | force_patch_dropout: None
2024-09-07,12:35:42 | INFO | force_quick_gelu: False
2024-09-07,12:35:42 | INFO | gather_with_grad: True
2024-09-07,12:35:42 | INFO | grad_checkpointing: True
2024-09-07,12:35:42 | INFO | grad_clip_norm: None
2024-09-07,12:35:42 | INFO | horovod: False
2024-09-07,12:35:42 | INFO | image_mean: None
2024-09-07,12:35:42 | INFO | image_std: None
2024-09-07,12:35:42 | INFO | imagenet_v2: None
2024-09-07,12:35:42 | INFO | imagenet_val: None
2024-09-07,12:35:42 | INFO | local_loss: True
2024-09-07,12:35:42 | INFO | local_rank: 0
2024-09-07,12:35:42 | INFO | lock_image: False
2024-09-07,12:35:42 | INFO | lock_image_freeze_bn_stats: False
2024-09-07,12:35:42 | INFO | lock_image_unlocked_groups: 0
2024-09-07,12:35:42 | INFO | lock_text: False
2024-09-07,12:35:42 | INFO | lock_text_freeze_layer_norm: False
2024-09-07,12:35:42 | INFO | lock_text_unlocked_layers: 0
2024-09-07,12:35:42 | INFO | log_every_n_steps: 100
2024-09-07,12:35:42 | INFO | log_level: 20
2024-09-07,12:35:42 | INFO | log_local: False
2024-09-07,12:35:42 | INFO | log_path: /home/breaking_0.9_trained/90_most_difficult/out.log
2024-09-07,12:35:42 | INFO | logs: /home/breaking_0.9_trained
2024-09-07,12:35:42 | INFO | lr: 0.0005
2024-09-07,12:35:42 | INFO | lr_cooldown_end: 0.0
2024-09-07,12:35:42 | INFO | lr_cooldown_power: 1.0
2024-09-07,12:35:42 | INFO | lr_scheduler: cosine
2024-09-07,12:35:42 | INFO | model: ViT-B-32
2024-09-07,12:35:42 | INFO | name: 90_most_difficult
2024-09-07,12:35:42 | INFO | no_set_device_rank: False
2024-09-07,12:35:42 | INFO | precision: amp
2024-09-07,12:35:42 | INFO | pretrained:
2024-09-07,12:35:42 | INFO | pretrained_image: False
2024-09-07,12:35:42 | INFO | rank: 0
2024-09-07,12:35:42 | INFO | remote_sync: None
2024-09-07,12:35:42 | INFO | remote_sync_frequency: 300
2024-09-07,12:35:42 | INFO | remote_sync_protocol: s3
2024-09-07,12:35:42 | INFO | report_to: wandb
2024-09-07,12:35:42 | INFO | resume: None
2024-09-07,12:35:42 | INFO | save_frequency: 0
2024-09-07,12:35:42 | INFO | save_most_recent: True
2024-09-07,12:35:42 | INFO | seed: 0
2024-09-07,12:35:42 | INFO | skip_scheduler: False
2024-09-07,12:35:42 | INFO | tensorboard: False
2024-09-07,12:35:42 | INFO | tensorboard_path:
2024-09-07,12:35:42 | INFO | torchscript: False
2024-09-07,12:35:42 | INFO | trace: False
2024-09-07,12:35:42 | INFO | train_data: /home/breaking_0.9/{00000000..00000962}.tar
2024-09-07,12:35:42 | INFO | train_data_upsampling_factors: None
2024-09-07,12:35:42 | INFO | train_num_samples: 2560000
2024-09-07,12:35:42 | INFO | use_bn_sync: False
2024-09-07,12:35:42 | INFO | val_data: None
2024-09-07,12:35:42 | INFO | val_frequency: 1
2024-09-07,12:35:42 | INFO | val_num_samples: None
2024-09-07,12:35:42 | INFO | wandb: True
2024-09-07,12:35:42 | INFO | wandb_notes:
2024-09-07,12:35:42 | INFO | wandb_project_name: clip_text_hq_clusters
2024-09-07,12:35:42 | INFO | warmup: 500
2024-09-07,12:35:42 | INFO | wd: 0.2
2024-09-07,12:35:42 | INFO | workers: 4
2024-09-07,12:35:42 | INFO | world_size: 2
2024-09-07,12:35:42 | INFO | zeroshot_frequency: 2
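The per-epoch sample count in the training lines below (2572288) is slightly above the requested train_num_samples (2560000). A minimal sketch of the likely rounding, assuming open_clip's webdataset path aligns the step count upward to a multiple of the dataloader workers:

```python
import math

# From the params dump above.
batch_size, world_size, workers = 2048, 2, 4
train_num_samples = 2_560_000

global_batch = batch_size * world_size               # 4096 samples per optimizer step
steps = math.ceil(train_num_samples / global_batch)  # 625 steps requested
steps = math.ceil(steps / workers) * workers         # 628, worker-aligned (assumption)
samples_per_epoch = steps * global_batch             # 2572288, as logged below
```

With 628 steps per epoch over 5 epochs, the run is 3140 optimizer steps total, of which the first 500 are warmup.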
2024-09-07,12:35:50 | INFO | Start epoch 0
2024-09-07,12:36:06 | INFO | Train Epoch: 0 [ 4096/2572288 (0%)] Data (t): 12.100 Batch (t): 16.653, 245.968/s, 122.984/s/gpu LR: 0.000001 Logit Scale: 14.286 Contrastive_loss: 8.3763 (8.3763) Loss: 8.3763 (8.3763)
2024-09-07,12:36:10 | INFO | Reducer buckets have been rebuilt in this iteration.
2024-09-07,12:40:27 | INFO | Train Epoch: 0 [ 413696/2572288 (16%)] Data (t): 0.552 Batch (t): 2.604, 1566.14/s, 783.069/s/gpu LR: 0.000101 Logit Scale: 14.266 Contrastive_loss: 8.1784 (8.2774) Loss: 8.1784 (8.2774)
2024-09-07,12:44:48 | INFO | Train Epoch: 0 [ 823296/2572288 (32%)] Data (t): 0.564 Batch (t): 2.612, 1563.16/s, 781.581/s/gpu LR: 0.000201 Logit Scale: 14.228 Contrastive_loss: 7.9988 (8.1845) Loss: 7.9988 (8.1845)
2024-09-07,12:49:09 | INFO | Train Epoch: 0 [1232896/2572288 (48%)] Data (t): 0.563 Batch (t): 2.613, 1564.24/s, 782.121/s/gpu LR: 0.000301 Logit Scale: 14.184 Contrastive_loss: 7.9886 (8.1355) Loss: 7.9886 (8.1355)
2024-09-07,12:53:31 | INFO | Train Epoch: 0 [1642496/2572288 (64%)] Data (t): 0.563 Batch (t): 2.612, 1570.66/s, 785.331/s/gpu LR: 0.000401 Logit Scale: 14.136 Contrastive_loss: 7.8946 (8.0873) Loss: 7.8946 (8.0873)
2024-09-07,12:57:52 | INFO | Train Epoch: 0 [2052096/2572288 (80%)] Data (t): 0.559 Batch (t): 2.609, 1567.63/s, 783.816/s/gpu LR: 0.000500 Logit Scale: 14.088 Contrastive_loss: 7.8069 (8.0406) Loss: 7.8069 (8.0406)
2024-09-07,13:02:13 | INFO | Train Epoch: 0 [2461696/2572288 (96%)] Data (t): 0.560 Batch (t): 2.611, 1576.40/s, 788.198/s/gpu LR: 0.000498 Logit Scale: 14.064 Contrastive_loss: 7.7242 (7.9954) Loss: 7.7242 (7.9954)
2024-09-07,13:03:23 | INFO | Train Epoch: 0 [2572288/2572288 (100%)] Data (t): 0.557 Batch (t): 2.607, 1578.68/s, 789.338/s/gpu LR: 0.000497 Logit Scale: 14.063 Contrastive_loss: 7.6876 (7.9569) Loss: 7.6876 (7.9569)
2024-09-07,13:03:26 | INFO | Start epoch 1
2024-09-07,13:03:37 | INFO | Train Epoch: 1 [ 4096/2572288 (0%)] Data (t): 9.636 Batch (t): 11.680, 350.674/s, 175.337/s/gpu LR: 0.000497 Logit Scale: 14.063 Contrastive_loss: 7.6917 (7.6917) Loss: 7.6917 (7.6917)
2024-09-07,13:07:57 | INFO | Train Epoch: 1 [ 413696/2572288 (16%)] Data (t): 0.538 Batch (t): 2.597, 1573.69/s, 786.847/s/gpu LR: 0.000491 Logit Scale: 14.065 Contrastive_loss: 7.6440 (7.6679) Loss: 7.6440 (7.6679)
2024-09-07,13:12:18 | INFO | Train Epoch: 1 [ 823296/2572288 (32%)] Data (t): 0.557 Batch (t): 2.609, 1572.22/s, 786.112/s/gpu LR: 0.000481 Logit Scale: 14.094 Contrastive_loss: 7.5110 (7.6156) Loss: 7.5110 (7.6156)
2024-09-07,13:16:39 | INFO | Train Epoch: 1 [1232896/2572288 (48%)] Data (t): 0.557 Batch (t): 2.609, 1571.16/s, 785.581/s/gpu LR: 0.000468 Logit Scale: 14.146 Contrastive_loss: 7.5073 (7.5885) Loss: 7.5073 (7.5885)
2024-09-07,13:20:59 | INFO | Train Epoch: 1 [1642496/2572288 (64%)] Data (t): 0.557 Batch (t): 2.607, 1575.14/s, 787.570/s/gpu LR: 0.000452 Logit Scale: 14.215 Contrastive_loss: 7.3952 (7.5499) Loss: 7.3952 (7.5499)
2024-09-07,13:25:20 | INFO | Train Epoch: 1 [2052096/2572288 (80%)] Data (t): 0.560 Batch (t): 2.610, 1553.88/s, 776.941/s/gpu LR: 0.000433 Logit Scale: 14.321 Contrastive_loss: 7.3651 (7.5191) Loss: 7.3651 (7.5191)
2024-09-07,13:29:42 | INFO | Train Epoch: 1 [2461696/2572288 (96%)] Data (t): 0.560 Batch (t): 2.611, 1564.99/s, 782.493/s/gpu LR: 0.000412 Logit Scale: 14.443 Contrastive_loss: 7.3117 (7.4894) Loss: 7.3117 (7.4894)
2024-09-07,13:30:52 | INFO | Train Epoch: 1 [2572288/2572288 (100%)] Data (t): 0.559 Batch (t): 2.607, 1581.65/s, 790.826/s/gpu LR: 0.000406 Logit Scale: 14.479 Contrastive_loss: 7.1874 (7.4517) Loss: 7.1874 (7.4517)
2024-09-07,13:30:55 | INFO | Start epoch 2
2024-09-07,13:31:06 | INFO | Train Epoch: 2 [ 4096/2572288 (0%)] Data (t): 9.545 Batch (t): 11.591, 353.370/s, 176.685/s/gpu LR: 0.000405 Logit Scale: 14.481 Contrastive_loss: 7.2456 (7.2456) Loss: 7.2456 (7.2456)
2024-09-07,13:35:27 | INFO | Train Epoch: 2 [ 413696/2572288 (16%)] Data (t): 0.545 Batch (t): 2.603, 1569.79/s, 784.896/s/gpu LR: 0.000381 Logit Scale: 14.642 Contrastive_loss: 7.2023 (7.2240) Loss: 7.2023 (7.2240)
2024-09-07,13:39:47 | INFO | Train Epoch: 2 [ 823296/2572288 (32%)] Data (t): 0.559 Batch (t): 2.609, 1571.68/s, 785.841/s/gpu LR: 0.000355 Logit Scale: 14.807 Contrastive_loss: 7.0309 (7.1596) Loss: 7.0309 (7.1596)
2024-09-07,13:44:09 | INFO | Train Epoch: 2 [1232896/2572288 (48%)] Data (t): 0.562 Batch (t): 2.611, 1565.52/s, 782.758/s/gpu LR: 0.000327 Logit Scale: 14.927 Contrastive_loss: 7.1046 (7.1459) Loss: 7.1046 (7.1459)
2024-09-07,13:48:30 | INFO | Train Epoch: 2 [1642496/2572288 (64%)] Data (t): 0.563 Batch (t): 2.614, 1569.29/s, 784.644/s/gpu LR: 0.000298 Logit Scale: 15.085 Contrastive_loss: 6.8606 (7.0888) Loss: 6.8606 (7.0888)
2024-09-07,13:52:51 | INFO | Train Epoch: 2 [2052096/2572288 (80%)] Data (t): 0.562 Batch (t): 2.614, 1568.08/s, 784.039/s/gpu LR: 0.000269 Logit Scale: 15.223 Contrastive_loss: 6.8216 (7.0443) Loss: 6.8216 (7.0443)
2024-09-07,13:57:13 | INFO | Train Epoch: 2 [2461696/2572288 (96%)] Data (t): 0.563 Batch (t): 2.613, 1567.16/s, 783.578/s/gpu LR: 0.000239 Logit Scale: 15.374 Contrastive_loss: 6.6735 (6.9913) Loss: 6.6735 (6.9913)
2024-09-07,13:58:23 | INFO | Train Epoch: 2 [2572288/2572288 (100%)] Data (t): 0.561 Batch (t): 2.611, 1576.59/s, 788.293/s/gpu LR: 0.000231 Logit Scale: 15.422 Contrastive_loss: 6.7912 (6.9663) Loss: 6.7912 (6.9663)
2024-09-07,13:58:26 | INFO | Start epoch 3
2024-09-07,13:58:37 | INFO | Train Epoch: 3 [ 4096/2572288 (0%)] Data (t): 9.438 Batch (t): 11.480, 356.781/s, 178.391/s/gpu LR: 0.000231 Logit Scale: 15.423 Contrastive_loss: 6.6685 (6.6685) Loss: 6.6685 (6.6685)
2024-09-07,14:02:58 | INFO | Train Epoch: 3 [ 413696/2572288 (16%)] Data (t): 0.547 Batch (t): 2.606, 1563.69/s, 781.844/s/gpu LR: 0.000202 Logit Scale: 15.573 Contrastive_loss: 6.7789 (6.7237) Loss: 6.7789 (6.7237)
2024-09-07,14:07:19 | INFO | Train Epoch: 3 [ 823296/2572288 (32%)] Data (t): 0.560 Batch (t): 2.613, 1571.76/s, 785.878/s/gpu LR: 0.000173 Logit Scale: 15.734 Contrastive_loss: 6.6477 (6.6984) Loss: 6.6477 (6.6984)
2024-09-07,14:11:41 | INFO | Train Epoch: 3 [1232896/2572288 (48%)] Data (t): 0.561 Batch (t): 2.613, 1567.31/s, 783.654/s/gpu LR: 0.000145 Logit Scale: 15.861 Contrastive_loss: 6.5687 (6.6660) Loss: 6.5687 (6.6660)
2024-09-07,14:16:02 | INFO | Train Epoch: 3 [1642496/2572288 (64%)] Data (t): 0.563 Batch (t): 2.614, 1571.02/s, 785.509/s/gpu LR: 0.000119 Logit Scale: 15.976 Contrastive_loss: 6.6244 (6.6576) Loss: 6.6244 (6.6576)
2024-09-07,14:20:23 | INFO | Train Epoch: 3 [2052096/2572288 (80%)] Data (t): 0.562 Batch (t): 2.614, 1569.91/s, 784.953/s/gpu LR: 0.000095 Logit Scale: 16.078 Contrastive_loss: 6.3511 (6.6066) Loss: 6.3511 (6.6066)
2024-09-07,14:24:45 | INFO | Train Epoch: 3 [2461696/2572288 (96%)] Data (t): 0.565 Batch (t): 2.617, 1565.69/s, 782.846/s/gpu LR: 0.000072 Logit Scale: 16.172 Contrastive_loss: 6.3930 (6.5761) Loss: 6.3930 (6.5761)
2024-09-07,14:25:56 | INFO | Train Epoch: 3 [2572288/2572288 (100%)] Data (t): 0.559 Batch (t): 2.610, 1579.64/s, 789.822/s/gpu LR: 0.000067 Logit Scale: 16.193 Contrastive_loss: 6.6402 (6.5841) Loss: 6.6402 (6.5841)
2024-09-07,14:25:58 | INFO | Start epoch 4
2024-09-07,14:26:10 | INFO | Train Epoch: 4 [ 4096/2572288 (0%)] Data (t): 9.504 Batch (t): 11.549, 354.649/s, 177.325/s/gpu LR: 0.000067 Logit Scale: 16.193 Contrastive_loss: 6.5566 (6.5566) Loss: 6.5566 (6.5566)
2024-09-07,14:30:31 | INFO | Train Epoch: 4 [ 413696/2572288 (16%)] Data (t): 0.546 Batch (t): 2.607, 1566.76/s, 783.382/s/gpu LR: 0.000048 Logit Scale: 16.260 Contrastive_loss: 6.4124 (6.4845) Loss: 6.4124 (6.4845)
2024-09-07,14:34:52 | INFO | Train Epoch: 4 [ 823296/2572288 (32%)] Data (t): 0.563 Batch (t): 2.615, 1552.31/s, 776.154/s/gpu LR: 0.000032 Logit Scale: 16.300 Contrastive_loss: 6.3687 (6.4459) Loss: 6.3687 (6.4459)
2024-09-07,14:39:14 | INFO | Train Epoch: 4 [1232896/2572288 (48%)] Data (t): 0.563 Batch (t): 2.617, 1567.87/s, 783.936/s/gpu LR: 0.000019 Logit Scale: 16.329 Contrastive_loss: 6.3193 (6.4142) Loss: 6.3193 (6.4142)
2024-09-07,14:43:35 | INFO | Train Epoch: 4 [1642496/2572288 (64%)] Data (t): 0.561 Batch (t): 2.613, 1567.16/s, 783.580/s/gpu LR: 0.000009 Logit Scale: 16.343 Contrastive_loss: 6.3362 (6.3986) Loss: 6.3362 (6.3986)
2024-09-07,14:47:57 | INFO | Train Epoch: 4 [2052096/2572288 (80%)] Data (t): 0.566 Batch (t): 2.618, 1566.95/s, 783.475/s/gpu LR: 0.000003 Logit Scale: 16.350 Contrastive_loss: 6.1241 (6.3529) Loss: 6.1241 (6.3529)
2024-09-07,14:52:19 | INFO | Train Epoch: 4 [2461696/2572288 (96%)] Data (t): 0.564 Batch (t): 2.618, 1557.86/s, 778.931/s/gpu LR: 0.000000 Logit Scale: 16.352 Contrastive_loss: 6.2534 (6.3387) Loss: 6.2534 (6.3387)
2024-09-07,14:53:29 | INFO | Train Epoch: 4 [2572288/2572288 (100%)] Data (t): 0.559 Batch (t): 2.611, 1577.81/s, 788.906/s/gpu LR: 0.000000 Logit Scale: 16.352 Contrastive_loss: 6.4079 (6.3473) Loss: 6.4079 (6.3473)
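The LR trace above (0.000001 at the first step, 0.000500 right after warmup, 0.000000 at the very end) matches a linear-warmup plus cosine-decay schedule over 628 steps/epoch for 5 epochs. A sketch, assuming open_clip's cosine_lr convention of scaling by (step + 1) / warmup during warmup:

```python
import math

# From the params dump: lr=5e-4, warmup=500; 628 steps/epoch over 5 epochs.
base_lr, warmup, total_steps = 5e-4, 500, 628 * 5

def lr_at(step: int) -> float:
    if step < warmup:
        return base_lr * (step + 1) / warmup      # linear warmup; step 0 -> 1e-6
    e = (step - warmup) / (total_steps - warmup)  # fraction of decay completed
    return 0.5 * (1 + math.cos(math.pi * e)) * base_lr
```

Spot checks against the log: lr_at(0) reproduces the 0.000001 of the first training line, lr_at(100) the 0.000101 at 16% of epoch 0, and the final steps decay to effectively zero, matching the closing "LR: 0.000000" entries.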