added more trained models
cf6f528
2024-09-07,10:00:53 | INFO | No latest resume checkpoint found in /home/breaking_0.7_trained/70_most_difficult/checkpoints.
2024-09-07,10:00:55 | INFO | Running in distributed mode with multiple processes. Device: cuda:0. Process (global: 0, local 0), total 2.
2024-09-07,10:00:55 | INFO | Loaded ViT-B-32 model config.
2024-09-07,10:00:56 | INFO | Model:
2024-09-07,10:00:56 | INFO | CLIP(
  (visual): VisionTransformer(
    (patchnorm_pre_ln): Identity()
    (conv1): Conv2d(3, 768, kernel_size=(32, 32), stride=(32, 32), bias=False)
    (patch_dropout): Identity()
    (ln_pre): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (transformer): Transformer(
      (resblocks): ModuleList(
        (0-11): 12 x ResidualAttentionBlock(
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          )
          (ls_1): Identity()
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
            (gelu): GELU(approximate='none')
            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          )
          (ls_2): Identity()
        )
      )
    )
    (ln_post): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (transformer): Transformer(
    (resblocks): ModuleList(
      (0-11): 12 x ResidualAttentionBlock(
        (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
        )
        (ls_1): Identity()
        (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=512, out_features=2048, bias=True)
          (gelu): GELU(approximate='none')
          (c_proj): Linear(in_features=2048, out_features=512, bias=True)
        )
        (ls_2): Identity()
      )
    )
  )
  (token_embedding): Embedding(49408, 512)
  (ln_final): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
)
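A quick sanity check on a dump like this is to total the parameters of the modules it lists. A minimal pure-Python sketch for the visual tower, counting only what the printed repr shows (conv1, ln_pre, the 12 residual blocks, ln_post; the class/positional embeddings and output projection are not printed and are excluded):

```python
# Parameter counts reconstructed from the printed module shapes (visual tower only).
# Note: nn.MultiheadAttention hides its in_proj in the repr, but a d x d attention
# layer carries a (3d x d) in_proj weight, 3d bias, plus the shown (d x d) out_proj (+d).
d, mlp_dim, patch, ch = 768, 3072, 32, 3

conv1 = ch * d * patch * patch            # Conv2d(3, 768, 32x32), bias=False
ln = 2 * d                                # LayerNorm weight + bias
attn = (3 * d * d + 3 * d) + (d * d + d)  # in_proj + out_proj
mlp = (d * mlp_dim + mlp_dim) + (mlp_dim * d + d)  # c_fc + c_proj
block = 2 * ln + attn + mlp               # ln_1, ln_2, attn, mlp (ls_* are Identity)

visual_listed = conv1 + ln + 12 * block + ln  # + ln_pre and ln_post
print(f"{visual_listed:,}")  # 87,416,832
```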
2024-09-07,10:00:56 | INFO | Params:
2024-09-07,10:00:56 | INFO | accum_freq: 1
2024-09-07,10:00:56 | INFO | aug_cfg: {}
2024-09-07,10:00:56 | INFO | batch_size: 2048
2024-09-07,10:00:56 | INFO | beta1: 0.9
2024-09-07,10:00:56 | INFO | beta2: 0.98
2024-09-07,10:00:56 | INFO | checkpoint_path: /home/breaking_0.7_trained/70_most_difficult/checkpoints
2024-09-07,10:00:56 | INFO | coca_caption_loss_weight: 2.0
2024-09-07,10:00:56 | INFO | coca_contrastive_loss_weight: 1.0
2024-09-07,10:00:56 | INFO | copy_codebase: False
2024-09-07,10:00:56 | INFO | csv_caption_key: title
2024-09-07,10:00:56 | INFO | csv_img_key: filepath
2024-09-07,10:00:56 | INFO | csv_separator:
2024-09-07,10:00:56 | INFO | dataset_resampled: True
2024-09-07,10:00:56 | INFO | dataset_type: webdataset
2024-09-07,10:00:56 | INFO | ddp_static_graph: True
2024-09-07,10:00:56 | INFO | debug: False
2024-09-07,10:00:56 | INFO | delete_previous_checkpoint: False
2024-09-07,10:00:56 | INFO | device: cuda:0
2024-09-07,10:00:56 | INFO | dist_backend: nccl
2024-09-07,10:00:56 | INFO | dist_url: env://
2024-09-07,10:00:56 | INFO | distill: False
2024-09-07,10:00:56 | INFO | distill_model: None
2024-09-07,10:00:56 | INFO | distill_pretrained: None
2024-09-07,10:00:56 | INFO | distributed: True
2024-09-07,10:00:56 | INFO | epochs: 5
2024-09-07,10:00:56 | INFO | epochs_cooldown: None
2024-09-07,10:00:56 | INFO | eps: 1e-06
2024-09-07,10:00:56 | INFO | force_custom_text: False
2024-09-07,10:00:56 | INFO | force_image_size: None
2024-09-07,10:00:56 | INFO | force_patch_dropout: None
2024-09-07,10:00:56 | INFO | force_quick_gelu: False
2024-09-07,10:00:56 | INFO | gather_with_grad: True
2024-09-07,10:00:56 | INFO | grad_checkpointing: True
2024-09-07,10:00:56 | INFO | grad_clip_norm: None
2024-09-07,10:00:56 | INFO | horovod: False
2024-09-07,10:00:56 | INFO | image_mean: None
2024-09-07,10:00:56 | INFO | image_std: None
2024-09-07,10:00:56 | INFO | imagenet_v2: None
2024-09-07,10:00:56 | INFO | imagenet_val: None
2024-09-07,10:00:56 | INFO | local_loss: True
2024-09-07,10:00:56 | INFO | local_rank: 0
2024-09-07,10:00:56 | INFO | lock_image: False
2024-09-07,10:00:56 | INFO | lock_image_freeze_bn_stats: False
2024-09-07,10:00:56 | INFO | lock_image_unlocked_groups: 0
2024-09-07,10:00:56 | INFO | lock_text: False
2024-09-07,10:00:56 | INFO | lock_text_freeze_layer_norm: False
2024-09-07,10:00:56 | INFO | lock_text_unlocked_layers: 0
2024-09-07,10:00:56 | INFO | log_every_n_steps: 100
2024-09-07,10:00:56 | INFO | log_level: 20
2024-09-07,10:00:56 | INFO | log_local: False
2024-09-07,10:00:56 | INFO | log_path: /home/breaking_0.7_trained/70_most_difficult/out.log
2024-09-07,10:00:56 | INFO | logs: /home/breaking_0.7_trained
2024-09-07,10:00:56 | INFO | lr: 0.0005
2024-09-07,10:00:56 | INFO | lr_cooldown_end: 0.0
2024-09-07,10:00:56 | INFO | lr_cooldown_power: 1.0
2024-09-07,10:00:56 | INFO | lr_scheduler: cosine
2024-09-07,10:00:56 | INFO | model: ViT-B-32
2024-09-07,10:00:56 | INFO | name: 70_most_difficult
2024-09-07,10:00:56 | INFO | no_set_device_rank: False
2024-09-07,10:00:56 | INFO | precision: amp
2024-09-07,10:00:56 | INFO | pretrained:
2024-09-07,10:00:56 | INFO | pretrained_image: False
2024-09-07,10:00:56 | INFO | rank: 0
2024-09-07,10:00:56 | INFO | remote_sync: None
2024-09-07,10:00:56 | INFO | remote_sync_frequency: 300
2024-09-07,10:00:56 | INFO | remote_sync_protocol: s3
2024-09-07,10:00:56 | INFO | report_to: wandb
2024-09-07,10:00:56 | INFO | resume: None
2024-09-07,10:00:56 | INFO | save_frequency: 0
2024-09-07,10:00:56 | INFO | save_most_recent: True
2024-09-07,10:00:56 | INFO | seed: 0
2024-09-07,10:00:56 | INFO | skip_scheduler: False
2024-09-07,10:00:56 | INFO | tensorboard: False
2024-09-07,10:00:56 | INFO | tensorboard_path:
2024-09-07,10:00:56 | INFO | torchscript: False
2024-09-07,10:00:56 | INFO | trace: False
2024-09-07,10:00:56 | INFO | train_data: /home/breaking_0.7/{00000000..00000763}.tar
2024-09-07,10:00:56 | INFO | train_data_upsampling_factors: None
2024-09-07,10:00:56 | INFO | train_num_samples: 2560000
2024-09-07,10:00:56 | INFO | use_bn_sync: False
2024-09-07,10:00:56 | INFO | val_data: None
2024-09-07,10:00:56 | INFO | val_frequency: 1
2024-09-07,10:00:56 | INFO | val_num_samples: None
2024-09-07,10:00:56 | INFO | wandb: True
2024-09-07,10:00:56 | INFO | wandb_notes:
2024-09-07,10:00:56 | INFO | wandb_project_name: clip_text_hq_clusters
2024-09-07,10:00:56 | INFO | warmup: 500
2024-09-07,10:00:56 | INFO | wd: 0.2
2024-09-07,10:00:56 | INFO | workers: 4
2024-09-07,10:00:56 | INFO | world_size: 2
2024-09-07,10:00:56 | INFO | zeroshot_frequency: 2
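The parameter dump above maps onto open_clip's training CLI. A plausible reconstruction of the launch command for this run (flag spellings follow open_clip's `training.main`; treat this as a sketch inferred from the logged params, not the exact invocation used):

```shell
# Hypothetical reconstruction from the logged params (world_size 2 -> 2 GPUs).
torchrun --nproc_per_node 2 -m training.main \
  --model ViT-B-32 \
  --name 70_most_difficult \
  --logs /home/breaking_0.7_trained \
  --train-data "/home/breaking_0.7/{00000000..00000763}.tar" \
  --train-num-samples 2560000 \
  --dataset-type webdataset \
  --dataset-resampled \
  --batch-size 2048 \
  --epochs 5 \
  --lr 5e-4 --warmup 500 --wd 0.2 --beta2 0.98 --eps 1e-6 \
  --precision amp \
  --local-loss --gather-with-grad --grad-checkpointing --ddp-static-graph \
  --save-most-recent \
  --workers 4 --seed 0 \
  --report-to wandb --wandb-project-name clip_text_hq_clusters
```

The per-GPU batch size of 2048 across 2 processes gives the global batch of 4096 seen in the step counters below.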
2024-09-07,10:01:02 | INFO | Start epoch 0
2024-09-07,10:01:19 | INFO | Train Epoch: 0 [ 4096/2572288 (0%)] Data (t): 12.092 Batch (t): 16.765, 244.323/s, 122.161/s/gpu LR: 0.000001 Logit Scale: 14.286 Contrastive_loss: 8.3788 (8.3788) Loss: 8.3788 (8.3788)
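The initial `Logit Scale: 14.286` is not arbitrary: CLIP initializes its learnable temperature parameter to ln(1/0.07), and the log reports the exponentiated value, so training starts at 1/0.07. A one-line arithmetic check:

```python
import math

# CLIP initializes logit_scale = log(1 / 0.07); the log prints exp(logit_scale).
initial_scale = math.exp(math.log(1 / 0.07))
print(round(initial_scale, 3))  # 14.286, matching the first logged value
```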
2024-09-07,10:01:22 | INFO | Reducer buckets have been rebuilt in this iteration.
2024-09-07,10:05:39 | INFO | Train Epoch: 0 [ 413696/2572288 (16%)] Data (t): 0.547 Batch (t): 2.602, 1574.29/s, 787.144/s/gpu LR: 0.000101 Logit Scale: 14.266 Contrastive_loss: 8.2004 (8.2896) Loss: 8.2004 (8.2896)
2024-09-07,10:10:01 | INFO | Train Epoch: 0 [ 823296/2572288 (32%)] Data (t): 0.564 Batch (t): 2.614, 1566.47/s, 783.235/s/gpu LR: 0.000201 Logit Scale: 14.231 Contrastive_loss: 8.0909 (8.2234) Loss: 8.0909 (8.2234)
2024-09-07,10:14:22 | INFO | Train Epoch: 0 [1232896/2572288 (48%)] Data (t): 0.566 Batch (t): 2.616, 1572.98/s, 786.491/s/gpu LR: 0.000301 Logit Scale: 14.194 Contrastive_loss: 7.9780 (8.1620) Loss: 7.9780 (8.1620)
2024-09-07,10:18:44 | INFO | Train Epoch: 0 [1642496/2572288 (64%)] Data (t): 0.564 Batch (t): 2.615, 1563.77/s, 781.886/s/gpu LR: 0.000401 Logit Scale: 14.152 Contrastive_loss: 7.8635 (8.1023) Loss: 7.8635 (8.1023)
2024-09-07,10:23:06 | INFO | Train Epoch: 0 [2052096/2572288 (80%)] Data (t): 0.565 Batch (t): 2.616, 1563.40/s, 781.699/s/gpu LR: 0.000500 Logit Scale: 14.108 Contrastive_loss: 7.8084 (8.0534) Loss: 7.8084 (8.0534)
2024-09-07,10:27:28 | INFO | Train Epoch: 0 [2461696/2572288 (96%)] Data (t): 0.573 Batch (t): 2.623, 1561.55/s, 780.775/s/gpu LR: 0.000498 Logit Scale: 14.085 Contrastive_loss: 7.7462 (8.0095) Loss: 7.7462 (8.0095)
2024-09-07,10:28:38 | INFO | Train Epoch: 0 [2572288/2572288 (100%)] Data (t): 0.566 Batch (t): 2.617, 1568.93/s, 784.465/s/gpu LR: 0.000497 Logit Scale: 14.085 Contrastive_loss: 7.6566 (7.9654) Loss: 7.6566 (7.9654)
2024-09-07,10:28:41 | INFO | Start epoch 1
2024-09-07,10:28:53 | INFO | Train Epoch: 1 [ 4096/2572288 (0%)] Data (t): 9.646 Batch (t): 11.692, 350.330/s, 175.165/s/gpu LR: 0.000497 Logit Scale: 14.085 Contrastive_loss: 7.6610 (7.6610) Loss: 7.6610 (7.6610)
2024-09-07,10:33:13 | INFO | Train Epoch: 1 [ 413696/2572288 (16%)] Data (t): 0.548 Batch (t): 2.606, 1563.83/s, 781.915/s/gpu LR: 0.000491 Logit Scale: 14.097 Contrastive_loss: 7.5724 (7.6167) Loss: 7.5724 (7.6167)
2024-09-07,10:37:35 | INFO | Train Epoch: 1 [ 823296/2572288 (32%)] Data (t): 0.565 Batch (t): 2.615, 1566.09/s, 783.044/s/gpu LR: 0.000481 Logit Scale: 14.127 Contrastive_loss: 7.4356 (7.5563) Loss: 7.4356 (7.5563)
2024-09-07,10:41:56 | INFO | Train Epoch: 1 [1232896/2572288 (48%)] Data (t): 0.566 Batch (t): 2.616, 1558.96/s, 779.482/s/gpu LR: 0.000468 Logit Scale: 14.170 Contrastive_loss: 7.3573 (7.5066) Loss: 7.3573 (7.5066)
2024-09-07,10:46:18 | INFO | Train Epoch: 1 [1642496/2572288 (64%)] Data (t): 0.568 Batch (t): 2.620, 1562.54/s, 781.271/s/gpu LR: 0.000452 Logit Scale: 14.245 Contrastive_loss: 7.4000 (7.4853) Loss: 7.4000 (7.4853)
2024-09-07,10:50:41 | INFO | Train Epoch: 1 [2052096/2572288 (80%)] Data (t): 0.571 Batch (t): 2.622, 1563.27/s, 781.634/s/gpu LR: 0.000433 Logit Scale: 14.335 Contrastive_loss: 7.2466 (7.4455) Loss: 7.2466 (7.4455)
2024-09-07,10:55:03 | INFO | Train Epoch: 1 [2461696/2572288 (96%)] Data (t): 0.570 Batch (t): 2.620, 1566.36/s, 783.178/s/gpu LR: 0.000412 Logit Scale: 14.443 Contrastive_loss: 7.2259 (7.4141) Loss: 7.2259 (7.4141)
2024-09-07,10:56:13 | INFO | Train Epoch: 1 [2572288/2572288 (100%)] Data (t): 0.563 Batch (t): 2.614, 1574.96/s, 787.481/s/gpu LR: 0.000406 Logit Scale: 14.478 Contrastive_loss: 7.1533 (7.3815) Loss: 7.1533 (7.3815)
2024-09-07,10:56:16 | INFO | Start epoch 2
2024-09-07,10:56:28 | INFO | Train Epoch: 2 [ 4096/2572288 (0%)] Data (t): 9.651 Batch (t): 11.699, 350.112/s, 175.056/s/gpu LR: 0.000405 Logit Scale: 14.480 Contrastive_loss: 6.9992 (6.9992) Loss: 6.9992 (6.9992)
2024-09-07,11:00:49 | INFO | Train Epoch: 2 [ 413696/2572288 (16%)] Data (t): 0.558 Batch (t): 2.617, 1562.06/s, 781.032/s/gpu LR: 0.000381 Logit Scale: 14.608 Contrastive_loss: 7.1339 (7.0665) Loss: 7.1339 (7.0665)
2024-09-07,11:05:12 | INFO | Train Epoch: 2 [ 823296/2572288 (32%)] Data (t): 0.569 Batch (t): 2.622, 1557.91/s, 778.953/s/gpu LR: 0.000355 Logit Scale: 14.768 Contrastive_loss: 7.0686 (7.0672) Loss: 7.0686 (7.0672)
2024-09-07,11:09:34 | INFO | Train Epoch: 2 [1232896/2572288 (48%)] Data (t): 0.570 Batch (t): 2.623, 1565.95/s, 782.973/s/gpu LR: 0.000327 Logit Scale: 14.891 Contrastive_loss: 6.9274 (7.0323) Loss: 6.9274 (7.0323)
2024-09-07,11:13:56 | INFO | Train Epoch: 2 [1642496/2572288 (64%)] Data (t): 0.569 Batch (t): 2.621, 1563.01/s, 781.503/s/gpu LR: 0.000298 Logit Scale: 15.027 Contrastive_loss: 6.8516 (6.9961) Loss: 6.8516 (6.9961)
2024-09-07,11:18:18 | INFO | Train Epoch: 2 [2052096/2572288 (80%)] Data (t): 0.568 Batch (t): 2.620, 1559.58/s, 779.792/s/gpu LR: 0.000269 Logit Scale: 15.198 Contrastive_loss: 6.9052 (6.9810) Loss: 6.9052 (6.9810)
2024-09-07,11:22:40 | INFO | Train Epoch: 2 [2461696/2572288 (96%)] Data (t): 0.568 Batch (t): 2.621, 1561.78/s, 780.890/s/gpu LR: 0.000239 Logit Scale: 15.340 Contrastive_loss: 6.7375 (6.9462) Loss: 6.7375 (6.9462)
2024-09-07,11:23:51 | INFO | Train Epoch: 2 [2572288/2572288 (100%)] Data (t): 0.567 Batch (t): 2.619, 1575.79/s, 787.897/s/gpu LR: 0.000231 Logit Scale: 15.374 Contrastive_loss: 6.8204 (6.9305) Loss: 6.8204 (6.9305)
2024-09-07,11:23:54 | INFO | Start epoch 3
2024-09-07,11:24:05 | INFO | Train Epoch: 3 [ 4096/2572288 (0%)] Data (t): 9.380 Batch (t): 11.428, 358.426/s, 179.213/s/gpu LR: 0.000231 Logit Scale: 15.375 Contrastive_loss: 6.6847 (6.6847) Loss: 6.6847 (6.6847)
2024-09-07,11:28:26 | INFO | Train Epoch: 3 [ 413696/2572288 (16%)] Data (t): 0.552 Batch (t): 2.613, 1562.80/s, 781.399/s/gpu LR: 0.000202 Logit Scale: 15.524 Contrastive_loss: 6.5905 (6.6376) Loss: 6.5905 (6.6376)
2024-09-07,11:32:48 | INFO | Train Epoch: 3 [ 823296/2572288 (32%)] Data (t): 0.564 Batch (t): 2.617, 1563.18/s, 781.589/s/gpu LR: 0.000173 Logit Scale: 15.666 Contrastive_loss: 6.5036 (6.5929) Loss: 6.5036 (6.5929)
2024-09-07,11:37:10 | INFO | Train Epoch: 3 [1232896/2572288 (48%)] Data (t): 0.568 Batch (t): 2.620, 1562.99/s, 781.497/s/gpu LR: 0.000145 Logit Scale: 15.784 Contrastive_loss: 6.2833 (6.5155) Loss: 6.2833 (6.5155)
2024-09-07,11:41:32 | INFO | Train Epoch: 3 [1642496/2572288 (64%)] Data (t): 0.565 Batch (t): 2.618, 1568.86/s, 784.430/s/gpu LR: 0.000119 Logit Scale: 15.895 Contrastive_loss: 6.2988 (6.4722) Loss: 6.2988 (6.4722)
2024-09-07,11:45:54 | INFO | Train Epoch: 3 [2052096/2572288 (80%)] Data (t): 0.564 Batch (t): 2.617, 1566.40/s, 783.198/s/gpu LR: 0.000095 Logit Scale: 16.002 Contrastive_loss: 6.3952 (6.4594) Loss: 6.3952 (6.4594)
2024-09-07,11:50:16 | INFO | Train Epoch: 3 [2461696/2572288 (96%)] Data (t): 0.566 Batch (t): 2.619, 1568.68/s, 784.342/s/gpu LR: 0.000072 Logit Scale: 16.096 Contrastive_loss: 6.1727 (6.4184) Loss: 6.1727 (6.4184)
2024-09-07,11:51:26 | INFO | Train Epoch: 3 [2572288/2572288 (100%)] Data (t): 0.567 Batch (t): 2.619, 1575.88/s, 787.941/s/gpu LR: 0.000067 Logit Scale: 16.118 Contrastive_loss: 6.3202 (6.4061) Loss: 6.3202 (6.4061)
2024-09-07,11:51:29 | INFO | Start epoch 4
2024-09-07,11:51:41 | INFO | Train Epoch: 4 [ 4096/2572288 (0%)] Data (t): 9.530 Batch (t): 11.577, 353.813/s, 176.906/s/gpu LR: 0.000067 Logit Scale: 16.118 Contrastive_loss: 6.2788 (6.2788) Loss: 6.2788 (6.2788)
2024-09-07,11:56:02 | INFO | Train Epoch: 4 [ 413696/2572288 (16%)] Data (t): 0.554 Batch (t): 2.615, 1562.67/s, 781.334/s/gpu LR: 0.000048 Logit Scale: 16.177 Contrastive_loss: 6.4553 (6.3671) Loss: 6.4553 (6.3671)
2024-09-07,12:00:24 | INFO | Train Epoch: 4 [ 823296/2572288 (32%)] Data (t): 0.563 Batch (t): 2.616, 1563.44/s, 781.721/s/gpu LR: 0.000032 Logit Scale: 16.220 Contrastive_loss: 6.3225 (6.3522) Loss: 6.3225 (6.3522)
2024-09-07,12:04:45 | INFO | Train Epoch: 4 [1232896/2572288 (48%)] Data (t): 0.563 Batch (t): 2.615, 1567.49/s, 783.743/s/gpu LR: 0.000019 Logit Scale: 16.246 Contrastive_loss: 6.2155 (6.3180) Loss: 6.2155 (6.3180)
2024-09-07,12:09:07 | INFO | Train Epoch: 4 [1642496/2572288 (64%)] Data (t): 0.565 Batch (t): 2.619, 1563.51/s, 781.753/s/gpu LR: 0.000009 Logit Scale: 16.261 Contrastive_loss: 6.4286 (6.3401) Loss: 6.4286 (6.3401)
2024-09-07,12:13:29 | INFO | Train Epoch: 4 [2052096/2572288 (80%)] Data (t): 0.563 Batch (t): 2.616, 1565.01/s, 782.504/s/gpu LR: 0.000003 Logit Scale: 16.267 Contrastive_loss: 5.7948 (6.2493) Loss: 5.7948 (6.2493)
2024-09-07,12:17:51 | INFO | Train Epoch: 4 [2461696/2572288 (96%)] Data (t): 0.564 Batch (t): 2.618, 1563.70/s, 781.850/s/gpu LR: 0.000000 Logit Scale: 16.268 Contrastive_loss: 6.4187 (6.2735) Loss: 6.4187 (6.2735)
2024-09-07,12:19:01 | INFO | Train Epoch: 4 [2572288/2572288 (100%)] Data (t): 0.563 Batch (t): 2.615, 1572.01/s, 786.003/s/gpu LR: 0.000000 Logit Scale: 16.268 Contrastive_loss: 6.3365 (6.2813) Loss: 6.3365 (6.2813)
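For context on the loss values: with a global batch of 4096 (2048 x 2 GPUs, gathered across processes), each sample is classified against 4096 candidates, so an untrained model should sit near ln(4096) ≈ 8.318 — consistent with the first logged Contrastive_loss of 8.3788. A minimal pure-Python sketch of the symmetric contrastive (InfoNCE) objective CLIP optimizes (small N for speed; an illustration, not open_clip's actual implementation):

```python
import math

def clip_contrastive_loss(logits):
    """Symmetric InfoNCE: mean of image->text and text->image cross-entropy,
    with the matching pair on the diagonal (row i pairs with column i)."""
    n = len(logits)
    def xent(rows):
        total = 0.0
        for i, row in enumerate(rows):
            m = max(row)  # max-subtraction for numerical stability
            log_z = m + math.log(sum(math.exp(v - m) for v in row))
            total += log_z - row[i]
        return total / n
    cols = [list(c) for c in zip(*logits)]  # transpose: text->image direction
    return 0.5 * (xent(logits) + xent(cols))

# Uninformative (all-equal) similarities give chance-level loss ln(N):
n = 8
print(round(clip_contrastive_loss([[0.0] * n for _ in range(n)]), 4))  # 2.0794 = ln(8)
print(round(math.log(4096), 4))  # 8.3178, near the first logged loss of 8.3788
```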