snats committed on
Commit cf6f528 · 1 Parent(s): 3e7991a

added more trained models

breaking_0.2_trained/20_most_difficult/checkpoints/epoch_5.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:677af8c29c8be40887616015bf5d8d41ab562351cf71f037f2651f6e8b45d77f
+ size 1815701601
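
For reference, these three lines are a Git LFS pointer: the actual ~1.8 GB checkpoint is stored out-of-band, and only its sha256 and byte size live in the repository. A minimal sketch (not part of this commit; the local filename is hypothetical) of verifying a downloaded checkpoint against the pointer's oid:

import hashlib

def sha256_of(path: str, chunk_size: int = 1 << 20) -> str:
    """Stream the file so multi-GB checkpoints don't need to fit in RAM."""
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            h.update(chunk)
    return h.hexdigest()

# oid copied from the pointer above
expected = "677af8c29c8be40887616015bf5d8d41ab562351cf71f037f2651f6e8b45d77f"
assert sha256_of("epoch_5.pt") == expected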
breaking_0.2_trained/20_most_difficult/checkpoints/epoch_latest.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:448f8f84352a9c8f41df8aa124708898de3a777613e59dbc9a594befb5283d3b
+ size 1815639289
breaking_0.2_trained/20_most_difficult/info.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5c5ec2de9dd3837c0bbfd15d07d911b04aa4588bbc2b6e2182df239784caa842
+ size 321
breaking_0.2_trained/20_most_difficult/out.log ADDED
@@ -0,0 +1,497 @@
+ 2024-09-07,15:01:55 | INFO | No latest resume checkpoint found in /home/breaking_0.2_trained/20_most_difficult/checkpoints.
+ 2024-09-07,15:01:57 | INFO | Running in distributed mode with multiple processes. Device: cuda:0.Process (global: 0, local 0), total 2.
+ 2024-09-07,15:01:57 | INFO | Loaded ViT-B-32 model config.
+ 2024-09-07,15:01:58 | INFO | Model:
+ 2024-09-07,15:01:58 | INFO | CLIP(
+ (visual): VisionTransformer(
+ (patchnorm_pre_ln): Identity()
+ (conv1): Conv2d(3, 768, kernel_size=(32, 32), stride=(32, 32), bias=False)
+ (patch_dropout): Identity()
+ (ln_pre): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+ (transformer): Transformer(
+ (resblocks): ModuleList(
+ (0): ResidualAttentionBlock(
+ (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+ (attn): MultiheadAttention(
+ (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
+ )
+ (ls_1): Identity()
+ (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+ (mlp): Sequential(
+ (c_fc): Linear(in_features=768, out_features=3072, bias=True)
+ (gelu): GELU(approximate='none')
+ (c_proj): Linear(in_features=3072, out_features=768, bias=True)
+ )
+ (ls_2): Identity()
+ )
+ (1): ResidualAttentionBlock(
+ (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+ (attn): MultiheadAttention(
+ (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
+ )
+ (ls_1): Identity()
+ (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+ (mlp): Sequential(
+ (c_fc): Linear(in_features=768, out_features=3072, bias=True)
+ (gelu): GELU(approximate='none')
+ (c_proj): Linear(in_features=3072, out_features=768, bias=True)
+ )
+ (ls_2): Identity()
+ )
+ (2): ResidualAttentionBlock(
+ (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+ (attn): MultiheadAttention(
+ (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
+ )
+ (ls_1): Identity()
+ (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+ (mlp): Sequential(
+ (c_fc): Linear(in_features=768, out_features=3072, bias=True)
+ (gelu): GELU(approximate='none')
+ (c_proj): Linear(in_features=3072, out_features=768, bias=True)
+ )
+ (ls_2): Identity()
+ )
+ (3): ResidualAttentionBlock(
+ (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+ (attn): MultiheadAttention(
+ (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
+ )
+ (ls_1): Identity()
+ (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+ (mlp): Sequential(
+ (c_fc): Linear(in_features=768, out_features=3072, bias=True)
+ (gelu): GELU(approximate='none')
+ (c_proj): Linear(in_features=3072, out_features=768, bias=True)
+ )
+ (ls_2): Identity()
+ )
+ (4): ResidualAttentionBlock(
+ (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+ (attn): MultiheadAttention(
+ (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
+ )
+ (ls_1): Identity()
+ (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+ (mlp): Sequential(
+ (c_fc): Linear(in_features=768, out_features=3072, bias=True)
+ (gelu): GELU(approximate='none')
+ (c_proj): Linear(in_features=3072, out_features=768, bias=True)
+ )
+ (ls_2): Identity()
+ )
+ (5): ResidualAttentionBlock(
+ (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+ (attn): MultiheadAttention(
+ (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
+ )
+ (ls_1): Identity()
+ (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+ (mlp): Sequential(
+ (c_fc): Linear(in_features=768, out_features=3072, bias=True)
+ (gelu): GELU(approximate='none')
+ (c_proj): Linear(in_features=3072, out_features=768, bias=True)
+ )
+ (ls_2): Identity()
+ )
+ (6): ResidualAttentionBlock(
+ (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+ (attn): MultiheadAttention(
+ (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
+ )
+ (ls_1): Identity()
+ (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+ (mlp): Sequential(
+ (c_fc): Linear(in_features=768, out_features=3072, bias=True)
+ (gelu): GELU(approximate='none')
+ (c_proj): Linear(in_features=3072, out_features=768, bias=True)
+ )
+ (ls_2): Identity()
+ )
+ (7): ResidualAttentionBlock(
+ (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+ (attn): MultiheadAttention(
+ (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
+ )
+ (ls_1): Identity()
+ (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+ (mlp): Sequential(
+ (c_fc): Linear(in_features=768, out_features=3072, bias=True)
+ (gelu): GELU(approximate='none')
+ (c_proj): Linear(in_features=3072, out_features=768, bias=True)
+ )
+ (ls_2): Identity()
+ )
+ (8): ResidualAttentionBlock(
+ (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+ (attn): MultiheadAttention(
+ (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
+ )
+ (ls_1): Identity()
+ (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+ (mlp): Sequential(
+ (c_fc): Linear(in_features=768, out_features=3072, bias=True)
+ (gelu): GELU(approximate='none')
+ (c_proj): Linear(in_features=3072, out_features=768, bias=True)
+ )
+ (ls_2): Identity()
+ )
+ (9): ResidualAttentionBlock(
+ (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+ (attn): MultiheadAttention(
+ (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
+ )
+ (ls_1): Identity()
+ (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+ (mlp): Sequential(
+ (c_fc): Linear(in_features=768, out_features=3072, bias=True)
+ (gelu): GELU(approximate='none')
+ (c_proj): Linear(in_features=3072, out_features=768, bias=True)
+ )
+ (ls_2): Identity()
+ )
+ (10): ResidualAttentionBlock(
+ (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+ (attn): MultiheadAttention(
+ (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
+ )
+ (ls_1): Identity()
+ (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+ (mlp): Sequential(
+ (c_fc): Linear(in_features=768, out_features=3072, bias=True)
+ (gelu): GELU(approximate='none')
+ (c_proj): Linear(in_features=3072, out_features=768, bias=True)
+ )
+ (ls_2): Identity()
+ )
+ (11): ResidualAttentionBlock(
+ (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+ (attn): MultiheadAttention(
+ (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
+ )
+ (ls_1): Identity()
+ (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+ (mlp): Sequential(
+ (c_fc): Linear(in_features=768, out_features=3072, bias=True)
+ (gelu): GELU(approximate='none')
+ (c_proj): Linear(in_features=3072, out_features=768, bias=True)
+ )
+ (ls_2): Identity()
+ )
+ )
+ )
+ (ln_post): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+ )
+ (transformer): Transformer(
+ (resblocks): ModuleList(
+ (0): ResidualAttentionBlock(
+ (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
+ (attn): MultiheadAttention(
+ (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
+ )
+ (ls_1): Identity()
+ (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
+ (mlp): Sequential(
+ (c_fc): Linear(in_features=512, out_features=2048, bias=True)
+ (gelu): GELU(approximate='none')
+ (c_proj): Linear(in_features=2048, out_features=512, bias=True)
+ )
+ (ls_2): Identity()
+ )
+ (1): ResidualAttentionBlock(
+ (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
+ (attn): MultiheadAttention(
+ (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
+ )
+ (ls_1): Identity()
+ (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
+ (mlp): Sequential(
+ (c_fc): Linear(in_features=512, out_features=2048, bias=True)
+ (gelu): GELU(approximate='none')
+ (c_proj): Linear(in_features=2048, out_features=512, bias=True)
+ )
+ (ls_2): Identity()
+ )
+ (2): ResidualAttentionBlock(
+ (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
+ (attn): MultiheadAttention(
+ (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
+ )
+ (ls_1): Identity()
+ (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
+ (mlp): Sequential(
+ (c_fc): Linear(in_features=512, out_features=2048, bias=True)
+ (gelu): GELU(approximate='none')
+ (c_proj): Linear(in_features=2048, out_features=512, bias=True)
+ )
+ (ls_2): Identity()
+ )
+ (3): ResidualAttentionBlock(
+ (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
+ (attn): MultiheadAttention(
+ (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
+ )
+ (ls_1): Identity()
+ (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
+ (mlp): Sequential(
+ (c_fc): Linear(in_features=512, out_features=2048, bias=True)
+ (gelu): GELU(approximate='none')
+ (c_proj): Linear(in_features=2048, out_features=512, bias=True)
+ )
+ (ls_2): Identity()
+ )
+ (4): ResidualAttentionBlock(
+ (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
+ (attn): MultiheadAttention(
+ (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
+ )
+ (ls_1): Identity()
+ (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
+ (mlp): Sequential(
+ (c_fc): Linear(in_features=512, out_features=2048, bias=True)
+ (gelu): GELU(approximate='none')
+ (c_proj): Linear(in_features=2048, out_features=512, bias=True)
+ )
+ (ls_2): Identity()
+ )
+ (5): ResidualAttentionBlock(
+ (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
+ (attn): MultiheadAttention(
+ (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
+ )
+ (ls_1): Identity()
+ (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
+ (mlp): Sequential(
+ (c_fc): Linear(in_features=512, out_features=2048, bias=True)
+ (gelu): GELU(approximate='none')
+ (c_proj): Linear(in_features=2048, out_features=512, bias=True)
+ )
+ (ls_2): Identity()
+ )
+ (6): ResidualAttentionBlock(
+ (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
+ (attn): MultiheadAttention(
+ (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
+ )
+ (ls_1): Identity()
+ (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
+ (mlp): Sequential(
+ (c_fc): Linear(in_features=512, out_features=2048, bias=True)
+ (gelu): GELU(approximate='none')
+ (c_proj): Linear(in_features=2048, out_features=512, bias=True)
+ )
+ (ls_2): Identity()
+ )
+ (7): ResidualAttentionBlock(
+ (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
+ (attn): MultiheadAttention(
+ (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
+ )
+ (ls_1): Identity()
+ (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
+ (mlp): Sequential(
+ (c_fc): Linear(in_features=512, out_features=2048, bias=True)
+ (gelu): GELU(approximate='none')
+ (c_proj): Linear(in_features=2048, out_features=512, bias=True)
+ )
+ (ls_2): Identity()
+ )
+ (8): ResidualAttentionBlock(
+ (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
+ (attn): MultiheadAttention(
+ (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
+ )
+ (ls_1): Identity()
+ (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
+ (mlp): Sequential(
+ (c_fc): Linear(in_features=512, out_features=2048, bias=True)
+ (gelu): GELU(approximate='none')
+ (c_proj): Linear(in_features=2048, out_features=512, bias=True)
+ )
+ (ls_2): Identity()
+ )
+ (9): ResidualAttentionBlock(
+ (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
+ (attn): MultiheadAttention(
+ (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
+ )
+ (ls_1): Identity()
+ (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
+ (mlp): Sequential(
+ (c_fc): Linear(in_features=512, out_features=2048, bias=True)
+ (gelu): GELU(approximate='none')
+ (c_proj): Linear(in_features=2048, out_features=512, bias=True)
+ )
+ (ls_2): Identity()
+ )
+ (10): ResidualAttentionBlock(
+ (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
+ (attn): MultiheadAttention(
+ (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
+ )
+ (ls_1): Identity()
+ (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
+ (mlp): Sequential(
+ (c_fc): Linear(in_features=512, out_features=2048, bias=True)
+ (gelu): GELU(approximate='none')
+ (c_proj): Linear(in_features=2048, out_features=512, bias=True)
+ )
+ (ls_2): Identity()
+ )
+ (11): ResidualAttentionBlock(
+ (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
+ (attn): MultiheadAttention(
+ (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
+ )
+ (ls_1): Identity()
+ (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
+ (mlp): Sequential(
+ (c_fc): Linear(in_features=512, out_features=2048, bias=True)
+ (gelu): GELU(approximate='none')
+ (c_proj): Linear(in_features=2048, out_features=512, bias=True)
+ )
+ (ls_2): Identity()
+ )
+ )
+ )
+ (token_embedding): Embedding(49408, 512)
+ (ln_final): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
+ )
+ 2024-09-07,15:01:58 | INFO | Params:
+ 2024-09-07,15:01:58 | INFO | accum_freq: 1
+ 2024-09-07,15:01:58 | INFO | aug_cfg: {}
+ 2024-09-07,15:01:58 | INFO | batch_size: 2048
+ 2024-09-07,15:01:58 | INFO | beta1: 0.9
+ 2024-09-07,15:01:58 | INFO | beta2: 0.98
+ 2024-09-07,15:01:58 | INFO | checkpoint_path: /home/breaking_0.2_trained/20_most_difficult/checkpoints
+ 2024-09-07,15:01:58 | INFO | coca_caption_loss_weight: 2.0
+ 2024-09-07,15:01:58 | INFO | coca_contrastive_loss_weight: 1.0
+ 2024-09-07,15:01:58 | INFO | copy_codebase: False
+ 2024-09-07,15:01:58 | INFO | csv_caption_key: title
+ 2024-09-07,15:01:58 | INFO | csv_img_key: filepath
+ 2024-09-07,15:01:58 | INFO | csv_separator:
+ 2024-09-07,15:01:58 | INFO | dataset_resampled: True
+ 2024-09-07,15:01:58 | INFO | dataset_type: webdataset
+ 2024-09-07,15:01:58 | INFO | ddp_static_graph: True
+ 2024-09-07,15:01:58 | INFO | debug: False
+ 2024-09-07,15:01:58 | INFO | delete_previous_checkpoint: False
+ 2024-09-07,15:01:58 | INFO | device: cuda:0
+ 2024-09-07,15:01:58 | INFO | dist_backend: nccl
+ 2024-09-07,15:01:58 | INFO | dist_url: env://
+ 2024-09-07,15:01:58 | INFO | distill: False
+ 2024-09-07,15:01:58 | INFO | distill_model: None
+ 2024-09-07,15:01:58 | INFO | distill_pretrained: None
+ 2024-09-07,15:01:58 | INFO | distributed: True
+ 2024-09-07,15:01:58 | INFO | epochs: 5
+ 2024-09-07,15:01:58 | INFO | epochs_cooldown: None
+ 2024-09-07,15:01:58 | INFO | eps: 1e-06
+ 2024-09-07,15:01:58 | INFO | force_custom_text: False
+ 2024-09-07,15:01:58 | INFO | force_image_size: None
+ 2024-09-07,15:01:58 | INFO | force_patch_dropout: None
+ 2024-09-07,15:01:58 | INFO | force_quick_gelu: False
+ 2024-09-07,15:01:58 | INFO | gather_with_grad: True
+ 2024-09-07,15:01:58 | INFO | grad_checkpointing: True
+ 2024-09-07,15:01:58 | INFO | grad_clip_norm: None
+ 2024-09-07,15:01:58 | INFO | horovod: False
+ 2024-09-07,15:01:58 | INFO | image_mean: None
+ 2024-09-07,15:01:58 | INFO | image_std: None
+ 2024-09-07,15:01:58 | INFO | imagenet_v2: None
+ 2024-09-07,15:01:58 | INFO | imagenet_val: None
+ 2024-09-07,15:01:58 | INFO | local_loss: True
+ 2024-09-07,15:01:58 | INFO | local_rank: 0
+ 2024-09-07,15:01:58 | INFO | lock_image: False
+ 2024-09-07,15:01:58 | INFO | lock_image_freeze_bn_stats: False
+ 2024-09-07,15:01:58 | INFO | lock_image_unlocked_groups: 0
+ 2024-09-07,15:01:58 | INFO | lock_text: False
+ 2024-09-07,15:01:58 | INFO | lock_text_freeze_layer_norm: False
+ 2024-09-07,15:01:58 | INFO | lock_text_unlocked_layers: 0
+ 2024-09-07,15:01:58 | INFO | log_every_n_steps: 100
+ 2024-09-07,15:01:58 | INFO | log_level: 20
+ 2024-09-07,15:01:58 | INFO | log_local: False
+ 2024-09-07,15:01:58 | INFO | log_path: /home/breaking_0.2_trained/20_most_difficult/out.log
+ 2024-09-07,15:01:58 | INFO | logs: /home/breaking_0.2_trained
+ 2024-09-07,15:01:58 | INFO | lr: 0.0005
+ 2024-09-07,15:01:58 | INFO | lr_cooldown_end: 0.0
+ 2024-09-07,15:01:58 | INFO | lr_cooldown_power: 1.0
+ 2024-09-07,15:01:58 | INFO | lr_scheduler: cosine
+ 2024-09-07,15:01:58 | INFO | model: ViT-B-32
+ 2024-09-07,15:01:58 | INFO | name: 20_most_difficult
+ 2024-09-07,15:01:58 | INFO | no_set_device_rank: False
+ 2024-09-07,15:01:58 | INFO | precision: amp
+ 2024-09-07,15:01:58 | INFO | pretrained:
+ 2024-09-07,15:01:58 | INFO | pretrained_image: False
+ 2024-09-07,15:01:58 | INFO | rank: 0
+ 2024-09-07,15:01:58 | INFO | remote_sync: None
+ 2024-09-07,15:01:58 | INFO | remote_sync_frequency: 300
+ 2024-09-07,15:01:58 | INFO | remote_sync_protocol: s3
+ 2024-09-07,15:01:58 | INFO | report_to: wandb
+ 2024-09-07,15:01:58 | INFO | resume: None
+ 2024-09-07,15:01:58 | INFO | save_frequency: 0
+ 2024-09-07,15:01:58 | INFO | save_most_recent: True
+ 2024-09-07,15:01:58 | INFO | seed: 0
+ 2024-09-07,15:01:58 | INFO | skip_scheduler: False
+ 2024-09-07,15:01:58 | INFO | tensorboard: False
+ 2024-09-07,15:01:58 | INFO | tensorboard_path:
+ 2024-09-07,15:01:58 | INFO | torchscript: False
+ 2024-09-07,15:01:58 | INFO | trace: False
+ 2024-09-07,15:01:58 | INFO | train_data: /home/breaking_0.2/{00000000..00000255}.tar
+ 2024-09-07,15:01:58 | INFO | train_data_upsampling_factors: None
+ 2024-09-07,15:01:58 | INFO | train_num_samples: 2560000
+ 2024-09-07,15:01:58 | INFO | use_bn_sync: False
+ 2024-09-07,15:01:58 | INFO | val_data: None
+ 2024-09-07,15:01:58 | INFO | val_frequency: 1
+ 2024-09-07,15:01:58 | INFO | val_num_samples: None
+ 2024-09-07,15:01:58 | INFO | wandb: True
+ 2024-09-07,15:01:58 | INFO | wandb_notes:
+ 2024-09-07,15:01:58 | INFO | wandb_project_name: clip_text_hq_clusters
+ 2024-09-07,15:01:58 | INFO | warmup: 500
+ 2024-09-07,15:01:58 | INFO | wd: 0.2
+ 2024-09-07,15:01:58 | INFO | workers: 4
+ 2024-09-07,15:01:58 | INFO | world_size: 2
+ 2024-09-07,15:01:58 | INFO | zeroshot_frequency: 2
+ 2024-09-07,15:02:05 | INFO | Start epoch 0
+ 2024-09-07,15:02:21 | INFO | Train Epoch: 0 [ 4096/2572288 (0%)] Data (t): 11.718 Batch (t): 16.351, 250.499/s, 125.250/s/gpu LR: 0.000001 Logit Scale: 14.286 Contrastive_loss: 8.3781 (8.3781) Loss: 8.3781 (8.3781)
+ 2024-09-07,15:02:24 | INFO | Reducer buckets have been rebuilt in this iteration.
+ 2024-09-07,15:06:41 | INFO | Train Epoch: 0 [ 413696/2572288 (16%)] Data (t): 0.550 Batch (t): 2.603, 1570.18/s, 785.090/s/gpu LR: 0.000101 Logit Scale: 14.268 Contrastive_loss: 8.2138 (8.2960) Loss: 8.2138 (8.2960)
+ 2024-09-07,15:11:03 | INFO | Train Epoch: 0 [ 823296/2572288 (32%)] Data (t): 0.571 Batch (t): 2.618, 1567.20/s, 783.598/s/gpu LR: 0.000201 Logit Scale: 14.239 Contrastive_loss: 8.0085 (8.2002) Loss: 8.0085 (8.2002)
+ 2024-09-07,15:15:25 | INFO | Train Epoch: 0 [1232896/2572288 (48%)] Data (t): 0.570 Batch (t): 2.619, 1562.40/s, 781.198/s/gpu LR: 0.000301 Logit Scale: 14.215 Contrastive_loss: 7.9885 (8.1472) Loss: 7.9885 (8.1472)
+ 2024-09-07,15:19:47 | INFO | Train Epoch: 0 [1642496/2572288 (64%)] Data (t): 0.572 Batch (t): 2.621, 1569.02/s, 784.509/s/gpu LR: 0.000401 Logit Scale: 14.186 Contrastive_loss: 7.8949 (8.0968) Loss: 7.8949 (8.0968)
+ 2024-09-07,15:24:09 | INFO | Train Epoch: 0 [2052096/2572288 (80%)] Data (t): 0.574 Batch (t): 2.623, 1559.36/s, 779.679/s/gpu LR: 0.000500 Logit Scale: 14.166 Contrastive_loss: 7.7856 (8.0449) Loss: 7.7856 (8.0449)
+ 2024-09-07,15:28:32 | INFO | Train Epoch: 0 [2461696/2572288 (96%)] Data (t): 0.571 Batch (t): 2.621, 1563.18/s, 781.592/s/gpu LR: 0.000498 Logit Scale: 14.163 Contrastive_loss: 7.7069 (7.9966) Loss: 7.7069 (7.9966)
+ 2024-09-07,15:29:42 | INFO | Train Epoch: 0 [2572288/2572288 (100%)] Data (t): 0.568 Batch (t): 2.617, 1567.22/s, 783.609/s/gpu LR: 0.000497 Logit Scale: 14.165 Contrastive_loss: 7.6217 (7.9498) Loss: 7.6217 (7.9498)
+ 2024-09-07,15:29:45 | INFO | Start epoch 1
+ 2024-09-07,15:29:56 | INFO | Train Epoch: 1 [ 4096/2572288 (0%)] Data (t): 9.731 Batch (t): 11.775, 347.851/s, 173.926/s/gpu LR: 0.000497 Logit Scale: 14.166 Contrastive_loss: 7.7216 (7.7216) Loss: 7.7216 (7.7216)
+ 2024-09-07,15:34:16 | INFO | Train Epoch: 1 [ 413696/2572288 (16%)] Data (t): 0.536 Batch (t): 2.594, 1568.09/s, 784.043/s/gpu LR: 0.000491 Logit Scale: 14.175 Contrastive_loss: 7.4182 (7.5699) Loss: 7.4182 (7.5699)
+ 2024-09-07,15:38:37 | INFO | Train Epoch: 1 [ 823296/2572288 (32%)] Data (t): 0.564 Batch (t): 2.615, 1567.48/s, 783.738/s/gpu LR: 0.000481 Logit Scale: 14.196 Contrastive_loss: 7.2808 (7.4736) Loss: 7.2808 (7.4736)
+ 2024-09-07,15:42:59 | INFO | Train Epoch: 1 [1232896/2572288 (48%)] Data (t): 0.568 Batch (t): 2.619, 1562.06/s, 781.028/s/gpu LR: 0.000468 Logit Scale: 14.261 Contrastive_loss: 7.1869 (7.4019) Loss: 7.1869 (7.4019)
+ 2024-09-07,15:47:21 | INFO | Train Epoch: 1 [1642496/2572288 (64%)] Data (t): 0.569 Batch (t): 2.620, 1557.55/s, 778.773/s/gpu LR: 0.000452 Logit Scale: 14.318 Contrastive_loss: 7.4643 (7.4144) Loss: 7.4643 (7.4144)
+ 2024-09-07,15:51:44 | INFO | Train Epoch: 1 [2052096/2572288 (80%)] Data (t): 0.571 Batch (t): 2.623, 1564.56/s, 782.281/s/gpu LR: 0.000433 Logit Scale: 14.381 Contrastive_loss: 7.1883 (7.3767) Loss: 7.1883 (7.3767)
+ 2024-09-07,15:56:06 | INFO | Train Epoch: 1 [2461696/2572288 (96%)] Data (t): 0.567 Batch (t): 2.619, 1567.35/s, 783.676/s/gpu LR: 0.000412 Logit Scale: 14.487 Contrastive_loss: 6.9582 (7.3169) Loss: 6.9582 (7.3169)
+ 2024-09-07,15:57:16 | INFO | Train Epoch: 1 [2572288/2572288 (100%)] Data (t): 0.562 Batch (t): 2.613, 1570.30/s, 785.148/s/gpu LR: 0.000406 Logit Scale: 14.515 Contrastive_loss: 7.2587 (7.3096) Loss: 7.2587 (7.3096)
+ 2024-09-07,15:57:19 | INFO | Start epoch 2
+ 2024-09-07,15:57:30 | INFO | Train Epoch: 2 [ 4096/2572288 (0%)] Data (t): 9.654 Batch (t): 11.700, 350.096/s, 175.048/s/gpu LR: 0.000405 Logit Scale: 14.516 Contrastive_loss: 6.3539 (6.3539) Loss: 6.3539 (6.3539)
+ 2024-09-07,16:01:51 | INFO | Train Epoch: 2 [ 413696/2572288 (16%)] Data (t): 0.545 Batch (t): 2.603, 1569.08/s, 784.538/s/gpu LR: 0.000381 Logit Scale: 14.594 Contrastive_loss: 6.7739 (6.5639) Loss: 6.7739 (6.5639)
+ 2024-09-07,16:06:12 | INFO | Train Epoch: 2 [ 823296/2572288 (32%)] Data (t): 0.562 Batch (t): 2.614, 1562.66/s, 781.332/s/gpu LR: 0.000355 Logit Scale: 14.707 Contrastive_loss: 7.1770 (6.7683) Loss: 7.1770 (6.7683)
+ 2024-09-07,16:10:34 | INFO | Train Epoch: 2 [1232896/2572288 (48%)] Data (t): 0.564 Batch (t): 2.616, 1568.04/s, 784.021/s/gpu LR: 0.000327 Logit Scale: 14.817 Contrastive_loss: 5.8889 (6.5484) Loss: 5.8889 (6.5484)
+ 2024-09-07,16:14:55 | INFO | Train Epoch: 2 [1642496/2572288 (64%)] Data (t): 0.565 Batch (t): 2.617, 1552.16/s, 776.082/s/gpu LR: 0.000298 Logit Scale: 14.941 Contrastive_loss: 6.4689 (6.5325) Loss: 6.4689 (6.5325)
+ 2024-09-07,16:19:17 | INFO | Train Epoch: 2 [2052096/2572288 (80%)] Data (t): 0.568 Batch (t): 2.619, 1567.95/s, 783.973/s/gpu LR: 0.000269 Logit Scale: 15.081 Contrastive_loss: 6.7018 (6.5607) Loss: 6.7018 (6.5607)
+ 2024-09-07,16:23:39 | INFO | Train Epoch: 2 [2461696/2572288 (96%)] Data (t): 0.567 Batch (t): 2.619, 1570.34/s, 785.171/s/gpu LR: 0.000239 Logit Scale: 15.253 Contrastive_loss: 5.6558 (6.4315) Loss: 5.6558 (6.4315)
+ 2024-09-07,16:24:50 | INFO | Train Epoch: 2 [2572288/2572288 (100%)] Data (t): 0.569 Batch (t): 2.619, 1577.17/s, 788.586/s/gpu LR: 0.000231 Logit Scale: 15.288 Contrastive_loss: 5.9572 (6.3722) Loss: 5.9572 (6.3722)
+ 2024-09-07,16:24:53 | INFO | Start epoch 3
+ 2024-09-07,16:25:04 | INFO | Train Epoch: 3 [ 4096/2572288 (0%)] Data (t): 9.601 Batch (t): 11.647, 351.669/s, 175.834/s/gpu LR: 0.000231 Logit Scale: 15.289 Contrastive_loss: 5.2771 (5.2771) Loss: 5.2771 (5.2771)
+ 2024-09-07,16:29:25 | INFO | Train Epoch: 3 [ 413696/2572288 (16%)] Data (t): 0.545 Batch (t): 2.604, 1568.38/s, 784.188/s/gpu LR: 0.000202 Logit Scale: 15.412 Contrastive_loss: 5.9988 (5.6380) Loss: 5.9988 (5.6380)
+ 2024-09-07,16:33:47 | INFO | Train Epoch: 3 [ 823296/2572288 (32%)] Data (t): 0.566 Batch (t): 2.618, 1560.75/s, 780.377/s/gpu LR: 0.000173 Logit Scale: 15.521 Contrastive_loss: 5.4016 (5.5592) Loss: 5.4016 (5.5592)
+ 2024-09-07,16:38:09 | INFO | Train Epoch: 3 [1232896/2572288 (48%)] Data (t): 0.570 Batch (t): 2.623, 1564.46/s, 782.230/s/gpu LR: 0.000145 Logit Scale: 15.619 Contrastive_loss: 5.8930 (5.6426) Loss: 5.8930 (5.6426)
+ 2024-09-07,16:42:31 | INFO | Train Epoch: 3 [1642496/2572288 (64%)] Data (t): 0.571 Batch (t): 2.624, 1563.74/s, 781.871/s/gpu LR: 0.000119 Logit Scale: 15.715 Contrastive_loss: 5.6896 (5.6520) Loss: 5.6896 (5.6520)
+ 2024-09-07,16:46:53 | INFO | Train Epoch: 3 [2052096/2572288 (80%)] Data (t): 0.570 Batch (t): 2.622, 1563.71/s, 781.856/s/gpu LR: 0.000095 Logit Scale: 15.825 Contrastive_loss: 4.6158 (5.4793) Loss: 4.6158 (5.4793)
+ 2024-09-07,16:51:16 | INFO | Train Epoch: 3 [2461696/2572288 (96%)] Data (t): 0.569 Batch (t): 2.621, 1568.02/s, 784.009/s/gpu LR: 0.000072 Logit Scale: 15.916 Contrastive_loss: 7.0717 (5.7068) Loss: 7.0717 (5.7068)
+ 2024-09-07,16:52:26 | INFO | Train Epoch: 3 [2572288/2572288 (100%)] Data (t): 0.568 Batch (t): 2.619, 1574.39/s, 787.197/s/gpu LR: 0.000067 Logit Scale: 15.930 Contrastive_loss: 4.7214 (5.5836) Loss: 4.7214 (5.5836)
+ 2024-09-07,16:52:29 | INFO | Start epoch 4
+ 2024-09-07,16:52:41 | INFO | Train Epoch: 4 [ 4096/2572288 (0%)] Data (t): 9.741 Batch (t): 11.786, 347.518/s, 173.759/s/gpu LR: 0.000067 Logit Scale: 15.931 Contrastive_loss: 5.0148 (5.0148) Loss: 5.0148 (5.0148)
+ 2024-09-07,16:57:02 | INFO | Train Epoch: 4 [ 413696/2572288 (16%)] Data (t): 0.547 Batch (t): 2.608, 1560.72/s, 780.360/s/gpu LR: 0.000048 Logit Scale: 16.003 Contrastive_loss: 4.1838 (4.5993) Loss: 4.1838 (4.5993)
+ 2024-09-07,17:01:23 | INFO | Train Epoch: 4 [ 823296/2572288 (32%)] Data (t): 0.565 Batch (t): 2.618, 1567.23/s, 783.616/s/gpu LR: 0.000032 Logit Scale: 16.047 Contrastive_loss: 6.0231 (5.0739) Loss: 6.0231 (5.0739)
+ 2024-09-07,17:05:45 | INFO | Train Epoch: 4 [1232896/2572288 (48%)] Data (t): 0.567 Batch (t): 2.619, 1568.47/s, 784.237/s/gpu LR: 0.000019 Logit Scale: 16.070 Contrastive_loss: 3.8941 (4.7790) Loss: 3.8941 (4.7790)
+ 2024-09-07,17:10:07 | INFO | Train Epoch: 4 [1642496/2572288 (64%)] Data (t): 0.569 Batch (t): 2.622, 1563.62/s, 781.809/s/gpu LR: 0.000009 Logit Scale: 16.084 Contrastive_loss: 4.7862 (4.7804) Loss: 4.7862 (4.7804)
+ 2024-09-07,17:14:29 | INFO | Train Epoch: 4 [2052096/2572288 (80%)] Data (t): 0.561 Batch (t): 2.613, 1568.88/s, 784.442/s/gpu LR: 0.000003 Logit Scale: 16.091 Contrastive_loss: 4.5866 (4.7481) Loss: 4.5866 (4.7481)
+ 2024-09-07,17:18:51 | INFO | Train Epoch: 4 [2461696/2572288 (96%)] Data (t): 0.566 Batch (t): 2.618, 1566.74/s, 783.371/s/gpu LR: 0.000000 Logit Scale: 16.092 Contrastive_loss: 4.5607 (4.7213) Loss: 4.5607 (4.7213)
+ 2024-09-07,17:20:01 | INFO | Train Epoch: 4 [2572288/2572288 (100%)] Data (t): 0.563 Batch (t): 2.615, 1575.18/s, 787.591/s/gpu LR: 0.000000 Logit Scale: 16.092 Contrastive_loss: 4.8199 (4.7336) Loss: 4.8199 (4.7336)
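
Each "Train Epoch" line above interleaves throughput, learning rate, logit scale, and the contrastive loss, both instantaneous and running-average (in parentheses). A minimal sketch (not part of this commit) of recovering the loss curve from such a log with a regex:

import re

STEP = re.compile(
    r"Train Epoch: (\d+) \[\s*(\d+)/\d+.*?"
    r"Contrastive_loss: ([\d.]+) \(([\d.]+)\)"
)

def parse_losses(path):
    """Return (epoch, samples_seen, loss, running_avg) per logged step."""
    points = []
    with open(path) as f:
        for line in f:
            m = STEP.search(line)
            if m:
                e, n, loss, avg = m.groups()
                points.append((int(e), int(n), float(loss), float(avg)))
    return points

# e.g. parse_losses("out.log")[-1] == (4, 2572288, 4.8199, 4.7336)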
breaking_0.2_trained/20_most_difficult/params.txt ADDED
@@ -0,0 +1,91 @@
+ accum_freq: 1
+ aug_cfg: {}
+ batch_size: 2048
+ beta1: 0.9
+ beta2: 0.98
+ checkpoint_path: /home/breaking_0.2_trained/20_most_difficult/checkpoints
+ coca_caption_loss_weight: 2.0
+ coca_contrastive_loss_weight: 1.0
+ copy_codebase: False
+ csv_caption_key: title
+ csv_img_key: filepath
+ csv_separator:
+ dataset_resampled: True
+ dataset_type: webdataset
+ ddp_static_graph: True
+ debug: False
+ delete_previous_checkpoint: False
+ device: cuda:0
+ dist_backend: nccl
+ dist_url: env://
+ distill: False
+ distill_model: None
+ distill_pretrained: None
+ distributed: True
+ epochs: 5
+ epochs_cooldown: None
+ eps: 1e-06
+ force_custom_text: False
+ force_image_size: None
+ force_patch_dropout: None
+ force_quick_gelu: False
+ gather_with_grad: True
+ grad_checkpointing: True
+ grad_clip_norm: None
+ horovod: False
+ image_mean: None
+ image_std: None
+ imagenet_v2: None
+ imagenet_val: None
+ local_loss: True
+ local_rank: 0
+ lock_image: False
+ lock_image_freeze_bn_stats: False
+ lock_image_unlocked_groups: 0
+ lock_text: False
+ lock_text_freeze_layer_norm: False
+ lock_text_unlocked_layers: 0
+ log_every_n_steps: 100
+ log_level: 20
+ log_local: False
+ log_path: /home/breaking_0.2_trained/20_most_difficult/out.log
+ logs: /home/breaking_0.2_trained
+ lr: 0.0005
+ lr_cooldown_end: 0.0
+ lr_cooldown_power: 1.0
+ lr_scheduler: cosine
+ model: ViT-B-32
+ name: 20_most_difficult
+ no_set_device_rank: False
+ precision: amp
+ pretrained:
+ pretrained_image: False
+ rank: 0
+ remote_sync: None
+ remote_sync_frequency: 300
+ remote_sync_protocol: s3
+ report_to: wandb
+ resume: None
+ save_frequency: 0
+ save_most_recent: True
+ seed: 0
+ skip_scheduler: False
+ tensorboard: False
+ tensorboard_path:
+ torchscript: False
+ trace: False
+ train_data: /home/breaking_0.2/{00000000..00000255}.tar
+ train_data_upsampling_factors: None
+ train_num_samples: 2560000
+ use_bn_sync: False
+ val_data: None
+ val_frequency: 1
+ val_num_samples: None
+ wandb: True
+ wandb_notes:
+ wandb_project_name: clip_text_hq_clusters
+ warmup: 500
+ wd: 0.2
+ workers: 4
+ world_size: 2
+ zeroshot_frequency: 2
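
params.txt records an open_clip ViT-B-32 run, so the checkpoints above should load through open_clip's standard entry point, which accepts a local checkpoint path as `pretrained`. A minimal sketch (an assumption about usage, not part of this commit; the relative path is illustrative):

import torch
import open_clip

model, _, preprocess = open_clip.create_model_and_transforms(
    "ViT-B-32",  # matches `model: ViT-B-32` in params.txt
    pretrained="breaking_0.2_trained/20_most_difficult/checkpoints/epoch_5.pt",
)
tokenizer = open_clip.get_tokenizer("ViT-B-32")
model.eval()

with torch.no_grad():
    text_features = model.encode_text(tokenizer(["a photo of a dog"]))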
breaking_0.3_trained/30_most_difficult/checkpoints/epoch_5.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a3f0e909e344a50a759f490b53fbaf3ab0bb0fb2364fd4cec2500135d0a4c9b3
+ size 1815701601
breaking_0.3_trained/30_most_difficult/checkpoints/epoch_latest.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:34575a2cda748dc2c29bba7ac0443fa8015e858659b59c886e3e1b0646d6a3fb
+ size 1815639289
breaking_0.3_trained/30_most_difficult/info.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2e3ea02871cba186f32fa465005ea43a506242185c4f13a99549cfaec303d622
+ size 321
breaking_0.3_trained/30_most_difficult/out.log ADDED
@@ -0,0 +1,497 @@
+ 2024-09-07,07:15:56 | INFO | No latest resume checkpoint found in /home/breaking_0.3_trained/30_most_difficult/checkpoints.
+ 2024-09-07,07:15:57 | INFO | Running in distributed mode with multiple processes. Device: cuda:0.Process (global: 0, local 0), total 2.
+ 2024-09-07,07:15:57 | INFO | Loaded ViT-B-32 model config.
+ 2024-09-07,07:15:58 | INFO | Model:
+ 2024-09-07,07:15:58 | INFO | CLIP(
+ (visual): VisionTransformer(
+ (patchnorm_pre_ln): Identity()
+ (conv1): Conv2d(3, 768, kernel_size=(32, 32), stride=(32, 32), bias=False)
+ (patch_dropout): Identity()
+ (ln_pre): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+ (transformer): Transformer(
+ (resblocks): ModuleList(
+ (0): ResidualAttentionBlock(
+ (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+ (attn): MultiheadAttention(
+ (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
+ )
+ (ls_1): Identity()
+ (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+ (mlp): Sequential(
+ (c_fc): Linear(in_features=768, out_features=3072, bias=True)
+ (gelu): GELU(approximate='none')
+ (c_proj): Linear(in_features=3072, out_features=768, bias=True)
+ )
+ (ls_2): Identity()
+ )
+ (1): ResidualAttentionBlock(
+ (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+ (attn): MultiheadAttention(
+ (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
+ )
+ (ls_1): Identity()
+ (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+ (mlp): Sequential(
+ (c_fc): Linear(in_features=768, out_features=3072, bias=True)
+ (gelu): GELU(approximate='none')
+ (c_proj): Linear(in_features=3072, out_features=768, bias=True)
+ )
+ (ls_2): Identity()
+ )
+ (2): ResidualAttentionBlock(
+ (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+ (attn): MultiheadAttention(
+ (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
+ )
+ (ls_1): Identity()
+ (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+ (mlp): Sequential(
+ (c_fc): Linear(in_features=768, out_features=3072, bias=True)
+ (gelu): GELU(approximate='none')
+ (c_proj): Linear(in_features=3072, out_features=768, bias=True)
+ )
+ (ls_2): Identity()
+ )
+ (3): ResidualAttentionBlock(
+ (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+ (attn): MultiheadAttention(
+ (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
+ )
+ (ls_1): Identity()
+ (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+ (mlp): Sequential(
+ (c_fc): Linear(in_features=768, out_features=3072, bias=True)
+ (gelu): GELU(approximate='none')
+ (c_proj): Linear(in_features=3072, out_features=768, bias=True)
+ )
+ (ls_2): Identity()
+ )
+ (4): ResidualAttentionBlock(
+ (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+ (attn): MultiheadAttention(
+ (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
+ )
+ (ls_1): Identity()
+ (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+ (mlp): Sequential(
+ (c_fc): Linear(in_features=768, out_features=3072, bias=True)
+ (gelu): GELU(approximate='none')
+ (c_proj): Linear(in_features=3072, out_features=768, bias=True)
+ )
+ (ls_2): Identity()
+ )
+ (5): ResidualAttentionBlock(
+ (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+ (attn): MultiheadAttention(
+ (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
+ )
+ (ls_1): Identity()
+ (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+ (mlp): Sequential(
+ (c_fc): Linear(in_features=768, out_features=3072, bias=True)
+ (gelu): GELU(approximate='none')
+ (c_proj): Linear(in_features=3072, out_features=768, bias=True)
+ )
+ (ls_2): Identity()
+ )
+ (6): ResidualAttentionBlock(
+ (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+ (attn): MultiheadAttention(
+ (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
+ )
+ (ls_1): Identity()
+ (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+ (mlp): Sequential(
+ (c_fc): Linear(in_features=768, out_features=3072, bias=True)
+ (gelu): GELU(approximate='none')
+ (c_proj): Linear(in_features=3072, out_features=768, bias=True)
+ )
+ (ls_2): Identity()
+ )
+ (7): ResidualAttentionBlock(
+ (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+ (attn): MultiheadAttention(
+ (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
+ )
+ (ls_1): Identity()
+ (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+ (mlp): Sequential(
+ (c_fc): Linear(in_features=768, out_features=3072, bias=True)
+ (gelu): GELU(approximate='none')
+ (c_proj): Linear(in_features=3072, out_features=768, bias=True)
+ )
+ (ls_2): Identity()
+ )
+ (8): ResidualAttentionBlock(
+ (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+ (attn): MultiheadAttention(
+ (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
+ )
+ (ls_1): Identity()
+ (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+ (mlp): Sequential(
+ (c_fc): Linear(in_features=768, out_features=3072, bias=True)
+ (gelu): GELU(approximate='none')
+ (c_proj): Linear(in_features=3072, out_features=768, bias=True)
+ )
+ (ls_2): Identity()
+ )
+ (9): ResidualAttentionBlock(
+ (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+ (attn): MultiheadAttention(
+ (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
+ )
+ (ls_1): Identity()
+ (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+ (mlp): Sequential(
+ (c_fc): Linear(in_features=768, out_features=3072, bias=True)
+ (gelu): GELU(approximate='none')
+ (c_proj): Linear(in_features=3072, out_features=768, bias=True)
+ )
+ (ls_2): Identity()
+ )
+ (10): ResidualAttentionBlock(
+ (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+ (attn): MultiheadAttention(
+ (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
+ )
+ (ls_1): Identity()
+ (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+ (mlp): Sequential(
+ (c_fc): Linear(in_features=768, out_features=3072, bias=True)
+ (gelu): GELU(approximate='none')
+ (c_proj): Linear(in_features=3072, out_features=768, bias=True)
+ )
+ (ls_2): Identity()
+ )
+ (11): ResidualAttentionBlock(
+ (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+ (attn): MultiheadAttention(
+ (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
+ )
+ (ls_1): Identity()
+ (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+ (mlp): Sequential(
+ (c_fc): Linear(in_features=768, out_features=3072, bias=True)
+ (gelu): GELU(approximate='none')
+ (c_proj): Linear(in_features=3072, out_features=768, bias=True)
+ )
+ (ls_2): Identity()
+ )
+ )
+ )
+ (ln_post): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+ )
+ (transformer): Transformer(
+ (resblocks): ModuleList(
+ (0): ResidualAttentionBlock(
+ (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
+ (attn): MultiheadAttention(
+ (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
+ )
+ (ls_1): Identity()
+ (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
+ (mlp): Sequential(
+ (c_fc): Linear(in_features=512, out_features=2048, bias=True)
+ (gelu): GELU(approximate='none')
+ (c_proj): Linear(in_features=2048, out_features=512, bias=True)
+ )
+ (ls_2): Identity()
+ )
+ (1): ResidualAttentionBlock(
+ (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
+ (attn): MultiheadAttention(
+ (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
+ )
+ (ls_1): Identity()
+ (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
+ (mlp): Sequential(
+ (c_fc): Linear(in_features=512, out_features=2048, bias=True)
+ (gelu): GELU(approximate='none')
+ (c_proj): Linear(in_features=2048, out_features=512, bias=True)
+ )
+ (ls_2): Identity()
+ )
+ (2): ResidualAttentionBlock(
+ (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
+ (attn): MultiheadAttention(
+ (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
+ )
+ (ls_1): Identity()
+ (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
+ (mlp): Sequential(
+ (c_fc): Linear(in_features=512, out_features=2048, bias=True)
+ (gelu): GELU(approximate='none')
+ (c_proj): Linear(in_features=2048, out_features=512, bias=True)
+ )
+ (ls_2): Identity()
+ )
+ (3): ResidualAttentionBlock(
+ (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
+ (attn): MultiheadAttention(
+ (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
+ )
+ (ls_1): Identity()
+ (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
+ (mlp): Sequential(
+ (c_fc): Linear(in_features=512, out_features=2048, bias=True)
+ (gelu): GELU(approximate='none')
+ (c_proj): Linear(in_features=2048, out_features=512, bias=True)
+ )
+ (ls_2): Identity()
+ )
+ (4): ResidualAttentionBlock(
+ (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
+ (attn): MultiheadAttention(
+ (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
+ )
+ (ls_1): Identity()
+ (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
+ (mlp): Sequential(
+ (c_fc): Linear(in_features=512, out_features=2048, bias=True)
+ (gelu): GELU(approximate='none')
+ (c_proj): Linear(in_features=2048, out_features=512, bias=True)
+ )
+ (ls_2): Identity()
+ )
+ (5): ResidualAttentionBlock(
+ (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
+ (attn): MultiheadAttention(
+ (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
+ )
+ (ls_1): Identity()
+ (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
+ (mlp): Sequential(
+ (c_fc): Linear(in_features=512, out_features=2048, bias=True)
+ (gelu): GELU(approximate='none')
+ (c_proj): Linear(in_features=2048, out_features=512, bias=True)
+ )
+ (ls_2): Identity()
+ )
+ (6): ResidualAttentionBlock(
+ (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
+ (attn): MultiheadAttention(
+ (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
+ )
+ (ls_1): Identity()
+ (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
+ (mlp): Sequential(
+ (c_fc): Linear(in_features=512, out_features=2048, bias=True)
+ (gelu): GELU(approximate='none')
+ (c_proj): Linear(in_features=2048, out_features=512, bias=True)
+ )
+ (ls_2): Identity()
+ )
+ (7): ResidualAttentionBlock(
+ (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
+ (attn): MultiheadAttention(
+ (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
+ )
+ (ls_1): Identity()
+ (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
+ (mlp): Sequential(
+ (c_fc): Linear(in_features=512, out_features=2048, bias=True)
+ (gelu): GELU(approximate='none')
+ (c_proj): Linear(in_features=2048, out_features=512, bias=True)
+ )
+ (ls_2): Identity()
+ )
+ (8): ResidualAttentionBlock(
+ (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
+ (attn): MultiheadAttention(
+ (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
+ )
+ (ls_1): Identity()
+ (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
+ (mlp): Sequential(
+ (c_fc): Linear(in_features=512, out_features=2048, bias=True)
+ (gelu): GELU(approximate='none')
+ (c_proj): Linear(in_features=2048, out_features=512, bias=True)
+ )
+ (ls_2): Identity()
+ )
+ (9): ResidualAttentionBlock(
+ (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
+ (attn): MultiheadAttention(
+ (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
+ )
+ (ls_1): Identity()
+ (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
+ (mlp): Sequential(
+ (c_fc): Linear(in_features=512, out_features=2048, bias=True)
+ (gelu): GELU(approximate='none')
+ (c_proj): Linear(in_features=2048, out_features=512, bias=True)
+ )
+ (ls_2): Identity()
+ )
+ (10): ResidualAttentionBlock(
+ (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
+ (attn): MultiheadAttention(
+ (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
+ )
+ (ls_1): Identity()
+ (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
+ (mlp): Sequential(
+ (c_fc): Linear(in_features=512, out_features=2048, bias=True)
+ (gelu): GELU(approximate='none')
+ (c_proj): Linear(in_features=2048, out_features=512, bias=True)
+ )
+ (ls_2): Identity()
+ )
+ (11): ResidualAttentionBlock(
+ (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
+ (attn): MultiheadAttention(
+ (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
+ )
+ (ls_1): Identity()
+ (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
+ (mlp): Sequential(
+ (c_fc): Linear(in_features=512, out_features=2048, bias=True)
+ (gelu): GELU(approximate='none')
+ (c_proj): Linear(in_features=2048, out_features=512, bias=True)
+ )
+ (ls_2): Identity()
+ )
+ )
+ )
+ (token_embedding): Embedding(49408, 512)
+ (ln_final): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
+ )
+ 2024-09-07,07:15:58 | INFO | Params:
+ 2024-09-07,07:15:58 | INFO | accum_freq: 1
+ 2024-09-07,07:15:58 | INFO | aug_cfg: {}
+ 2024-09-07,07:15:58 | INFO | batch_size: 2048
+ 2024-09-07,07:15:58 | INFO | beta1: 0.9
+ 2024-09-07,07:15:58 | INFO | beta2: 0.98
+ 2024-09-07,07:15:58 | INFO | checkpoint_path: /home/breaking_0.3_trained/30_most_difficult/checkpoints
+ 2024-09-07,07:15:58 | INFO | coca_caption_loss_weight: 2.0
+ 2024-09-07,07:15:58 | INFO | coca_contrastive_loss_weight: 1.0
+ 2024-09-07,07:15:58 | INFO | copy_codebase: False
+ 2024-09-07,07:15:58 | INFO | csv_caption_key: title
+ 2024-09-07,07:15:58 | INFO | csv_img_key: filepath
+ 2024-09-07,07:15:58 | INFO | csv_separator:
+ 2024-09-07,07:15:58 | INFO | dataset_resampled: True
+ 2024-09-07,07:15:58 | INFO | dataset_type: webdataset
+ 2024-09-07,07:15:58 | INFO | ddp_static_graph: True
+ 2024-09-07,07:15:58 | INFO | debug: False
+ 2024-09-07,07:15:58 | INFO | delete_previous_checkpoint: False
+ 2024-09-07,07:15:58 | INFO | device: cuda:0
+ 2024-09-07,07:15:58 | INFO | dist_backend: nccl
+ 2024-09-07,07:15:58 | INFO | dist_url: env://
+ 2024-09-07,07:15:58 | INFO | distill: False
+ 2024-09-07,07:15:58 | INFO | distill_model: None
+ 2024-09-07,07:15:58 | INFO | distill_pretrained: None
+ 2024-09-07,07:15:58 | INFO | distributed: True
+ 2024-09-07,07:15:58 | INFO | epochs: 5
+ 2024-09-07,07:15:58 | INFO | epochs_cooldown: None
+ 2024-09-07,07:15:58 | INFO | eps: 1e-06
+ 2024-09-07,07:15:58 | INFO | force_custom_text: False
+ 2024-09-07,07:15:58 | INFO | force_image_size: None
+ 2024-09-07,07:15:58 | INFO | force_patch_dropout: None
+ 2024-09-07,07:15:58 | INFO | force_quick_gelu: False
+ 2024-09-07,07:15:58 | INFO | gather_with_grad: True
+ 2024-09-07,07:15:58 | INFO | grad_checkpointing: True
+ 2024-09-07,07:15:58 | INFO | grad_clip_norm: None
+ 2024-09-07,07:15:58 | INFO | horovod: False
+ 2024-09-07,07:15:58 | INFO | image_mean: None
+ 2024-09-07,07:15:58 | INFO | image_std: None
+ 2024-09-07,07:15:58 | INFO | imagenet_v2: None
+ 2024-09-07,07:15:58 | INFO | imagenet_val: None
+ 2024-09-07,07:15:58 | INFO | local_loss: True
+ 2024-09-07,07:15:58 | INFO | local_rank: 0
+ 2024-09-07,07:15:58 | INFO | lock_image: False
+ 2024-09-07,07:15:58 | INFO | lock_image_freeze_bn_stats: False
+ 2024-09-07,07:15:58 | INFO | lock_image_unlocked_groups: 0
+ 2024-09-07,07:15:58 | INFO | lock_text: False
+ 2024-09-07,07:15:58 | INFO | lock_text_freeze_layer_norm: False
+ 2024-09-07,07:15:58 | INFO | lock_text_unlocked_layers: 0
+ 2024-09-07,07:15:58 | INFO | log_every_n_steps: 100
+ 2024-09-07,07:15:58 | INFO | log_level: 20
+ 2024-09-07,07:15:58 | INFO | log_local: False
+ 2024-09-07,07:15:58 | INFO | log_path: /home/breaking_0.3_trained/30_most_difficult/out.log
+ 2024-09-07,07:15:58 | INFO | logs: /home/breaking_0.3_trained
+ 2024-09-07,07:15:58 | INFO | lr: 0.0005
+ 2024-09-07,07:15:58 | INFO | lr_cooldown_end: 0.0
+ 2024-09-07,07:15:58 | INFO | lr_cooldown_power: 1.0
+ 2024-09-07,07:15:58 | INFO | lr_scheduler: cosine
+ 2024-09-07,07:15:58 | INFO | model: ViT-B-32
+ 2024-09-07,07:15:58 | INFO | name: 30_most_difficult
+ 2024-09-07,07:15:58 | INFO | no_set_device_rank: False
+ 2024-09-07,07:15:58 | INFO | precision: amp
+ 2024-09-07,07:15:58 | INFO | pretrained:
+ 2024-09-07,07:15:58 | INFO | pretrained_image: False
+ 2024-09-07,07:15:58 | INFO | rank: 0
+ 2024-09-07,07:15:58 | INFO | remote_sync: None
+ 2024-09-07,07:15:58 | INFO | remote_sync_frequency: 300
+ 2024-09-07,07:15:58 | INFO | remote_sync_protocol: s3
+ 2024-09-07,07:15:58 | INFO | report_to: wandb
+ 2024-09-07,07:15:58 | INFO | resume: None
+ 2024-09-07,07:15:58 | INFO | save_frequency: 0
+ 2024-09-07,07:15:58 | INFO | save_most_recent: True
+ 2024-09-07,07:15:58 | INFO | seed: 0
+ 2024-09-07,07:15:58 | INFO | skip_scheduler: False
+ 2024-09-07,07:15:58 | INFO | tensorboard: False
+ 2024-09-07,07:15:58 | INFO | tensorboard_path:
+ 2024-09-07,07:15:58 | INFO | torchscript: False
+ 2024-09-07,07:15:58 | INFO | trace: False
+ 2024-09-07,07:15:58 | INFO | train_data: /home/breaking_0.3/{00000000..00000335}.tar
+ 2024-09-07,07:15:58 | INFO | train_data_upsampling_factors: None
+ 2024-09-07,07:15:58 | INFO | train_num_samples: 2560000
+ 2024-09-07,07:15:58 | INFO | use_bn_sync: False
+ 2024-09-07,07:15:58 | INFO | val_data: None
+ 2024-09-07,07:15:58 | INFO | val_frequency: 1
+ 2024-09-07,07:15:58 | INFO | val_num_samples: None
+ 2024-09-07,07:15:58 | INFO | wandb: True
+ 2024-09-07,07:15:58 | INFO | wandb_notes:
+ 2024-09-07,07:15:58 | INFO | wandb_project_name: clip_text_hq_clusters
+ 2024-09-07,07:15:58 | INFO | warmup: 500
+ 2024-09-07,07:15:58 | INFO | wd: 0.2
+ 2024-09-07,07:15:58 | INFO | workers: 4
+ 2024-09-07,07:15:58 | INFO | world_size: 2
+ 2024-09-07,07:15:58 | INFO | zeroshot_frequency: 2
+ 2024-09-07,07:16:05 | INFO | Start epoch 0
+ 2024-09-07,07:16:21 | INFO | Train Epoch: 0 [ 4096/2572288 (0%)] Data (t): 11.587 Batch (t): 16.138, 253.817/s, 126.909/s/gpu LR: 0.000001 Logit Scale: 14.286 Contrastive_loss: 8.3770 (8.3770) Loss: 8.3770 (8.3770)
+ 2024-09-07,07:16:24 | INFO | Reducer buckets have been rebuilt in this iteration.
+ 2024-09-07,07:20:41 | INFO | Train Epoch: 0 [ 413696/2572288 (16%)] Data (t): 0.546 Batch (t): 2.597, 1566.76/s, 783.379/s/gpu LR: 0.000101 Logit Scale: 14.267 Contrastive_loss: 8.2295 (8.3032) Loss: 8.2295 (8.3032)
+ 2024-09-07,07:25:02 | INFO | Train Epoch: 0 [ 823296/2572288 (32%)] Data (t): 0.564 Batch (t): 2.611, 1568.76/s, 784.381/s/gpu LR: 0.000201 Logit Scale: 14.229 Contrastive_loss: 8.0991 (8.2352) Loss: 8.0991 (8.2352)
+ 2024-09-07,07:29:23 | INFO | Train Epoch: 0 [1232896/2572288 (48%)] Data (t): 0.567 Batch (t): 2.615, 1561.64/s, 780.819/s/gpu LR: 0.000301 Logit Scale: 14.203 Contrastive_loss: 8.0503 (8.1890) Loss: 8.0503 (8.1890)
+ 2024-09-07,07:33:45 | INFO | Train Epoch: 0 [1642496/2572288 (64%)] Data (t): 0.570 Batch (t): 2.618, 1564.16/s, 782.079/s/gpu LR: 0.000401 Logit Scale: 14.176 Contrastive_loss: 7.9354 (8.1382) Loss: 7.9354 (8.1382)
+ 2024-09-07,07:38:07 | INFO | Train Epoch: 0 [2052096/2572288 (80%)] Data (t): 0.568 Batch (t): 2.615, 1569.69/s, 784.843/s/gpu LR: 0.000500 Logit Scale: 14.147 Contrastive_loss: 7.8547 (8.0910) Loss: 7.8547 (8.0910)
+ 2024-09-07,07:42:28 | INFO | Train Epoch: 0 [2461696/2572288 (96%)] Data (t): 0.567 Batch (t): 2.616, 1570.76/s, 785.381/s/gpu LR: 0.000498 Logit Scale: 14.128 Contrastive_loss: 7.7547 (8.0430) Loss: 7.7547 (8.0430)
+ 2024-09-07,07:43:39 | INFO | Train Epoch: 0 [2572288/2572288 (100%)] Data (t): 0.565 Batch (t): 2.613, 1578.30/s, 789.150/s/gpu LR: 0.000497 Logit Scale: 14.121 Contrastive_loss: 7.7028 (8.0004) Loss: 7.7028 (8.0004)
+ 2024-09-07,07:43:41 | INFO | Start epoch 1
+ 2024-09-07,07:43:53 | INFO | Train Epoch: 1 [ 4096/2572288 (0%)] Data (t): 9.696 Batch (t): 11.740, 348.903/s, 174.452/s/gpu LR: 0.000497 Logit Scale: 14.122 Contrastive_loss: 7.7750 (7.7750) Loss: 7.7750 (7.7750)
+ 2024-09-07,07:48:12 | INFO | Train Epoch: 1 [ 413696/2572288 (16%)] Data (t): 0.538 Batch (t): 2.594, 1571.17/s, 785.586/s/gpu LR: 0.000491 Logit Scale: 14.130 Contrastive_loss: 7.5770 (7.6760) Loss: 7.5770 (7.6760)
+ 2024-09-07,07:52:34 | INFO | Train Epoch: 1 [ 823296/2572288 (32%)] Data (t): 0.564 Batch (t): 2.613, 1567.68/s, 783.840/s/gpu LR: 0.000481 Logit Scale: 14.150 Contrastive_loss: 7.4687 (7.6069) Loss: 7.4687 (7.6069)
+ 2024-09-07,07:56:55 | INFO | Train Epoch: 1 [1232896/2572288 (48%)] Data (t): 0.567 Batch (t): 2.616, 1565.50/s, 782.748/s/gpu LR: 0.000468 Logit Scale: 14.191 Contrastive_loss: 7.3471 (7.5420) Loss: 7.3471 (7.5420)
+ 2024-09-07,08:01:17 | INFO | Train Epoch: 1 [1642496/2572288 (64%)] Data (t): 0.566 Batch (t): 2.615, 1568.25/s, 784.125/s/gpu LR: 0.000452 Logit Scale: 14.263 Contrastive_loss: 7.2163 (7.4768) Loss: 7.2163 (7.4768)
+ 2024-09-07,08:05:39 | INFO | Train Epoch: 1 [2052096/2572288 (80%)] Data (t): 0.570 Batch (t): 2.618, 1569.50/s, 784.748/s/gpu LR: 0.000433 Logit Scale: 14.350 Contrastive_loss: 7.1304 (7.4191) Loss: 7.1304 (7.4191)
+ 2024-09-07,08:10:00 | INFO | Train Epoch: 1 [2461696/2572288 (96%)] Data (t): 0.568 Batch (t): 2.617, 1565.34/s, 782.672/s/gpu LR: 0.000412 Logit Scale: 14.430 Contrastive_loss: 7.1217 (7.3766) Loss: 7.1217 (7.3766)
+ 2024-09-07,08:11:11 | INFO | Train Epoch: 1 [2572288/2572288 (100%)] Data (t): 0.565 Batch (t): 2.614, 1562.51/s, 781.255/s/gpu LR: 0.000406 Logit Scale: 14.452 Contrastive_loss: 7.2079 (7.3555) Loss: 7.2079 (7.3555)
+ 2024-09-07,08:11:14 | INFO | Start epoch 2
+ 2024-09-07,08:11:25 | INFO | Train Epoch: 2 [ 4096/2572288 (0%)] Data (t): 9.554 Batch (t): 11.600, 353.101/s, 176.550/s/gpu LR: 0.000405 Logit Scale: 14.452 Contrastive_loss: 6.6379 (6.6379) Loss: 6.6379 (6.6379)
+ 2024-09-07,08:15:46 | INFO | Train Epoch: 2 [ 413696/2572288 (16%)] Data (t): 0.555 Batch (t): 2.611, 1572.32/s, 786.161/s/gpu LR: 0.000381 Logit Scale: 14.566 Contrastive_loss: 7.0270 (6.8325) Loss: 7.0270 (6.8325)
+ 2024-09-07,08:20:08 | INFO | Train Epoch: 2 [ 823296/2572288 (32%)] Data (t): 0.565 Batch (t): 2.615, 1554.74/s, 777.370/s/gpu LR: 0.000355 Logit Scale: 14.681 Contrastive_loss: 7.1672 (6.9440) Loss: 7.1672 (6.9440)
+ 2024-09-07,08:24:29 | INFO | Train Epoch: 2 [1232896/2572288 (48%)] Data (t): 0.566 Batch (t): 2.615, 1570.16/s, 785.078/s/gpu LR: 0.000327 Logit Scale: 14.790 Contrastive_loss: 6.8110 (6.9108) Loss: 6.8110 (6.9108)
+ 2024-09-07,08:28:51 | INFO | Train Epoch: 2 [1642496/2572288 (64%)] Data (t): 0.566 Batch (t): 2.614, 1569.48/s, 784.739/s/gpu LR: 0.000298 Logit Scale: 14.901 Contrastive_loss: 6.6025 (6.8491) Loss: 6.6025 (6.8491)
+ 2024-09-07,08:33:12 | INFO | Train Epoch: 2 [2052096/2572288 (80%)] Data (t): 0.566 Batch (t): 2.615, 1565.61/s, 782.805/s/gpu LR: 0.000269 Logit Scale: 15.018 Contrastive_loss: 6.7172 (6.8271) Loss: 6.7172 (6.8271)
+ 2024-09-07,08:37:34 | INFO | Train Epoch: 2 [2461696/2572288 (96%)] Data (t): 0.565 Batch (t): 2.615, 1566.20/s, 783.102/s/gpu LR: 0.000239 Logit Scale: 15.130 Contrastive_loss: 6.4575 (6.7743) Loss: 6.4575 (6.7743)
479
+ 2024-09-07,08:38:44 | INFO | Train Epoch: 2 [2572288/2572288 (100%)] Data (t): 0.563 Batch (t): 2.612, 1576.61/s, 788.306/s/gpu LR: 0.000231 Logit Scale: 15.173 Contrastive_loss: 6.3482 (6.7211) Loss: 6.3482 (6.7211)
480
+ 2024-09-07,08:38:47 | INFO | Start epoch 3
481
+ 2024-09-07,08:38:59 | INFO | Train Epoch: 3 [ 4096/2572288 (0%)] Data (t): 9.551 Batch (t): 11.600, 353.118/s, 176.559/s/gpu LR: 0.000231 Logit Scale: 15.175 Contrastive_loss: 6.4882 (6.4882) Loss: 6.4882 (6.4882)
482
+ 2024-09-07,08:43:19 | INFO | Train Epoch: 3 [ 413696/2572288 (16%)] Data (t): 0.543 Batch (t): 2.601, 1571.86/s, 785.932/s/gpu LR: 0.000202 Logit Scale: 15.290 Contrastive_loss: 6.4287 (6.4584) Loss: 6.4287 (6.4584)
483
+ 2024-09-07,08:47:40 | INFO | Train Epoch: 3 [ 823296/2572288 (32%)] Data (t): 0.562 Batch (t): 2.613, 1568.66/s, 784.328/s/gpu LR: 0.000173 Logit Scale: 15.426 Contrastive_loss: 6.5816 (6.4995) Loss: 6.5816 (6.4995)
484
+ 2024-09-07,08:52:02 | INFO | Train Epoch: 3 [1232896/2572288 (48%)] Data (t): 0.567 Batch (t): 2.617, 1558.61/s, 779.304/s/gpu LR: 0.000145 Logit Scale: 15.514 Contrastive_loss: 6.4312 (6.4824) Loss: 6.4312 (6.4824)
485
+ 2024-09-07,08:56:24 | INFO | Train Epoch: 3 [1642496/2572288 (64%)] Data (t): 0.566 Batch (t): 2.618, 1567.58/s, 783.790/s/gpu LR: 0.000119 Logit Scale: 15.599 Contrastive_loss: 5.8314 (6.3522) Loss: 5.8314 (6.3522)
486
+ 2024-09-07,09:00:46 | INFO | Train Epoch: 3 [2052096/2572288 (80%)] Data (t): 0.570 Batch (t): 2.621, 1558.78/s, 779.390/s/gpu LR: 0.000095 Logit Scale: 15.691 Contrastive_loss: 5.5672 (6.2214) Loss: 5.5672 (6.2214)
487
+ 2024-09-07,09:05:08 | INFO | Train Epoch: 3 [2461696/2572288 (96%)] Data (t): 0.566 Batch (t): 2.618, 1566.44/s, 783.219/s/gpu LR: 0.000072 Logit Scale: 15.787 Contrastive_loss: 5.9069 (6.1765) Loss: 5.9069 (6.1765)
488
+ 2024-09-07,09:06:18 | INFO | Train Epoch: 3 [2572288/2572288 (100%)] Data (t): 0.562 Batch (t): 2.614, 1575.77/s, 787.886/s/gpu LR: 0.000067 Logit Scale: 15.805 Contrastive_loss: 5.5870 (6.1028) Loss: 5.5870 (6.1028)
489
+ 2024-09-07,09:06:21 | INFO | Start epoch 4
490
+ 2024-09-07,09:06:33 | INFO | Train Epoch: 4 [ 4096/2572288 (0%)] Data (t): 9.714 Batch (t): 11.763, 348.210/s, 174.105/s/gpu LR: 0.000067 Logit Scale: 15.806 Contrastive_loss: 5.4202 (5.4202) Loss: 5.4202 (5.4202)
491
+ 2024-09-07,09:10:53 | INFO | Train Epoch: 4 [ 413696/2572288 (16%)] Data (t): 0.549 Batch (t): 2.607, 1561.80/s, 780.900/s/gpu LR: 0.000048 Logit Scale: 15.859 Contrastive_loss: 6.5509 (5.9855) Loss: 6.5509 (5.9855)
492
+ 2024-09-07,09:15:15 | INFO | Train Epoch: 4 [ 823296/2572288 (32%)] Data (t): 0.564 Batch (t): 2.615, 1561.51/s, 780.753/s/gpu LR: 0.000032 Logit Scale: 15.902 Contrastive_loss: 5.7273 (5.8995) Loss: 5.7273 (5.8995)
493
+ 2024-09-07,09:19:37 | INFO | Train Epoch: 4 [1232896/2572288 (48%)] Data (t): 0.568 Batch (t): 2.618, 1571.58/s, 785.789/s/gpu LR: 0.000019 Logit Scale: 15.925 Contrastive_loss: 6.0029 (5.9253) Loss: 6.0029 (5.9253)
494
+ 2024-09-07,09:23:58 | INFO | Train Epoch: 4 [1642496/2572288 (64%)] Data (t): 0.564 Batch (t): 2.615, 1567.93/s, 783.965/s/gpu LR: 0.000009 Logit Scale: 15.935 Contrastive_loss: 5.0497 (5.7502) Loss: 5.0497 (5.7502)
495
+ 2024-09-07,09:28:20 | INFO | Train Epoch: 4 [2052096/2572288 (80%)] Data (t): 0.565 Batch (t): 2.616, 1563.19/s, 781.593/s/gpu LR: 0.000003 Logit Scale: 15.940 Contrastive_loss: 5.5695 (5.7201) Loss: 5.5695 (5.7201)
496
+ 2024-09-07,09:32:42 | INFO | Train Epoch: 4 [2461696/2572288 (96%)] Data (t): 0.565 Batch (t): 2.616, 1563.88/s, 781.939/s/gpu LR: 0.000000 Logit Scale: 15.942 Contrastive_loss: 5.5382 (5.6941) Loss: 5.5382 (5.6941)
497
+ 2024-09-07,09:33:52 | INFO | Train Epoch: 4 [2572288/2572288 (100%)] Data (t): 0.558 Batch (t): 2.610, 1579.28/s, 789.638/s/gpu LR: 0.000000 Logit Scale: 15.942 Contrastive_loss: 5.7904 (5.7061) Loss: 5.7904 (5.7061)
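
The LR column in these logs follows the settings in the params dump: lr 0.0005 with a 500-step linear warmup, then cosine decay over 5 epochs of 628 steps each (2,572,288 samples per epoch at a global batch of 4,096, so 3,140 steps total). A minimal sketch of that schedule, assuming the usual linear-warmup-plus-cosine rule; lr_at is a hypothetical helper, not code from this repo:

import math

def lr_at(step, base_lr=5e-4, warmup=500, total_steps=3140):
    # Linear warmup: matches "LR: 0.000101" logged around step 100 above.
    if step < warmup:
        return base_lr * (step + 1) / warmup
    # Cosine decay from base_lr toward 0 at the final step; e.g. lr_at(628)
    # ~= 4.97e-4 and lr_at(1884) ~= 2.31e-4, matching the LR values logged
    # at the starts of epochs 1 and 3.
    progress = (step - warmup) / (total_steps - warmup)
    return 0.5 * base_lr * (1 + math.cos(math.pi * progress))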
breaking_0.3_trained/30_most_difficult/params.txt ADDED
@@ -0,0 +1,91 @@
+ accum_freq: 1
+ aug_cfg: {}
+ batch_size: 2048
+ beta1: 0.9
+ beta2: 0.98
+ checkpoint_path: /home/breaking_0.3_trained/30_most_difficult/checkpoints
+ coca_caption_loss_weight: 2.0
+ coca_contrastive_loss_weight: 1.0
+ copy_codebase: False
+ csv_caption_key: title
+ csv_img_key: filepath
+ csv_separator:
+ dataset_resampled: True
+ dataset_type: webdataset
+ ddp_static_graph: True
+ debug: False
+ delete_previous_checkpoint: False
+ device: cuda:0
+ dist_backend: nccl
+ dist_url: env://
+ distill: False
+ distill_model: None
+ distill_pretrained: None
+ distributed: True
+ epochs: 5
+ epochs_cooldown: None
+ eps: 1e-06
+ force_custom_text: False
+ force_image_size: None
+ force_patch_dropout: None
+ force_quick_gelu: False
+ gather_with_grad: True
+ grad_checkpointing: True
+ grad_clip_norm: None
+ horovod: False
+ image_mean: None
+ image_std: None
+ imagenet_v2: None
+ imagenet_val: None
+ local_loss: True
+ local_rank: 0
+ lock_image: False
+ lock_image_freeze_bn_stats: False
+ lock_image_unlocked_groups: 0
+ lock_text: False
+ lock_text_freeze_layer_norm: False
+ lock_text_unlocked_layers: 0
+ log_every_n_steps: 100
+ log_level: 20
+ log_local: False
+ log_path: /home/breaking_0.3_trained/30_most_difficult/out.log
+ logs: /home/breaking_0.3_trained
+ lr: 0.0005
+ lr_cooldown_end: 0.0
+ lr_cooldown_power: 1.0
+ lr_scheduler: cosine
+ model: ViT-B-32
+ name: 30_most_difficult
+ no_set_device_rank: False
+ precision: amp
+ pretrained:
+ pretrained_image: False
+ rank: 0
+ remote_sync: None
+ remote_sync_frequency: 300
+ remote_sync_protocol: s3
+ report_to: wandb
+ resume: None
+ save_frequency: 0
+ save_most_recent: True
+ seed: 0
+ skip_scheduler: False
+ tensorboard: False
+ tensorboard_path:
+ torchscript: False
+ trace: False
+ train_data: /home/breaking_0.3/{00000000..00000335}.tar
+ train_data_upsampling_factors: None
+ train_num_samples: 2560000
+ use_bn_sync: False
+ val_data: None
+ val_frequency: 1
+ val_num_samples: None
+ wandb: True
+ wandb_notes:
+ wandb_project_name: clip_text_hq_clusters
+ warmup: 500
+ wd: 0.2
+ workers: 4
+ world_size: 2
+ zeroshot_frequency: 2
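
Each params.txt mirrors the Params block the run logs at startup, one key: value pair per line. A small sketch for loading such a file into a dict; read_params is a hypothetical helper, not something shipped in this repo. Values are kept as strings, since the file mixes ints, floats, paths, and empty fields (e.g. "pretrained:"):

def read_params(path):
    # Split each line on the first ":" only, so values like
    # "dist_url: env://" survive intact.
    params = {}
    with open(path) as f:
        for line in f:
            if ":" not in line:
                continue
            key, _, value = line.partition(":")
            params[key.strip()] = value.strip()
    return params

# read_params("breaking_0.3_trained/30_most_difficult/params.txt")["model"]
# -> "ViT-B-32"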
breaking_0.7_trained/70_most_difficult/checkpoints/epoch_5.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2bb482d574f3d17eb109f38c88aa44c6056e5419572e1314cd9a141eac31ad0d
+ size 1815701601
breaking_0.7_trained/70_most_difficult/checkpoints/epoch_latest.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b6c9016a6da6cf6eaa7d3877aacabb7bf98f490fa87f0a510e65242347cd4fef
+ size 1815639289
breaking_0.7_trained/70_most_difficult/info.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c48f712fec688cd15e54a48e3189fb86c429a8f964036f18b0b9edcaa9934fe6
+ size 321
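
These three-line stubs are Git LFS pointers: the repo stores only each object's sha256 and byte size, while the ~1.8 GB checkpoints live in LFS storage. A sketch of checking a downloaded file against its pointer; verify_lfs_object is a hypothetical helper, not part of this repo:

import hashlib

def verify_lfs_object(path, expected_oid, expected_size, chunk=1 << 20):
    # Stream the file so multi-GB checkpoints never sit fully in memory.
    h, size = hashlib.sha256(), 0
    with open(path, "rb") as f:
        while block := f.read(chunk):
            h.update(block)
            size += len(block)
    return h.hexdigest() == expected_oid and size == expected_size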
breaking_0.7_trained/70_most_difficult/out.log ADDED
@@ -0,0 +1,497 @@
+ 2024-09-07,10:00:53 | INFO | No latest resume checkpoint found in /home/breaking_0.7_trained/70_most_difficult/checkpoints.
+ 2024-09-07,10:00:55 | INFO | Running in distributed mode with multiple processes. Device: cuda:0.Process (global: 0, local 0), total 2.
+ 2024-09-07,10:00:55 | INFO | Loaded ViT-B-32 model config.
+ 2024-09-07,10:00:56 | INFO | Model:
+ 2024-09-07,10:00:56 | INFO | Params:
+ 2024-09-07,10:00:56 | INFO | accum_freq: 1
+ 2024-09-07,10:00:56 | INFO | aug_cfg: {}
+ 2024-09-07,10:00:56 | INFO | batch_size: 2048
+ 2024-09-07,10:00:56 | INFO | beta1: 0.9
+ 2024-09-07,10:00:56 | INFO | beta2: 0.98
+ 2024-09-07,10:00:56 | INFO | checkpoint_path: /home/breaking_0.7_trained/70_most_difficult/checkpoints
+ 2024-09-07,10:00:56 | INFO | coca_caption_loss_weight: 2.0
+ 2024-09-07,10:00:56 | INFO | coca_contrastive_loss_weight: 1.0
+ 2024-09-07,10:00:56 | INFO | copy_codebase: False
+ 2024-09-07,10:00:56 | INFO | csv_caption_key: title
+ 2024-09-07,10:00:56 | INFO | csv_img_key: filepath
+ 2024-09-07,10:00:56 | INFO | csv_separator:
+ 2024-09-07,10:00:56 | INFO | dataset_resampled: True
+ 2024-09-07,10:00:56 | INFO | dataset_type: webdataset
+ 2024-09-07,10:00:56 | INFO | ddp_static_graph: True
+ 2024-09-07,10:00:56 | INFO | debug: False
+ 2024-09-07,10:00:56 | INFO | delete_previous_checkpoint: False
+ 2024-09-07,10:00:56 | INFO | device: cuda:0
+ 2024-09-07,10:00:56 | INFO | dist_backend: nccl
+ 2024-09-07,10:00:56 | INFO | dist_url: env://
+ 2024-09-07,10:00:56 | INFO | distill: False
+ 2024-09-07,10:00:56 | INFO | distill_model: None
+ 2024-09-07,10:00:56 | INFO | distill_pretrained: None
+ 2024-09-07,10:00:56 | INFO | distributed: True
+ 2024-09-07,10:00:56 | INFO | epochs: 5
+ 2024-09-07,10:00:56 | INFO | epochs_cooldown: None
+ 2024-09-07,10:00:56 | INFO | eps: 1e-06
+ 2024-09-07,10:00:56 | INFO | force_custom_text: False
+ 2024-09-07,10:00:56 | INFO | force_image_size: None
+ 2024-09-07,10:00:56 | INFO | force_patch_dropout: None
+ 2024-09-07,10:00:56 | INFO | force_quick_gelu: False
+ 2024-09-07,10:00:56 | INFO | gather_with_grad: True
+ 2024-09-07,10:00:56 | INFO | grad_checkpointing: True
+ 2024-09-07,10:00:56 | INFO | grad_clip_norm: None
+ 2024-09-07,10:00:56 | INFO | horovod: False
+ 2024-09-07,10:00:56 | INFO | image_mean: None
+ 2024-09-07,10:00:56 | INFO | image_std: None
+ 2024-09-07,10:00:56 | INFO | imagenet_v2: None
+ 2024-09-07,10:00:56 | INFO | imagenet_val: None
+ 2024-09-07,10:00:56 | INFO | local_loss: True
+ 2024-09-07,10:00:56 | INFO | local_rank: 0
+ 2024-09-07,10:00:56 | INFO | lock_image: False
+ 2024-09-07,10:00:56 | INFO | lock_image_freeze_bn_stats: False
+ 2024-09-07,10:00:56 | INFO | lock_image_unlocked_groups: 0
+ 2024-09-07,10:00:56 | INFO | lock_text: False
+ 2024-09-07,10:00:56 | INFO | lock_text_freeze_layer_norm: False
+ 2024-09-07,10:00:56 | INFO | lock_text_unlocked_layers: 0
+ 2024-09-07,10:00:56 | INFO | log_every_n_steps: 100
+ 2024-09-07,10:00:56 | INFO | log_level: 20
+ 2024-09-07,10:00:56 | INFO | log_local: False
+ 2024-09-07,10:00:56 | INFO | log_path: /home/breaking_0.7_trained/70_most_difficult/out.log
+ 2024-09-07,10:00:56 | INFO | logs: /home/breaking_0.7_trained
+ 2024-09-07,10:00:56 | INFO | lr: 0.0005
+ 2024-09-07,10:00:56 | INFO | lr_cooldown_end: 0.0
+ 2024-09-07,10:00:56 | INFO | lr_cooldown_power: 1.0
+ 2024-09-07,10:00:56 | INFO | lr_scheduler: cosine
+ 2024-09-07,10:00:56 | INFO | model: ViT-B-32
+ 2024-09-07,10:00:56 | INFO | name: 70_most_difficult
+ 2024-09-07,10:00:56 | INFO | no_set_device_rank: False
+ 2024-09-07,10:00:56 | INFO | precision: amp
+ 2024-09-07,10:00:56 | INFO | pretrained:
+ 2024-09-07,10:00:56 | INFO | pretrained_image: False
+ 2024-09-07,10:00:56 | INFO | rank: 0
+ 2024-09-07,10:00:56 | INFO | remote_sync: None
+ 2024-09-07,10:00:56 | INFO | remote_sync_frequency: 300
+ 2024-09-07,10:00:56 | INFO | remote_sync_protocol: s3
+ 2024-09-07,10:00:56 | INFO | report_to: wandb
+ 2024-09-07,10:00:56 | INFO | resume: None
+ 2024-09-07,10:00:56 | INFO | save_frequency: 0
+ 2024-09-07,10:00:56 | INFO | save_most_recent: True
+ 2024-09-07,10:00:56 | INFO | seed: 0
+ 2024-09-07,10:00:56 | INFO | skip_scheduler: False
+ 2024-09-07,10:00:56 | INFO | tensorboard: False
+ 2024-09-07,10:00:56 | INFO | tensorboard_path:
+ 2024-09-07,10:00:56 | INFO | torchscript: False
+ 2024-09-07,10:00:56 | INFO | trace: False
+ 2024-09-07,10:00:56 | INFO | train_data: /home/breaking_0.7/{00000000..00000763}.tar
+ 2024-09-07,10:00:56 | INFO | train_data_upsampling_factors: None
+ 2024-09-07,10:00:56 | INFO | train_num_samples: 2560000
+ 2024-09-07,10:00:56 | INFO | use_bn_sync: False
+ 2024-09-07,10:00:56 | INFO | val_data: None
+ 2024-09-07,10:00:56 | INFO | val_frequency: 1
+ 2024-09-07,10:00:56 | INFO | val_num_samples: None
+ 2024-09-07,10:00:56 | INFO | wandb: True
+ 2024-09-07,10:00:56 | INFO | wandb_notes:
+ 2024-09-07,10:00:56 | INFO | wandb_project_name: clip_text_hq_clusters
+ 2024-09-07,10:00:56 | INFO | warmup: 500
+ 2024-09-07,10:00:56 | INFO | wd: 0.2
+ 2024-09-07,10:00:56 | INFO | workers: 4
+ 2024-09-07,10:00:56 | INFO | world_size: 2
+ 2024-09-07,10:00:56 | INFO | zeroshot_frequency: 2
+ 2024-09-07,10:01:02 | INFO | Start epoch 0
+ 2024-09-07,10:01:19 | INFO | Train Epoch: 0 [ 4096/2572288 (0%)] Data (t): 12.092 Batch (t): 16.765, 244.323/s, 122.161/s/gpu LR: 0.000001 Logit Scale: 14.286 Contrastive_loss: 8.3788 (8.3788) Loss: 8.3788 (8.3788)
+ 2024-09-07,10:01:22 | INFO | Reducer buckets have been rebuilt in this iteration.
+ 2024-09-07,10:05:39 | INFO | Train Epoch: 0 [ 413696/2572288 (16%)] Data (t): 0.547 Batch (t): 2.602, 1574.29/s, 787.144/s/gpu LR: 0.000101 Logit Scale: 14.266 Contrastive_loss: 8.2004 (8.2896) Loss: 8.2004 (8.2896)
+ 2024-09-07,10:10:01 | INFO | Train Epoch: 0 [ 823296/2572288 (32%)] Data (t): 0.564 Batch (t): 2.614, 1566.47/s, 783.235/s/gpu LR: 0.000201 Logit Scale: 14.231 Contrastive_loss: 8.0909 (8.2234) Loss: 8.0909 (8.2234)
+ 2024-09-07,10:14:22 | INFO | Train Epoch: 0 [1232896/2572288 (48%)] Data (t): 0.566 Batch (t): 2.616, 1572.98/s, 786.491/s/gpu LR: 0.000301 Logit Scale: 14.194 Contrastive_loss: 7.9780 (8.1620) Loss: 7.9780 (8.1620)
+ 2024-09-07,10:18:44 | INFO | Train Epoch: 0 [1642496/2572288 (64%)] Data (t): 0.564 Batch (t): 2.615, 1563.77/s, 781.886/s/gpu LR: 0.000401 Logit Scale: 14.152 Contrastive_loss: 7.8635 (8.1023) Loss: 7.8635 (8.1023)
+ 2024-09-07,10:23:06 | INFO | Train Epoch: 0 [2052096/2572288 (80%)] Data (t): 0.565 Batch (t): 2.616, 1563.40/s, 781.699/s/gpu LR: 0.000500 Logit Scale: 14.108 Contrastive_loss: 7.8084 (8.0534) Loss: 7.8084 (8.0534)
+ 2024-09-07,10:27:28 | INFO | Train Epoch: 0 [2461696/2572288 (96%)] Data (t): 0.573 Batch (t): 2.623, 1561.55/s, 780.775/s/gpu LR: 0.000498 Logit Scale: 14.085 Contrastive_loss: 7.7462 (8.0095) Loss: 7.7462 (8.0095)
+ 2024-09-07,10:28:38 | INFO | Train Epoch: 0 [2572288/2572288 (100%)] Data (t): 0.566 Batch (t): 2.617, 1568.93/s, 784.465/s/gpu LR: 0.000497 Logit Scale: 14.085 Contrastive_loss: 7.6566 (7.9654) Loss: 7.6566 (7.9654)
+ 2024-09-07,10:28:41 | INFO | Start epoch 1
+ 2024-09-07,10:28:53 | INFO | Train Epoch: 1 [ 4096/2572288 (0%)] Data (t): 9.646 Batch (t): 11.692, 350.330/s, 175.165/s/gpu LR: 0.000497 Logit Scale: 14.085 Contrastive_loss: 7.6610 (7.6610) Loss: 7.6610 (7.6610)
+ 2024-09-07,10:33:13 | INFO | Train Epoch: 1 [ 413696/2572288 (16%)] Data (t): 0.548 Batch (t): 2.606, 1563.83/s, 781.915/s/gpu LR: 0.000491 Logit Scale: 14.097 Contrastive_loss: 7.5724 (7.6167) Loss: 7.5724 (7.6167)
+ 2024-09-07,10:37:35 | INFO | Train Epoch: 1 [ 823296/2572288 (32%)] Data (t): 0.565 Batch (t): 2.615, 1566.09/s, 783.044/s/gpu LR: 0.000481 Logit Scale: 14.127 Contrastive_loss: 7.4356 (7.5563) Loss: 7.4356 (7.5563)
+ 2024-09-07,10:41:56 | INFO | Train Epoch: 1 [1232896/2572288 (48%)] Data (t): 0.566 Batch (t): 2.616, 1558.96/s, 779.482/s/gpu LR: 0.000468 Logit Scale: 14.170 Contrastive_loss: 7.3573 (7.5066) Loss: 7.3573 (7.5066)
+ 2024-09-07,10:46:18 | INFO | Train Epoch: 1 [1642496/2572288 (64%)] Data (t): 0.568 Batch (t): 2.620, 1562.54/s, 781.271/s/gpu LR: 0.000452 Logit Scale: 14.245 Contrastive_loss: 7.4000 (7.4853) Loss: 7.4000 (7.4853)
+ 2024-09-07,10:50:41 | INFO | Train Epoch: 1 [2052096/2572288 (80%)] Data (t): 0.571 Batch (t): 2.622, 1563.27/s, 781.634/s/gpu LR: 0.000433 Logit Scale: 14.335 Contrastive_loss: 7.2466 (7.4455) Loss: 7.2466 (7.4455)
+ 2024-09-07,10:55:03 | INFO | Train Epoch: 1 [2461696/2572288 (96%)] Data (t): 0.570 Batch (t): 2.620, 1566.36/s, 783.178/s/gpu LR: 0.000412 Logit Scale: 14.443 Contrastive_loss: 7.2259 (7.4141) Loss: 7.2259 (7.4141)
+ 2024-09-07,10:56:13 | INFO | Train Epoch: 1 [2572288/2572288 (100%)] Data (t): 0.563 Batch (t): 2.614, 1574.96/s, 787.481/s/gpu LR: 0.000406 Logit Scale: 14.478 Contrastive_loss: 7.1533 (7.3815) Loss: 7.1533 (7.3815)
+ 2024-09-07,10:56:16 | INFO | Start epoch 2
+ 2024-09-07,10:56:28 | INFO | Train Epoch: 2 [ 4096/2572288 (0%)] Data (t): 9.651 Batch (t): 11.699, 350.112/s, 175.056/s/gpu LR: 0.000405 Logit Scale: 14.480 Contrastive_loss: 6.9992 (6.9992) Loss: 6.9992 (6.9992)
+ 2024-09-07,11:00:49 | INFO | Train Epoch: 2 [ 413696/2572288 (16%)] Data (t): 0.558 Batch (t): 2.617, 1562.06/s, 781.032/s/gpu LR: 0.000381 Logit Scale: 14.608 Contrastive_loss: 7.1339 (7.0665) Loss: 7.1339 (7.0665)
+ 2024-09-07,11:05:12 | INFO | Train Epoch: 2 [ 823296/2572288 (32%)] Data (t): 0.569 Batch (t): 2.622, 1557.91/s, 778.953/s/gpu LR: 0.000355 Logit Scale: 14.768 Contrastive_loss: 7.0686 (7.0672) Loss: 7.0686 (7.0672)
+ 2024-09-07,11:09:34 | INFO | Train Epoch: 2 [1232896/2572288 (48%)] Data (t): 0.570 Batch (t): 2.623, 1565.95/s, 782.973/s/gpu LR: 0.000327 Logit Scale: 14.891 Contrastive_loss: 6.9274 (7.0323) Loss: 6.9274 (7.0323)
+ 2024-09-07,11:13:56 | INFO | Train Epoch: 2 [1642496/2572288 (64%)] Data (t): 0.569 Batch (t): 2.621, 1563.01/s, 781.503/s/gpu LR: 0.000298 Logit Scale: 15.027 Contrastive_loss: 6.8516 (6.9961) Loss: 6.8516 (6.9961)
+ 2024-09-07,11:18:18 | INFO | Train Epoch: 2 [2052096/2572288 (80%)] Data (t): 0.568 Batch (t): 2.620, 1559.58/s, 779.792/s/gpu LR: 0.000269 Logit Scale: 15.198 Contrastive_loss: 6.9052 (6.9810) Loss: 6.9052 (6.9810)
+ 2024-09-07,11:22:40 | INFO | Train Epoch: 2 [2461696/2572288 (96%)] Data (t): 0.568 Batch (t): 2.621, 1561.78/s, 780.890/s/gpu LR: 0.000239 Logit Scale: 15.340 Contrastive_loss: 6.7375 (6.9462) Loss: 6.7375 (6.9462)
+ 2024-09-07,11:23:51 | INFO | Train Epoch: 2 [2572288/2572288 (100%)] Data (t): 0.567 Batch (t): 2.619, 1575.79/s, 787.897/s/gpu LR: 0.000231 Logit Scale: 15.374 Contrastive_loss: 6.8204 (6.9305) Loss: 6.8204 (6.9305)
+ 2024-09-07,11:23:54 | INFO | Start epoch 3
+ 2024-09-07,11:24:05 | INFO | Train Epoch: 3 [ 4096/2572288 (0%)] Data (t): 9.380 Batch (t): 11.428, 358.426/s, 179.213/s/gpu LR: 0.000231 Logit Scale: 15.375 Contrastive_loss: 6.6847 (6.6847) Loss: 6.6847 (6.6847)
+ 2024-09-07,11:28:26 | INFO | Train Epoch: 3 [ 413696/2572288 (16%)] Data (t): 0.552 Batch (t): 2.613, 1562.80/s, 781.399/s/gpu LR: 0.000202 Logit Scale: 15.524 Contrastive_loss: 6.5905 (6.6376) Loss: 6.5905 (6.6376)
+ 2024-09-07,11:32:48 | INFO | Train Epoch: 3 [ 823296/2572288 (32%)] Data (t): 0.564 Batch (t): 2.617, 1563.18/s, 781.589/s/gpu LR: 0.000173 Logit Scale: 15.666 Contrastive_loss: 6.5036 (6.5929) Loss: 6.5036 (6.5929)
+ 2024-09-07,11:37:10 | INFO | Train Epoch: 3 [1232896/2572288 (48%)] Data (t): 0.568 Batch (t): 2.620, 1562.99/s, 781.497/s/gpu LR: 0.000145 Logit Scale: 15.784 Contrastive_loss: 6.2833 (6.5155) Loss: 6.2833 (6.5155)
+ 2024-09-07,11:41:32 | INFO | Train Epoch: 3 [1642496/2572288 (64%)] Data (t): 0.565 Batch (t): 2.618, 1568.86/s, 784.430/s/gpu LR: 0.000119 Logit Scale: 15.895 Contrastive_loss: 6.2988 (6.4722) Loss: 6.2988 (6.4722)
+ 2024-09-07,11:45:54 | INFO | Train Epoch: 3 [2052096/2572288 (80%)] Data (t): 0.564 Batch (t): 2.617, 1566.40/s, 783.198/s/gpu LR: 0.000095 Logit Scale: 16.002 Contrastive_loss: 6.3952 (6.4594) Loss: 6.3952 (6.4594)
+ 2024-09-07,11:50:16 | INFO | Train Epoch: 3 [2461696/2572288 (96%)] Data (t): 0.566 Batch (t): 2.619, 1568.68/s, 784.342/s/gpu LR: 0.000072 Logit Scale: 16.096 Contrastive_loss: 6.1727 (6.4184) Loss: 6.1727 (6.4184)
+ 2024-09-07,11:51:26 | INFO | Train Epoch: 3 [2572288/2572288 (100%)] Data (t): 0.567 Batch (t): 2.619, 1575.88/s, 787.941/s/gpu LR: 0.000067 Logit Scale: 16.118 Contrastive_loss: 6.3202 (6.4061) Loss: 6.3202 (6.4061)
+ 2024-09-07,11:51:29 | INFO | Start epoch 4
+ 2024-09-07,11:51:41 | INFO | Train Epoch: 4 [ 4096/2572288 (0%)] Data (t): 9.530 Batch (t): 11.577, 353.813/s, 176.906/s/gpu LR: 0.000067 Logit Scale: 16.118 Contrastive_loss: 6.2788 (6.2788) Loss: 6.2788 (6.2788)
+ 2024-09-07,11:56:02 | INFO | Train Epoch: 4 [ 413696/2572288 (16%)] Data (t): 0.554 Batch (t): 2.615, 1562.67/s, 781.334/s/gpu LR: 0.000048 Logit Scale: 16.177 Contrastive_loss: 6.4553 (6.3671) Loss: 6.4553 (6.3671)
+ 2024-09-07,12:00:24 | INFO | Train Epoch: 4 [ 823296/2572288 (32%)] Data (t): 0.563 Batch (t): 2.616, 1563.44/s, 781.721/s/gpu LR: 0.000032 Logit Scale: 16.220 Contrastive_loss: 6.3225 (6.3522) Loss: 6.3225 (6.3522)
+ 2024-09-07,12:04:45 | INFO | Train Epoch: 4 [1232896/2572288 (48%)] Data (t): 0.563 Batch (t): 2.615, 1567.49/s, 783.743/s/gpu LR: 0.000019 Logit Scale: 16.246 Contrastive_loss: 6.2155 (6.3180) Loss: 6.2155 (6.3180)
+ 2024-09-07,12:09:07 | INFO | Train Epoch: 4 [1642496/2572288 (64%)] Data (t): 0.565 Batch (t): 2.619, 1563.51/s, 781.753/s/gpu LR: 0.000009 Logit Scale: 16.261 Contrastive_loss: 6.4286 (6.3401) Loss: 6.4286 (6.3401)
+ 2024-09-07,12:13:29 | INFO | Train Epoch: 4 [2052096/2572288 (80%)] Data (t): 0.563 Batch (t): 2.616, 1565.01/s, 782.504/s/gpu LR: 0.000003 Logit Scale: 16.267 Contrastive_loss: 5.7948 (6.2493) Loss: 5.7948 (6.2493)
+ 2024-09-07,12:17:51 | INFO | Train Epoch: 4 [2461696/2572288 (96%)] Data (t): 0.564 Batch (t): 2.618, 1563.70/s, 781.850/s/gpu LR: 0.000000 Logit Scale: 16.268 Contrastive_loss: 6.4187 (6.2735) Loss: 6.4187 (6.2735)
+ 2024-09-07,12:19:01 | INFO | Train Epoch: 4 [2572288/2572288 (100%)] Data (t): 0.563 Batch (t): 2.615, 1572.01/s, 786.003/s/gpu LR: 0.000000 Logit Scale: 16.268 Contrastive_loss: 6.3365 (6.2813) Loss: 6.3365 (6.2813)
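
For context on these columns: Contrastive_loss is the symmetric InfoNCE objective over matched image-text pairs, and Logit Scale is the exponentiated learnable temperature (initialized to 1/0.07, about 14.3, which is where every run starts). With a global batch of 4,096 pairs, a random model sits near ln(4096) = 8.32, matching the ~8.38 these logs open with. A sketch of the loss, assuming the standard CLIP formulation; clip_contrastive_loss is illustrative, not this repo's code:

import torch
import torch.nn.functional as F

def clip_contrastive_loss(image_features, text_features, logit_scale):
    # image_features, text_features: L2-normalized [N, D] embeddings.
    logits = logit_scale * image_features @ text_features.t()     # [N, N]
    # Matched pairs sit on the diagonal, so the targets are 0..N-1.
    labels = torch.arange(logits.shape[0], device=logits.device)
    return (F.cross_entropy(logits, labels)
            + F.cross_entropy(logits.t(), labels)) / 2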
breaking_0.7_trained/70_most_difficult/params.txt ADDED
@@ -0,0 +1,91 @@
+ accum_freq: 1
+ aug_cfg: {}
+ batch_size: 2048
+ beta1: 0.9
+ beta2: 0.98
+ checkpoint_path: /home/breaking_0.7_trained/70_most_difficult/checkpoints
+ coca_caption_loss_weight: 2.0
+ coca_contrastive_loss_weight: 1.0
+ copy_codebase: False
+ csv_caption_key: title
+ csv_img_key: filepath
+ csv_separator:
+ dataset_resampled: True
+ dataset_type: webdataset
+ ddp_static_graph: True
+ debug: False
+ delete_previous_checkpoint: False
+ device: cuda:0
+ dist_backend: nccl
+ dist_url: env://
+ distill: False
+ distill_model: None
+ distill_pretrained: None
+ distributed: True
+ epochs: 5
+ epochs_cooldown: None
+ eps: 1e-06
+ force_custom_text: False
+ force_image_size: None
+ force_patch_dropout: None
+ force_quick_gelu: False
+ gather_with_grad: True
+ grad_checkpointing: True
+ grad_clip_norm: None
+ horovod: False
+ image_mean: None
+ image_std: None
+ imagenet_v2: None
+ imagenet_val: None
+ local_loss: True
+ local_rank: 0
+ lock_image: False
+ lock_image_freeze_bn_stats: False
+ lock_image_unlocked_groups: 0
+ lock_text: False
+ lock_text_freeze_layer_norm: False
+ lock_text_unlocked_layers: 0
+ log_every_n_steps: 100
+ log_level: 20
+ log_local: False
+ log_path: /home/breaking_0.7_trained/70_most_difficult/out.log
+ logs: /home/breaking_0.7_trained
+ lr: 0.0005
+ lr_cooldown_end: 0.0
+ lr_cooldown_power: 1.0
+ lr_scheduler: cosine
+ model: ViT-B-32
+ name: 70_most_difficult
+ no_set_device_rank: False
+ precision: amp
+ pretrained:
+ pretrained_image: False
+ rank: 0
+ remote_sync: None
+ remote_sync_frequency: 300
+ remote_sync_protocol: s3
+ report_to: wandb
+ resume: None
+ save_frequency: 0
+ save_most_recent: True
+ seed: 0
+ skip_scheduler: False
+ tensorboard: False
+ tensorboard_path:
+ torchscript: False
+ trace: False
+ train_data: /home/breaking_0.7/{00000000..00000763}.tar
+ train_data_upsampling_factors: None
+ train_num_samples: 2560000
+ use_bn_sync: False
+ val_data: None
+ val_frequency: 1
+ val_num_samples: None
+ wandb: True
+ wandb_notes:
+ wandb_project_name: clip_text_hq_clusters
+ warmup: 500
+ wd: 0.2
+ workers: 4
+ world_size: 2
+ zeroshot_frequency: 2
breaking_0.9_trained/90_most_difficult/checkpoints/epoch_5.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:385d8998b34593532134cb9e27569b8fa26041c7883f44a2879ac5dd5fb63831
+ size 1815701601
breaking_0.9_trained/90_most_difficult/checkpoints/epoch_latest.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:bf4bcef741c6277c38eda5a8a2b606b7790be2bf823e8e2570387b786a6a4b07
+ size 1815639289
breaking_0.9_trained/90_most_difficult/info.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:497529ead8ce8bbef29d5773f843ad88a6775e7828bc7bf21a40a5acae703618
+ size 321
breaking_0.9_trained/90_most_difficult/out.log ADDED
@@ -0,0 +1,497 @@
+ 2024-09-07,12:35:40 | INFO | No latest resume checkpoint found in /home/breaking_0.9_trained/90_most_difficult/checkpoints.
+ 2024-09-07,12:35:41 | INFO | Running in distributed mode with multiple processes. Device: cuda:0.Process (global: 0, local 0), total 2.
+ 2024-09-07,12:35:41 | INFO | Loaded ViT-B-32 model config.
+ 2024-09-07,12:35:42 | INFO | Model:
5
+ 2024-09-07,12:35:42 | INFO | CLIP(
6
+ (visual): VisionTransformer(
7
+ (patchnorm_pre_ln): Identity()
8
+ (conv1): Conv2d(3, 768, kernel_size=(32, 32), stride=(32, 32), bias=False)
9
+ (patch_dropout): Identity()
10
+ (ln_pre): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
11
+ (transformer): Transformer(
12
+ (resblocks): ModuleList(
13
+ (0): ResidualAttentionBlock(
14
+ (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
15
+ (attn): MultiheadAttention(
16
+ (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
17
+ )
18
+ (ls_1): Identity()
19
+ (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
20
+ (mlp): Sequential(
21
+ (c_fc): Linear(in_features=768, out_features=3072, bias=True)
22
+ (gelu): GELU(approximate='none')
23
+ (c_proj): Linear(in_features=3072, out_features=768, bias=True)
24
+ )
25
+ (ls_2): Identity()
26
+ )
27
+ (1): ResidualAttentionBlock(
28
+ (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
29
+ (attn): MultiheadAttention(
30
+ (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
31
+ )
32
+ (ls_1): Identity()
33
+ (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
34
+ (mlp): Sequential(
35
+ (c_fc): Linear(in_features=768, out_features=3072, bias=True)
36
+ (gelu): GELU(approximate='none')
37
+ (c_proj): Linear(in_features=3072, out_features=768, bias=True)
38
+ )
39
+ (ls_2): Identity()
40
+ )
41
+ (2): ResidualAttentionBlock(
42
+ (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
43
+ (attn): MultiheadAttention(
44
+ (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
45
+ )
46
+ (ls_1): Identity()
47
+ (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
48
+ (mlp): Sequential(
49
+ (c_fc): Linear(in_features=768, out_features=3072, bias=True)
50
+ (gelu): GELU(approximate='none')
51
+ (c_proj): Linear(in_features=3072, out_features=768, bias=True)
52
+ )
53
+ (ls_2): Identity()
54
+ )
55
+ (3): ResidualAttentionBlock(
56
+ (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
57
+ (attn): MultiheadAttention(
58
+ (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
59
+ )
60
+ (ls_1): Identity()
61
+ (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
62
+ (mlp): Sequential(
63
+ (c_fc): Linear(in_features=768, out_features=3072, bias=True)
64
+ (gelu): GELU(approximate='none')
65
+ (c_proj): Linear(in_features=3072, out_features=768, bias=True)
66
+ )
67
+ (ls_2): Identity()
68
+ )
69
+ (4): ResidualAttentionBlock(
70
+ (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
71
+ (attn): MultiheadAttention(
72
+ (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
73
+ )
74
+ (ls_1): Identity()
75
+ (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
76
+ (mlp): Sequential(
77
+ (c_fc): Linear(in_features=768, out_features=3072, bias=True)
78
+ (gelu): GELU(approximate='none')
79
+ (c_proj): Linear(in_features=3072, out_features=768, bias=True)
80
+ )
81
+ (ls_2): Identity()
82
+ )
83
+ (5): ResidualAttentionBlock(
84
+ (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
85
+ (attn): MultiheadAttention(
86
+ (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
87
+ )
88
+ (ls_1): Identity()
89
+ (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
90
+ (mlp): Sequential(
91
+ (c_fc): Linear(in_features=768, out_features=3072, bias=True)
92
+ (gelu): GELU(approximate='none')
93
+ (c_proj): Linear(in_features=3072, out_features=768, bias=True)
94
+ )
95
+ (ls_2): Identity()
96
+ )
97
+ (6): ResidualAttentionBlock(
98
+ (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
99
+ (attn): MultiheadAttention(
100
+ (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
101
+ )
102
+ (ls_1): Identity()
103
+ (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
104
+ (mlp): Sequential(
105
+ (c_fc): Linear(in_features=768, out_features=3072, bias=True)
106
+ (gelu): GELU(approximate='none')
107
+ (c_proj): Linear(in_features=3072, out_features=768, bias=True)
108
+ )
109
+ (ls_2): Identity()
110
+ )
111
+ (7): ResidualAttentionBlock(
112
+ (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
113
+ (attn): MultiheadAttention(
114
+ (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
115
+ )
116
+ (ls_1): Identity()
117
+ (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
118
+ (mlp): Sequential(
119
+ (c_fc): Linear(in_features=768, out_features=3072, bias=True)
120
+ (gelu): GELU(approximate='none')
121
+ (c_proj): Linear(in_features=3072, out_features=768, bias=True)
122
+ )
123
+ (ls_2): Identity()
124
+ )
125
+ (8): ResidualAttentionBlock(
126
+ (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
127
+ (attn): MultiheadAttention(
128
+ (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
129
+ )
130
+ (ls_1): Identity()
131
+ (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
132
+ (mlp): Sequential(
133
+ (c_fc): Linear(in_features=768, out_features=3072, bias=True)
134
+ (gelu): GELU(approximate='none')
135
+ (c_proj): Linear(in_features=3072, out_features=768, bias=True)
136
+ )
137
+ (ls_2): Identity()
138
+ )
139
+ (9): ResidualAttentionBlock(
140
+ (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
141
+ (attn): MultiheadAttention(
142
+ (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
143
+ )
144
+ (ls_1): Identity()
145
+ (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
146
+ (mlp): Sequential(
147
+ (c_fc): Linear(in_features=768, out_features=3072, bias=True)
148
+ (gelu): GELU(approximate='none')
149
+ (c_proj): Linear(in_features=3072, out_features=768, bias=True)
150
+ )
151
+ (ls_2): Identity()
152
+ )
153
+ (10): ResidualAttentionBlock(
154
+ (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
155
+ (attn): MultiheadAttention(
156
+ (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
157
+ )
158
+ (ls_1): Identity()
159
+ (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
160
+ (mlp): Sequential(
161
+ (c_fc): Linear(in_features=768, out_features=3072, bias=True)
162
+ (gelu): GELU(approximate='none')
163
+ (c_proj): Linear(in_features=3072, out_features=768, bias=True)
164
+ )
165
+ (ls_2): Identity()
166
+ )
167
+ (11): ResidualAttentionBlock(
168
+ (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
169
+ (attn): MultiheadAttention(
170
+ (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
171
+ )
172
+ (ls_1): Identity()
173
+ (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
174
+ (mlp): Sequential(
175
+ (c_fc): Linear(in_features=768, out_features=3072, bias=True)
176
+ (gelu): GELU(approximate='none')
177
+ (c_proj): Linear(in_features=3072, out_features=768, bias=True)
178
+ )
179
+ (ls_2): Identity()
180
+ )
181
+ )
182
+ )
183
+ (ln_post): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
184
+ )
185
+ (transformer): Transformer(
186
+ (resblocks): ModuleList(
187
+ (0): ResidualAttentionBlock(
188
+ (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
189
+ (attn): MultiheadAttention(
+ (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
+ )
+ (ls_1): Identity()
+ (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
+ (mlp): Sequential(
+ (c_fc): Linear(in_features=512, out_features=2048, bias=True)
+ (gelu): GELU(approximate='none')
+ (c_proj): Linear(in_features=2048, out_features=512, bias=True)
+ )
+ (ls_2): Identity()
+ )
+ (1): ResidualAttentionBlock(
+ (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
+ (attn): MultiheadAttention(
+ (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
+ )
+ (ls_1): Identity()
+ (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
+ (mlp): Sequential(
+ (c_fc): Linear(in_features=512, out_features=2048, bias=True)
+ (gelu): GELU(approximate='none')
+ (c_proj): Linear(in_features=2048, out_features=512, bias=True)
+ )
+ (ls_2): Identity()
+ )
+ (2): ResidualAttentionBlock(
+ (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
+ (attn): MultiheadAttention(
+ (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
+ )
+ (ls_1): Identity()
+ (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
+ (mlp): Sequential(
+ (c_fc): Linear(in_features=512, out_features=2048, bias=True)
+ (gelu): GELU(approximate='none')
+ (c_proj): Linear(in_features=2048, out_features=512, bias=True)
+ )
+ (ls_2): Identity()
+ )
+ (3): ResidualAttentionBlock(
+ (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
+ (attn): MultiheadAttention(
+ (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
+ )
+ (ls_1): Identity()
+ (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
+ (mlp): Sequential(
+ (c_fc): Linear(in_features=512, out_features=2048, bias=True)
+ (gelu): GELU(approximate='none')
+ (c_proj): Linear(in_features=2048, out_features=512, bias=True)
+ )
+ (ls_2): Identity()
+ )
+ (4): ResidualAttentionBlock(
+ (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
+ (attn): MultiheadAttention(
+ (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
+ )
+ (ls_1): Identity()
+ (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
+ (mlp): Sequential(
+ (c_fc): Linear(in_features=512, out_features=2048, bias=True)
+ (gelu): GELU(approximate='none')
+ (c_proj): Linear(in_features=2048, out_features=512, bias=True)
+ )
+ (ls_2): Identity()
+ )
+ (5): ResidualAttentionBlock(
+ (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
+ (attn): MultiheadAttention(
+ (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
+ )
+ (ls_1): Identity()
+ (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
+ (mlp): Sequential(
+ (c_fc): Linear(in_features=512, out_features=2048, bias=True)
+ (gelu): GELU(approximate='none')
+ (c_proj): Linear(in_features=2048, out_features=512, bias=True)
+ )
+ (ls_2): Identity()
+ )
+ (6): ResidualAttentionBlock(
+ (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
+ (attn): MultiheadAttention(
+ (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
+ )
+ (ls_1): Identity()
+ (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
+ (mlp): Sequential(
+ (c_fc): Linear(in_features=512, out_features=2048, bias=True)
+ (gelu): GELU(approximate='none')
+ (c_proj): Linear(in_features=2048, out_features=512, bias=True)
+ )
+ (ls_2): Identity()
+ )
+ (7): ResidualAttentionBlock(
+ (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
+ (attn): MultiheadAttention(
+ (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
+ )
+ (ls_1): Identity()
+ (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
+ (mlp): Sequential(
+ (c_fc): Linear(in_features=512, out_features=2048, bias=True)
+ (gelu): GELU(approximate='none')
+ (c_proj): Linear(in_features=2048, out_features=512, bias=True)
+ )
+ (ls_2): Identity()
+ )
+ (8): ResidualAttentionBlock(
+ (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
+ (attn): MultiheadAttention(
+ (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
+ )
+ (ls_1): Identity()
+ (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
+ (mlp): Sequential(
+ (c_fc): Linear(in_features=512, out_features=2048, bias=True)
+ (gelu): GELU(approximate='none')
+ (c_proj): Linear(in_features=2048, out_features=512, bias=True)
+ )
+ (ls_2): Identity()
+ )
+ (9): ResidualAttentionBlock(
+ (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
+ (attn): MultiheadAttention(
+ (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
+ )
+ (ls_1): Identity()
+ (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
+ (mlp): Sequential(
+ (c_fc): Linear(in_features=512, out_features=2048, bias=True)
+ (gelu): GELU(approximate='none')
+ (c_proj): Linear(in_features=2048, out_features=512, bias=True)
+ )
+ (ls_2): Identity()
+ )
+ (10): ResidualAttentionBlock(
+ (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
+ (attn): MultiheadAttention(
+ (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
+ )
+ (ls_1): Identity()
+ (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
+ (mlp): Sequential(
+ (c_fc): Linear(in_features=512, out_features=2048, bias=True)
+ (gelu): GELU(approximate='none')
+ (c_proj): Linear(in_features=2048, out_features=512, bias=True)
+ )
+ (ls_2): Identity()
+ )
+ (11): ResidualAttentionBlock(
+ (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
+ (attn): MultiheadAttention(
+ (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
+ )
+ (ls_1): Identity()
+ (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
+ (mlp): Sequential(
+ (c_fc): Linear(in_features=512, out_features=2048, bias=True)
+ (gelu): GELU(approximate='none')
+ (c_proj): Linear(in_features=2048, out_features=512, bias=True)
+ )
+ (ls_2): Identity()
+ )
+ )
+ )
+ (token_embedding): Embedding(49408, 512)
+ (ln_final): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
+ )
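The repr above completes the CLIP text tower for ViT-B-32: twelve residual attention blocks at width 512 (MLP expansion 512 -> 2048 -> 512), a 49408-entry token embedding, and a final LayerNorm. A minimal sketch for rebuilding the same module locally, assuming the open_clip package that produced this log:

import open_clip

# Instantiates ViT-B-32 from its bundled config (random weights);
# printing the model should reproduce the architecture dump above.
model, preprocess_train, preprocess_val = open_clip.create_model_and_transforms("ViT-B-32")
print(model)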
+ 2024-09-07,12:35:42 | INFO | Params:
+ 2024-09-07,12:35:42 | INFO | accum_freq: 1
+ 2024-09-07,12:35:42 | INFO | aug_cfg: {}
+ 2024-09-07,12:35:42 | INFO | batch_size: 2048
+ 2024-09-07,12:35:42 | INFO | beta1: 0.9
+ 2024-09-07,12:35:42 | INFO | beta2: 0.98
+ 2024-09-07,12:35:42 | INFO | checkpoint_path: /home/breaking_0.9_trained/90_most_difficult/checkpoints
+ 2024-09-07,12:35:42 | INFO | coca_caption_loss_weight: 2.0
+ 2024-09-07,12:35:42 | INFO | coca_contrastive_loss_weight: 1.0
+ 2024-09-07,12:35:42 | INFO | copy_codebase: False
+ 2024-09-07,12:35:42 | INFO | csv_caption_key: title
+ 2024-09-07,12:35:42 | INFO | csv_img_key: filepath
+ 2024-09-07,12:35:42 | INFO | csv_separator:
+ 2024-09-07,12:35:42 | INFO | dataset_resampled: True
+ 2024-09-07,12:35:42 | INFO | dataset_type: webdataset
+ 2024-09-07,12:35:42 | INFO | ddp_static_graph: True
+ 2024-09-07,12:35:42 | INFO | debug: False
+ 2024-09-07,12:35:42 | INFO | delete_previous_checkpoint: False
+ 2024-09-07,12:35:42 | INFO | device: cuda:0
+ 2024-09-07,12:35:42 | INFO | dist_backend: nccl
+ 2024-09-07,12:35:42 | INFO | dist_url: env://
+ 2024-09-07,12:35:42 | INFO | distill: False
+ 2024-09-07,12:35:42 | INFO | distill_model: None
+ 2024-09-07,12:35:42 | INFO | distill_pretrained: None
+ 2024-09-07,12:35:42 | INFO | distributed: True
+ 2024-09-07,12:35:42 | INFO | epochs: 5
+ 2024-09-07,12:35:42 | INFO | epochs_cooldown: None
+ 2024-09-07,12:35:42 | INFO | eps: 1e-06
+ 2024-09-07,12:35:42 | INFO | force_custom_text: False
+ 2024-09-07,12:35:42 | INFO | force_image_size: None
+ 2024-09-07,12:35:42 | INFO | force_patch_dropout: None
+ 2024-09-07,12:35:42 | INFO | force_quick_gelu: False
+ 2024-09-07,12:35:42 | INFO | gather_with_grad: True
+ 2024-09-07,12:35:42 | INFO | grad_checkpointing: True
+ 2024-09-07,12:35:42 | INFO | grad_clip_norm: None
+ 2024-09-07,12:35:42 | INFO | horovod: False
+ 2024-09-07,12:35:42 | INFO | image_mean: None
+ 2024-09-07,12:35:42 | INFO | image_std: None
+ 2024-09-07,12:35:42 | INFO | imagenet_v2: None
+ 2024-09-07,12:35:42 | INFO | imagenet_val: None
+ 2024-09-07,12:35:42 | INFO | local_loss: True
+ 2024-09-07,12:35:42 | INFO | local_rank: 0
+ 2024-09-07,12:35:42 | INFO | lock_image: False
+ 2024-09-07,12:35:42 | INFO | lock_image_freeze_bn_stats: False
+ 2024-09-07,12:35:42 | INFO | lock_image_unlocked_groups: 0
+ 2024-09-07,12:35:42 | INFO | lock_text: False
+ 2024-09-07,12:35:42 | INFO | lock_text_freeze_layer_norm: False
+ 2024-09-07,12:35:42 | INFO | lock_text_unlocked_layers: 0
+ 2024-09-07,12:35:42 | INFO | log_every_n_steps: 100
+ 2024-09-07,12:35:42 | INFO | log_level: 20
+ 2024-09-07,12:35:42 | INFO | log_local: False
+ 2024-09-07,12:35:42 | INFO | log_path: /home/breaking_0.9_trained/90_most_difficult/out.log
+ 2024-09-07,12:35:42 | INFO | logs: /home/breaking_0.9_trained
+ 2024-09-07,12:35:42 | INFO | lr: 0.0005
+ 2024-09-07,12:35:42 | INFO | lr_cooldown_end: 0.0
+ 2024-09-07,12:35:42 | INFO | lr_cooldown_power: 1.0
+ 2024-09-07,12:35:42 | INFO | lr_scheduler: cosine
+ 2024-09-07,12:35:42 | INFO | model: ViT-B-32
+ 2024-09-07,12:35:42 | INFO | name: 90_most_difficult
+ 2024-09-07,12:35:42 | INFO | no_set_device_rank: False
+ 2024-09-07,12:35:42 | INFO | precision: amp
+ 2024-09-07,12:35:42 | INFO | pretrained:
+ 2024-09-07,12:35:42 | INFO | pretrained_image: False
+ 2024-09-07,12:35:42 | INFO | rank: 0
+ 2024-09-07,12:35:42 | INFO | remote_sync: None
+ 2024-09-07,12:35:42 | INFO | remote_sync_frequency: 300
+ 2024-09-07,12:35:42 | INFO | remote_sync_protocol: s3
+ 2024-09-07,12:35:42 | INFO | report_to: wandb
+ 2024-09-07,12:35:42 | INFO | resume: None
+ 2024-09-07,12:35:42 | INFO | save_frequency: 0
+ 2024-09-07,12:35:42 | INFO | save_most_recent: True
+ 2024-09-07,12:35:42 | INFO | seed: 0
+ 2024-09-07,12:35:42 | INFO | skip_scheduler: False
+ 2024-09-07,12:35:42 | INFO | tensorboard: False
+ 2024-09-07,12:35:42 | INFO | tensorboard_path:
+ 2024-09-07,12:35:42 | INFO | torchscript: False
+ 2024-09-07,12:35:42 | INFO | trace: False
+ 2024-09-07,12:35:42 | INFO | train_data: /home/breaking_0.9/{00000000..00000962}.tar
+ 2024-09-07,12:35:42 | INFO | train_data_upsampling_factors: None
+ 2024-09-07,12:35:42 | INFO | train_num_samples: 2560000
+ 2024-09-07,12:35:42 | INFO | use_bn_sync: False
+ 2024-09-07,12:35:42 | INFO | val_data: None
+ 2024-09-07,12:35:42 | INFO | val_frequency: 1
+ 2024-09-07,12:35:42 | INFO | val_num_samples: None
+ 2024-09-07,12:35:42 | INFO | wandb: True
+ 2024-09-07,12:35:42 | INFO | wandb_notes:
+ 2024-09-07,12:35:42 | INFO | wandb_project_name: clip_text_hq_clusters
+ 2024-09-07,12:35:42 | INFO | warmup: 500
+ 2024-09-07,12:35:42 | INFO | wd: 0.2
+ 2024-09-07,12:35:42 | INFO | workers: 4
+ 2024-09-07,12:35:42 | INFO | world_size: 2
+ 2024-09-07,12:35:42 | INFO | zeroshot_frequency: 2
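Note that train_num_samples is 2,560,000, yet the progress lines below report 2,572,288 samples per epoch. A sketch of the rounding that plausibly explains this, mirroring open_clip's webdataset pipeline (an assumption: each of the 4 dataloader workers per rank is given a whole number of batches):

import math

batch_size, world_size, workers = 2048, 2, 4       # from the params above
global_batch = batch_size * world_size             # 4096 samples per optimizer step
num_batches = math.ceil(2_560_000 / global_batch)  # 625
worker_batches = math.ceil(num_batches / workers)  # 157 batches per worker
steps_per_epoch = worker_batches * workers         # 628 steps per epoch
print(steps_per_epoch * global_batch)              # 2572288, as logged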
+ 2024-09-07,12:35:50 | INFO | Start epoch 0
+ 2024-09-07,12:36:06 | INFO | Train Epoch: 0 [ 4096/2572288 (0%)] Data (t): 12.100 Batch (t): 16.653, 245.968/s, 122.984/s/gpu LR: 0.000001 Logit Scale: 14.286 Contrastive_loss: 8.3763 (8.3763) Loss: 8.3763 (8.3763)
+ 2024-09-07,12:36:10 | INFO | Reducer buckets have been rebuilt in this iteration.
+ 2024-09-07,12:40:27 | INFO | Train Epoch: 0 [ 413696/2572288 (16%)] Data (t): 0.552 Batch (t): 2.604, 1566.14/s, 783.069/s/gpu LR: 0.000101 Logit Scale: 14.266 Contrastive_loss: 8.1784 (8.2774) Loss: 8.1784 (8.2774)
+ 2024-09-07,12:44:48 | INFO | Train Epoch: 0 [ 823296/2572288 (32%)] Data (t): 0.564 Batch (t): 2.612, 1563.16/s, 781.581/s/gpu LR: 0.000201 Logit Scale: 14.228 Contrastive_loss: 7.9988 (8.1845) Loss: 7.9988 (8.1845)
+ 2024-09-07,12:49:09 | INFO | Train Epoch: 0 [1232896/2572288 (48%)] Data (t): 0.563 Batch (t): 2.613, 1564.24/s, 782.121/s/gpu LR: 0.000301 Logit Scale: 14.184 Contrastive_loss: 7.9886 (8.1355) Loss: 7.9886 (8.1355)
+ 2024-09-07,12:53:31 | INFO | Train Epoch: 0 [1642496/2572288 (64%)] Data (t): 0.563 Batch (t): 2.612, 1570.66/s, 785.331/s/gpu LR: 0.000401 Logit Scale: 14.136 Contrastive_loss: 7.8946 (8.0873) Loss: 7.8946 (8.0873)
+ 2024-09-07,12:57:52 | INFO | Train Epoch: 0 [2052096/2572288 (80%)] Data (t): 0.559 Batch (t): 2.609, 1567.63/s, 783.816/s/gpu LR: 0.000500 Logit Scale: 14.088 Contrastive_loss: 7.8069 (8.0406) Loss: 7.8069 (8.0406)
+ 2024-09-07,13:02:13 | INFO | Train Epoch: 0 [2461696/2572288 (96%)] Data (t): 0.560 Batch (t): 2.611, 1576.40/s, 788.198/s/gpu LR: 0.000498 Logit Scale: 14.064 Contrastive_loss: 7.7242 (7.9954) Loss: 7.7242 (7.9954)
+ 2024-09-07,13:03:23 | INFO | Train Epoch: 0 [2572288/2572288 (100%)] Data (t): 0.557 Batch (t): 2.607, 1578.68/s, 789.338/s/gpu LR: 0.000497 Logit Scale: 14.063 Contrastive_loss: 7.6876 (7.9569) Loss: 7.6876 (7.9569)
+ 2024-09-07,13:03:26 | INFO | Start epoch 1
+ 2024-09-07,13:03:37 | INFO | Train Epoch: 1 [ 4096/2572288 (0%)] Data (t): 9.636 Batch (t): 11.680, 350.674/s, 175.337/s/gpu LR: 0.000497 Logit Scale: 14.063 Contrastive_loss: 7.6917 (7.6917) Loss: 7.6917 (7.6917)
+ 2024-09-07,13:07:57 | INFO | Train Epoch: 1 [ 413696/2572288 (16%)] Data (t): 0.538 Batch (t): 2.597, 1573.69/s, 786.847/s/gpu LR: 0.000491 Logit Scale: 14.065 Contrastive_loss: 7.6440 (7.6679) Loss: 7.6440 (7.6679)
+ 2024-09-07,13:12:18 | INFO | Train Epoch: 1 [ 823296/2572288 (32%)] Data (t): 0.557 Batch (t): 2.609, 1572.22/s, 786.112/s/gpu LR: 0.000481 Logit Scale: 14.094 Contrastive_loss: 7.5110 (7.6156) Loss: 7.5110 (7.6156)
+ 2024-09-07,13:16:39 | INFO | Train Epoch: 1 [1232896/2572288 (48%)] Data (t): 0.557 Batch (t): 2.609, 1571.16/s, 785.581/s/gpu LR: 0.000468 Logit Scale: 14.146 Contrastive_loss: 7.5073 (7.5885) Loss: 7.5073 (7.5885)
+ 2024-09-07,13:20:59 | INFO | Train Epoch: 1 [1642496/2572288 (64%)] Data (t): 0.557 Batch (t): 2.607, 1575.14/s, 787.570/s/gpu LR: 0.000452 Logit Scale: 14.215 Contrastive_loss: 7.3952 (7.5499) Loss: 7.3952 (7.5499)
+ 2024-09-07,13:25:20 | INFO | Train Epoch: 1 [2052096/2572288 (80%)] Data (t): 0.560 Batch (t): 2.610, 1553.88/s, 776.941/s/gpu LR: 0.000433 Logit Scale: 14.321 Contrastive_loss: 7.3651 (7.5191) Loss: 7.3651 (7.5191)
+ 2024-09-07,13:29:42 | INFO | Train Epoch: 1 [2461696/2572288 (96%)] Data (t): 0.560 Batch (t): 2.611, 1564.99/s, 782.493/s/gpu LR: 0.000412 Logit Scale: 14.443 Contrastive_loss: 7.3117 (7.4894) Loss: 7.3117 (7.4894)
+ 2024-09-07,13:30:52 | INFO | Train Epoch: 1 [2572288/2572288 (100%)] Data (t): 0.559 Batch (t): 2.607, 1581.65/s, 790.826/s/gpu LR: 0.000406 Logit Scale: 14.479 Contrastive_loss: 7.1874 (7.4517) Loss: 7.1874 (7.4517)
+ 2024-09-07,13:30:55 | INFO | Start epoch 2
+ 2024-09-07,13:31:06 | INFO | Train Epoch: 2 [ 4096/2572288 (0%)] Data (t): 9.545 Batch (t): 11.591, 353.370/s, 176.685/s/gpu LR: 0.000405 Logit Scale: 14.481 Contrastive_loss: 7.2456 (7.2456) Loss: 7.2456 (7.2456)
+ 2024-09-07,13:35:27 | INFO | Train Epoch: 2 [ 413696/2572288 (16%)] Data (t): 0.545 Batch (t): 2.603, 1569.79/s, 784.896/s/gpu LR: 0.000381 Logit Scale: 14.642 Contrastive_loss: 7.2023 (7.2240) Loss: 7.2023 (7.2240)
+ 2024-09-07,13:39:47 | INFO | Train Epoch: 2 [ 823296/2572288 (32%)] Data (t): 0.559 Batch (t): 2.609, 1571.68/s, 785.841/s/gpu LR: 0.000355 Logit Scale: 14.807 Contrastive_loss: 7.0309 (7.1596) Loss: 7.0309 (7.1596)
+ 2024-09-07,13:44:09 | INFO | Train Epoch: 2 [1232896/2572288 (48%)] Data (t): 0.562 Batch (t): 2.611, 1565.52/s, 782.758/s/gpu LR: 0.000327 Logit Scale: 14.927 Contrastive_loss: 7.1046 (7.1459) Loss: 7.1046 (7.1459)
+ 2024-09-07,13:48:30 | INFO | Train Epoch: 2 [1642496/2572288 (64%)] Data (t): 0.563 Batch (t): 2.614, 1569.29/s, 784.644/s/gpu LR: 0.000298 Logit Scale: 15.085 Contrastive_loss: 6.8606 (7.0888) Loss: 6.8606 (7.0888)
+ 2024-09-07,13:52:51 | INFO | Train Epoch: 2 [2052096/2572288 (80%)] Data (t): 0.562 Batch (t): 2.614, 1568.08/s, 784.039/s/gpu LR: 0.000269 Logit Scale: 15.223 Contrastive_loss: 6.8216 (7.0443) Loss: 6.8216 (7.0443)
+ 2024-09-07,13:57:13 | INFO | Train Epoch: 2 [2461696/2572288 (96%)] Data (t): 0.563 Batch (t): 2.613, 1567.16/s, 783.578/s/gpu LR: 0.000239 Logit Scale: 15.374 Contrastive_loss: 6.6735 (6.9913) Loss: 6.6735 (6.9913)
+ 2024-09-07,13:58:23 | INFO | Train Epoch: 2 [2572288/2572288 (100%)] Data (t): 0.561 Batch (t): 2.611, 1576.59/s, 788.293/s/gpu LR: 0.000231 Logit Scale: 15.422 Contrastive_loss: 6.7912 (6.9663) Loss: 6.7912 (6.9663)
+ 2024-09-07,13:58:26 | INFO | Start epoch 3
+ 2024-09-07,13:58:37 | INFO | Train Epoch: 3 [ 4096/2572288 (0%)] Data (t): 9.438 Batch (t): 11.480, 356.781/s, 178.391/s/gpu LR: 0.000231 Logit Scale: 15.423 Contrastive_loss: 6.6685 (6.6685) Loss: 6.6685 (6.6685)
+ 2024-09-07,14:02:58 | INFO | Train Epoch: 3 [ 413696/2572288 (16%)] Data (t): 0.547 Batch (t): 2.606, 1563.69/s, 781.844/s/gpu LR: 0.000202 Logit Scale: 15.573 Contrastive_loss: 6.7789 (6.7237) Loss: 6.7789 (6.7237)
+ 2024-09-07,14:07:19 | INFO | Train Epoch: 3 [ 823296/2572288 (32%)] Data (t): 0.560 Batch (t): 2.613, 1571.76/s, 785.878/s/gpu LR: 0.000173 Logit Scale: 15.734 Contrastive_loss: 6.6477 (6.6984) Loss: 6.6477 (6.6984)
+ 2024-09-07,14:11:41 | INFO | Train Epoch: 3 [1232896/2572288 (48%)] Data (t): 0.561 Batch (t): 2.613, 1567.31/s, 783.654/s/gpu LR: 0.000145 Logit Scale: 15.861 Contrastive_loss: 6.5687 (6.6660) Loss: 6.5687 (6.6660)
+ 2024-09-07,14:16:02 | INFO | Train Epoch: 3 [1642496/2572288 (64%)] Data (t): 0.563 Batch (t): 2.614, 1571.02/s, 785.509/s/gpu LR: 0.000119 Logit Scale: 15.976 Contrastive_loss: 6.6244 (6.6576) Loss: 6.6244 (6.6576)
+ 2024-09-07,14:20:23 | INFO | Train Epoch: 3 [2052096/2572288 (80%)] Data (t): 0.562 Batch (t): 2.614, 1569.91/s, 784.953/s/gpu LR: 0.000095 Logit Scale: 16.078 Contrastive_loss: 6.3511 (6.6066) Loss: 6.3511 (6.6066)
+ 2024-09-07,14:24:45 | INFO | Train Epoch: 3 [2461696/2572288 (96%)] Data (t): 0.565 Batch (t): 2.617, 1565.69/s, 782.846/s/gpu LR: 0.000072 Logit Scale: 16.172 Contrastive_loss: 6.3930 (6.5761) Loss: 6.3930 (6.5761)
+ 2024-09-07,14:25:56 | INFO | Train Epoch: 3 [2572288/2572288 (100%)] Data (t): 0.559 Batch (t): 2.610, 1579.64/s, 789.822/s/gpu LR: 0.000067 Logit Scale: 16.193 Contrastive_loss: 6.6402 (6.5841) Loss: 6.6402 (6.5841)
+ 2024-09-07,14:25:58 | INFO | Start epoch 4
+ 2024-09-07,14:26:10 | INFO | Train Epoch: 4 [ 4096/2572288 (0%)] Data (t): 9.504 Batch (t): 11.549, 354.649/s, 177.325/s/gpu LR: 0.000067 Logit Scale: 16.193 Contrastive_loss: 6.5566 (6.5566) Loss: 6.5566 (6.5566)
+ 2024-09-07,14:30:31 | INFO | Train Epoch: 4 [ 413696/2572288 (16%)] Data (t): 0.546 Batch (t): 2.607, 1566.76/s, 783.382/s/gpu LR: 0.000048 Logit Scale: 16.260 Contrastive_loss: 6.4124 (6.4845) Loss: 6.4124 (6.4845)
+ 2024-09-07,14:34:52 | INFO | Train Epoch: 4 [ 823296/2572288 (32%)] Data (t): 0.563 Batch (t): 2.615, 1552.31/s, 776.154/s/gpu LR: 0.000032 Logit Scale: 16.300 Contrastive_loss: 6.3687 (6.4459) Loss: 6.3687 (6.4459)
+ 2024-09-07,14:39:14 | INFO | Train Epoch: 4 [1232896/2572288 (48%)] Data (t): 0.563 Batch (t): 2.617, 1567.87/s, 783.936/s/gpu LR: 0.000019 Logit Scale: 16.329 Contrastive_loss: 6.3193 (6.4142) Loss: 6.3193 (6.4142)
+ 2024-09-07,14:43:35 | INFO | Train Epoch: 4 [1642496/2572288 (64%)] Data (t): 0.561 Batch (t): 2.613, 1567.16/s, 783.580/s/gpu LR: 0.000009 Logit Scale: 16.343 Contrastive_loss: 6.3362 (6.3986) Loss: 6.3362 (6.3986)
+ 2024-09-07,14:47:57 | INFO | Train Epoch: 4 [2052096/2572288 (80%)] Data (t): 0.566 Batch (t): 2.618, 1566.95/s, 783.475/s/gpu LR: 0.000003 Logit Scale: 16.350 Contrastive_loss: 6.1241 (6.3529) Loss: 6.1241 (6.3529)
+ 2024-09-07,14:52:19 | INFO | Train Epoch: 4 [2461696/2572288 (96%)] Data (t): 0.564 Batch (t): 2.618, 1557.86/s, 778.931/s/gpu LR: 0.000000 Logit Scale: 16.352 Contrastive_loss: 6.2534 (6.3387) Loss: 6.2534 (6.3387)
+ 2024-09-07,14:53:29 | INFO | Train Epoch: 4 [2572288/2572288 (100%)] Data (t): 0.559 Batch (t): 2.611, 1577.81/s, 788.906/s/gpu LR: 0.000000 Logit Scale: 16.352 Contrastive_loss: 6.4079 (6.3473) Loss: 6.4079 (6.3473)
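The logged learning rates trace a 500-step linear warmup into a cosine decay over the whole run (628 steps/epoch x 5 epochs = 3140 steps). A sketch that reproduces them, assuming open_clip's standard warmup-then-cosine scheduler:

import math

lr, warmup, total_steps = 5e-4, 500, 628 * 5

def lr_at(step):
    # Linear warmup to the peak LR, then cosine decay to zero.
    if step < warmup:
        return lr * (step + 1) / warmup
    frac = (step - warmup) / (total_steps - warmup)
    return 0.5 * (1 + math.cos(math.pi * frac)) * lr

print(f"{lr_at(0):.6f}")     # 0.000001 -> epoch 0, first step
print(f"{lr_at(100):.6f}")   # 0.000101 -> epoch 0 at 16%
print(f"{lr_at(728):.6f}")   # 0.000491 -> epoch 1 at 16% (step 628 + 100)
print(f"{lr_at(3139):.6f}")  # 0.000000 -> last step of epoch 4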
breaking_0.9_trained/90_most_difficult/params.txt ADDED
@@ -0,0 +1,91 @@
+ accum_freq: 1
+ aug_cfg: {}
+ batch_size: 2048
+ beta1: 0.9
+ beta2: 0.98
+ checkpoint_path: /home/breaking_0.9_trained/90_most_difficult/checkpoints
+ coca_caption_loss_weight: 2.0
+ coca_contrastive_loss_weight: 1.0
+ copy_codebase: False
+ csv_caption_key: title
+ csv_img_key: filepath
+ csv_separator:
+ dataset_resampled: True
+ dataset_type: webdataset
+ ddp_static_graph: True
+ debug: False
+ delete_previous_checkpoint: False
+ device: cuda:0
+ dist_backend: nccl
+ dist_url: env://
+ distill: False
+ distill_model: None
+ distill_pretrained: None
+ distributed: True
+ epochs: 5
+ epochs_cooldown: None
+ eps: 1e-06
+ force_custom_text: False
+ force_image_size: None
+ force_patch_dropout: None
+ force_quick_gelu: False
+ gather_with_grad: True
+ grad_checkpointing: True
+ grad_clip_norm: None
+ horovod: False
+ image_mean: None
+ image_std: None
+ imagenet_v2: None
+ imagenet_val: None
+ local_loss: True
+ local_rank: 0
+ lock_image: False
+ lock_image_freeze_bn_stats: False
+ lock_image_unlocked_groups: 0
+ lock_text: False
+ lock_text_freeze_layer_norm: False
+ lock_text_unlocked_layers: 0
+ log_every_n_steps: 100
+ log_level: 20
+ log_local: False
+ log_path: /home/breaking_0.9_trained/90_most_difficult/out.log
+ logs: /home/breaking_0.9_trained
+ lr: 0.0005
+ lr_cooldown_end: 0.0
+ lr_cooldown_power: 1.0
+ lr_scheduler: cosine
+ model: ViT-B-32
+ name: 90_most_difficult
+ no_set_device_rank: False
+ precision: amp
+ pretrained:
+ pretrained_image: False
+ rank: 0
+ remote_sync: None
+ remote_sync_frequency: 300
+ remote_sync_protocol: s3
+ report_to: wandb
+ resume: None
+ save_frequency: 0
+ save_most_recent: True
+ seed: 0
+ skip_scheduler: False
+ tensorboard: False
+ tensorboard_path:
+ torchscript: False
+ trace: False
+ train_data: /home/breaking_0.9/{00000000..00000962}.tar
+ train_data_upsampling_factors: None
+ train_num_samples: 2560000
+ use_bn_sync: False
+ val_data: None
+ val_frequency: 1
+ val_num_samples: None
+ wandb: True
+ wandb_notes:
+ wandb_project_name: clip_text_hq_clusters
+ warmup: 500
+ wd: 0.2
+ workers: 4
+ world_size: 2
+ zeroshot_frequency: 2
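These params files accompany the checkpoints committed alongside them. A hedged sketch for loading one of the epoch checkpoints back into a model; the dict layout ('epoch', 'name', 'state_dict', 'optimizer') and the DDP 'module.' key prefix are assumptions based on open_clip's training script, and the path is illustrative:

import torch
import open_clip

model, _, _ = open_clip.create_model_and_transforms("ViT-B-32")
ckpt = torch.load(
    "breaking_0.9_trained/90_most_difficult/checkpoints/epoch_latest.pt",
    map_location="cpu",
)
# Strip the DistributedDataParallel prefix before loading (Python 3.9+).
state_dict = {k.removeprefix("module."): v for k, v in ckpt["state_dict"].items()}
model.load_state_dict(state_dict)
model.eval()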