Add files using upload-large-folder tool
medium/low_inter_only/out.log
ADDED
@@ -0,0 +1,492 @@
+2025-04-29,17:42:33 | INFO | No latest resume checkpoint found in /mnt/personal/zhudongy/datacomp_results/medium/low_inter_only/checkpoints.
+2025-04-29,17:42:35 | INFO | Running in distributed mode with multiple processes. Device: cuda:0.Process (global: 0, local 0), total 2.
+2025-04-29,17:42:35 | INFO | Loaded ViT-B-32 model config.
+2025-04-29,17:42:36 | INFO | Model:
+2025-04-29,17:42:36 | INFO | CLIP(
+  (visual): VisionTransformer(
+    (patchnorm_pre_ln): Identity()
+    (conv1): Conv2d(3, 768, kernel_size=(32, 32), stride=(32, 32), bias=False)
+    (patch_dropout): Identity()
+    (ln_pre): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+    (transformer): Transformer(
+      (resblocks): ModuleList(
+        (0): ResidualAttentionBlock(
+          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+          (attn): MultiheadAttention(
+            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
+          )
+          (ls_1): Identity()
+          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+          (mlp): Sequential(
+            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
+            (gelu): GELU(approximate='none')
+            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
+          )
+          (ls_2): Identity()
+        )
+        (1): ResidualAttentionBlock(
+          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+          (attn): MultiheadAttention(
+            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
+          )
+          (ls_1): Identity()
+          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+          (mlp): Sequential(
+            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
+            (gelu): GELU(approximate='none')
+            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
+          )
+          (ls_2): Identity()
+        )
+        (2): ResidualAttentionBlock(
+          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+          (attn): MultiheadAttention(
+            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
+          )
+          (ls_1): Identity()
+          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+          (mlp): Sequential(
+            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
+            (gelu): GELU(approximate='none')
+            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
+          )
+          (ls_2): Identity()
+        )
+        (3): ResidualAttentionBlock(
+          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+          (attn): MultiheadAttention(
+            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
+          )
+          (ls_1): Identity()
+          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+          (mlp): Sequential(
+            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
+            (gelu): GELU(approximate='none')
+            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
+          )
+          (ls_2): Identity()
+        )
+        (4): ResidualAttentionBlock(
+          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+          (attn): MultiheadAttention(
+            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
+          )
+          (ls_1): Identity()
+          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+          (mlp): Sequential(
+            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
+            (gelu): GELU(approximate='none')
+            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
+          )
+          (ls_2): Identity()
+        )
+        (5): ResidualAttentionBlock(
+          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+          (attn): MultiheadAttention(
+            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
+          )
+          (ls_1): Identity()
+          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+          (mlp): Sequential(
+            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
+            (gelu): GELU(approximate='none')
+            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
+          )
+          (ls_2): Identity()
+        )
+        (6): ResidualAttentionBlock(
+          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+          (attn): MultiheadAttention(
+            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
+          )
+          (ls_1): Identity()
+          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+          (mlp): Sequential(
+            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
+            (gelu): GELU(approximate='none')
+            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
+          )
+          (ls_2): Identity()
+        )
+        (7): ResidualAttentionBlock(
+          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+          (attn): MultiheadAttention(
+            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
+          )
+          (ls_1): Identity()
+          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+          (mlp): Sequential(
+            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
+            (gelu): GELU(approximate='none')
+            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
+          )
+          (ls_2): Identity()
+        )
+        (8): ResidualAttentionBlock(
+          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+          (attn): MultiheadAttention(
+            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
+          )
+          (ls_1): Identity()
+          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+          (mlp): Sequential(
+            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
+            (gelu): GELU(approximate='none')
+            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
+          )
+          (ls_2): Identity()
+        )
+        (9): ResidualAttentionBlock(
+          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+          (attn): MultiheadAttention(
+            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
+          )
+          (ls_1): Identity()
+          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+          (mlp): Sequential(
+            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
+            (gelu): GELU(approximate='none')
+            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
+          )
+          (ls_2): Identity()
+        )
+        (10): ResidualAttentionBlock(
+          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+          (attn): MultiheadAttention(
+            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
+          )
+          (ls_1): Identity()
+          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+          (mlp): Sequential(
+            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
+            (gelu): GELU(approximate='none')
+            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
+          )
+          (ls_2): Identity()
+        )
+        (11): ResidualAttentionBlock(
+          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+          (attn): MultiheadAttention(
+            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
+          )
+          (ls_1): Identity()
+          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+          (mlp): Sequential(
+            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
+            (gelu): GELU(approximate='none')
+            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
+          )
+          (ls_2): Identity()
+        )
+      )
+    )
+    (ln_post): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
+  )
+  (transformer): Transformer(
+    (resblocks): ModuleList(
+      (0): ResidualAttentionBlock(
+        (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
+        (attn): MultiheadAttention(
+          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
+        )
+        (ls_1): Identity()
+        (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
+        (mlp): Sequential(
+          (c_fc): Linear(in_features=512, out_features=2048, bias=True)
+          (gelu): GELU(approximate='none')
+          (c_proj): Linear(in_features=2048, out_features=512, bias=True)
+        )
+        (ls_2): Identity()
+      )
+      (1): ResidualAttentionBlock(
+        (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
+        (attn): MultiheadAttention(
+          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
+        )
+        (ls_1): Identity()
+        (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
+        (mlp): Sequential(
+          (c_fc): Linear(in_features=512, out_features=2048, bias=True)
+          (gelu): GELU(approximate='none')
+          (c_proj): Linear(in_features=2048, out_features=512, bias=True)
+        )
+        (ls_2): Identity()
+      )
+      (2): ResidualAttentionBlock(
+        (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
+        (attn): MultiheadAttention(
+          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
+        )
+        (ls_1): Identity()
+        (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
+        (mlp): Sequential(
+          (c_fc): Linear(in_features=512, out_features=2048, bias=True)
+          (gelu): GELU(approximate='none')
+          (c_proj): Linear(in_features=2048, out_features=512, bias=True)
+        )
+        (ls_2): Identity()
+      )
+      (3): ResidualAttentionBlock(
+        (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
+        (attn): MultiheadAttention(
+          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
+        )
+        (ls_1): Identity()
+        (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
+        (mlp): Sequential(
+          (c_fc): Linear(in_features=512, out_features=2048, bias=True)
+          (gelu): GELU(approximate='none')
+          (c_proj): Linear(in_features=2048, out_features=512, bias=True)
+        )
+        (ls_2): Identity()
+      )
+      (4): ResidualAttentionBlock(
+        (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
+        (attn): MultiheadAttention(
+          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
+        )
+        (ls_1): Identity()
+        (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
+        (mlp): Sequential(
+          (c_fc): Linear(in_features=512, out_features=2048, bias=True)
+          (gelu): GELU(approximate='none')
+          (c_proj): Linear(in_features=2048, out_features=512, bias=True)
+        )
+        (ls_2): Identity()
+      )
+      (5): ResidualAttentionBlock(
+        (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
+        (attn): MultiheadAttention(
+          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
+        )
+        (ls_1): Identity()
+        (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
+        (mlp): Sequential(
+          (c_fc): Linear(in_features=512, out_features=2048, bias=True)
+          (gelu): GELU(approximate='none')
+          (c_proj): Linear(in_features=2048, out_features=512, bias=True)
+        )
+        (ls_2): Identity()
+      )
+      (6): ResidualAttentionBlock(
+        (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
+        (attn): MultiheadAttention(
+          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
+        )
+        (ls_1): Identity()
+        (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
+        (mlp): Sequential(
+          (c_fc): Linear(in_features=512, out_features=2048, bias=True)
+          (gelu): GELU(approximate='none')
+          (c_proj): Linear(in_features=2048, out_features=512, bias=True)
+        )
+        (ls_2): Identity()
+      )
+      (7): ResidualAttentionBlock(
+        (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
+        (attn): MultiheadAttention(
+          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
+        )
+        (ls_1): Identity()
+        (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
+        (mlp): Sequential(
+          (c_fc): Linear(in_features=512, out_features=2048, bias=True)
+          (gelu): GELU(approximate='none')
+          (c_proj): Linear(in_features=2048, out_features=512, bias=True)
+        )
+        (ls_2): Identity()
+      )
+      (8): ResidualAttentionBlock(
+        (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
+        (attn): MultiheadAttention(
+          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
+        )
+        (ls_1): Identity()
+        (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
+        (mlp): Sequential(
+          (c_fc): Linear(in_features=512, out_features=2048, bias=True)
+          (gelu): GELU(approximate='none')
+          (c_proj): Linear(in_features=2048, out_features=512, bias=True)
+        )
+        (ls_2): Identity()
+      )
+      (9): ResidualAttentionBlock(
+        (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
+        (attn): MultiheadAttention(
+          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
+        )
+        (ls_1): Identity()
+        (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
+        (mlp): Sequential(
+          (c_fc): Linear(in_features=512, out_features=2048, bias=True)
+          (gelu): GELU(approximate='none')
+          (c_proj): Linear(in_features=2048, out_features=512, bias=True)
+        )
+        (ls_2): Identity()
+      )
+      (10): ResidualAttentionBlock(
+        (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
+        (attn): MultiheadAttention(
+          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
+        )
+        (ls_1): Identity()
+        (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
+        (mlp): Sequential(
+          (c_fc): Linear(in_features=512, out_features=2048, bias=True)
+          (gelu): GELU(approximate='none')
+          (c_proj): Linear(in_features=2048, out_features=512, bias=True)
+        )
+        (ls_2): Identity()
+      )
+      (11): ResidualAttentionBlock(
+        (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
+        (attn): MultiheadAttention(
+          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
+        )
+        (ls_1): Identity()
+        (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
+        (mlp): Sequential(
+          (c_fc): Linear(in_features=512, out_features=2048, bias=True)
+          (gelu): GELU(approximate='none')
+          (c_proj): Linear(in_features=2048, out_features=512, bias=True)
+        )
+        (ls_2): Identity()
+      )
+    )
+  )
+  (token_embedding): Embedding(49408, 512)
+  (ln_final): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
+)
+2025-04-29,17:42:36 | INFO | Params:
+2025-04-29,17:42:36 | INFO | accum_freq: 1
+2025-04-29,17:42:36 | INFO | aug_cfg: {}
+2025-04-29,17:42:36 | INFO | batch_size: 2048
+2025-04-29,17:42:36 | INFO | beta1: 0.9
+2025-04-29,17:42:36 | INFO | beta2: 0.98
+2025-04-29,17:42:36 | INFO | checkpoint_path: /mnt/personal/zhudongy/datacomp_results/medium/low_inter_only/checkpoints
+2025-04-29,17:42:36 | INFO | coca_caption_loss_weight: 2.0
+2025-04-29,17:42:36 | INFO | coca_contrastive_loss_weight: 1.0
+2025-04-29,17:42:36 | INFO | copy_codebase: False
+2025-04-29,17:42:36 | INFO | csv_caption_key: title
+2025-04-29,17:42:36 | INFO | csv_img_key: filepath
+2025-04-29,17:42:36 | INFO | csv_separator:
+2025-04-29,17:42:36 | INFO | dataset_resampled: True
+2025-04-29,17:42:36 | INFO | dataset_type: webdataset
+2025-04-29,17:42:36 | INFO | ddp_static_graph: True
+2025-04-29,17:42:36 | INFO | debug: False
+2025-04-29,17:42:36 | INFO | delete_previous_checkpoint: False
+2025-04-29,17:42:36 | INFO | device: cuda:0
+2025-04-29,17:42:36 | INFO | dist_backend: nccl
+2025-04-29,17:42:36 | INFO | dist_url: env://
+2025-04-29,17:42:36 | INFO | distill: False
+2025-04-29,17:42:36 | INFO | distill_model: None
+2025-04-29,17:42:36 | INFO | distill_pretrained: None
+2025-04-29,17:42:36 | INFO | distributed: True
+2025-04-29,17:42:36 | INFO | epochs: 8
+2025-04-29,17:42:36 | INFO | epochs_cooldown: None
+2025-04-29,17:42:36 | INFO | eps: 1e-06
+2025-04-29,17:42:36 | INFO | force_custom_text: False
+2025-04-29,17:42:36 | INFO | force_image_size: None
+2025-04-29,17:42:36 | INFO | force_patch_dropout: None
+2025-04-29,17:42:36 | INFO | force_quick_gelu: False
+2025-04-29,17:42:36 | INFO | gather_with_grad: True
+2025-04-29,17:42:36 | INFO | grad_checkpointing: True
+2025-04-29,17:42:36 | INFO | grad_clip_norm: None
+2025-04-29,17:42:36 | INFO | horovod: False
+2025-04-29,17:42:36 | INFO | image_mean: None
+2025-04-29,17:42:36 | INFO | image_std: None
+2025-04-29,17:42:36 | INFO | imagenet_v2: None
+2025-04-29,17:42:36 | INFO | imagenet_val: None
+2025-04-29,17:42:36 | INFO | local_loss: True
+2025-04-29,17:42:36 | INFO | local_rank: 0
+2025-04-29,17:42:36 | INFO | lock_image: False
+2025-04-29,17:42:36 | INFO | lock_image_freeze_bn_stats: False
+2025-04-29,17:42:36 | INFO | lock_image_unlocked_groups: 0
+2025-04-29,17:42:36 | INFO | lock_text: False
+2025-04-29,17:42:36 | INFO | lock_text_freeze_layer_norm: False
+2025-04-29,17:42:36 | INFO | lock_text_unlocked_layers: 0
+2025-04-29,17:42:36 | INFO | log_every_n_steps: 100
+2025-04-29,17:42:36 | INFO | log_level: 20
+2025-04-29,17:42:36 | INFO | log_local: False
+2025-04-29,17:42:36 | INFO | log_path: /mnt/personal/zhudongy/datacomp_results/medium/low_inter_only/out.log
+2025-04-29,17:42:36 | INFO | logs: /mnt/personal/zhudongy/datacomp_results/medium
+2025-04-29,17:42:36 | INFO | lr: 0.0005
+2025-04-29,17:42:36 | INFO | lr_cooldown_end: 0.0
+2025-04-29,17:42:36 | INFO | lr_cooldown_power: 1.0
+2025-04-29,17:42:36 | INFO | lr_scheduler: cosine
+2025-04-29,17:42:36 | INFO | model: ViT-B-32
+2025-04-29,17:42:36 | INFO | name: low_inter_only
+2025-04-29,17:42:36 | INFO | no_set_device_rank: False
+2025-04-29,17:42:36 | INFO | precision: amp_bfloat16
+2025-04-29,17:42:36 | INFO | pretrained:
+2025-04-29,17:42:36 | INFO | pretrained_image: False
+2025-04-29,17:42:36 | INFO | rank: 0
+2025-04-29,17:42:36 | INFO | remote_sync: None
+2025-04-29,17:42:36 | INFO | remote_sync_frequency: 300
+2025-04-29,17:42:36 | INFO | remote_sync_protocol: s3
+2025-04-29,17:42:36 | INFO | report_to:
+2025-04-29,17:42:36 | INFO | resume: None
+2025-04-29,17:42:36 | INFO | save_frequency: 0
+2025-04-29,17:42:36 | INFO | save_most_recent: True
+2025-04-29,17:42:36 | INFO | seed: 0
+2025-04-29,17:42:36 | INFO | skip_scheduler: False
+2025-04-29,17:42:36 | INFO | tensorboard: False
+2025-04-29,17:42:36 | INFO | tensorboard_path:
+2025-04-29,17:42:36 | INFO | torchscript: False
+2025-04-29,17:42:36 | INFO | trace: False
+2025-04-29,17:42:36 | INFO | train_data: /mnt/personal/zhudongy/datacomp-medium/shards/0000{0000..6126}.tar
+2025-04-29,17:42:36 | INFO | train_data_upsampling_factors: None
+2025-04-29,17:42:36 | INFO | train_num_samples: 16000000
+2025-04-29,17:42:36 | INFO | use_bn_sync: False
+2025-04-29,17:42:36 | INFO | val_data: None
+2025-04-29,17:42:36 | INFO | val_frequency: 1
+2025-04-29,17:42:36 | INFO | val_num_samples: None
+2025-04-29,17:42:36 | INFO | wandb: False
+2025-04-29,17:42:36 | INFO | wandb_notes:
+2025-04-29,17:42:36 | INFO | wandb_project_name: open-clip
+2025-04-29,17:42:36 | INFO | warmup: 500
+2025-04-29,17:42:36 | INFO | wd: 0.2
+2025-04-29,17:42:36 | INFO | workers: 16
+2025-04-29,17:42:36 | INFO | world_size: 2
+2025-04-29,17:42:36 | INFO | zeroshot_frequency: 2
+2025-04-29,17:42:36 | INFO | Start epoch 0
+2025-04-29,17:43:01 | INFO | Train Epoch: 0 [ 4096/16056320 (0%)] Data (t): 21.925 Batch (t): 24.947, 164.187/s, 82.0936/s/gpu LR: 0.000001 Logit Scale: 14.286 Contrastive_loss: 8.3837 (8.3837) Loss: 8.3837 (8.3837)
+2025-04-29,17:43:04 | INFO | Reducer buckets have been rebuilt in this iteration.
+2025-04-29,17:47:03 | INFO | Train Epoch: 0 [ 413696/16056320 (3%)] Data (t): 0.367 Batch (t): 2.418, 1701.28/s, 850.642/s/gpu LR: 0.000101 Logit Scale: 14.261 Contrastive_loss: 8.1890 (8.2863) Loss: 8.1890 (8.2863)
+2025-04-29,17:51:07 | INFO | Train Epoch: 0 [ 823296/16056320 (5%)] Data (t): 0.400 Batch (t): 2.442, 1697.43/s, 848.716/s/gpu LR: 0.000201 Logit Scale: 14.237 Contrastive_loss: 8.0558 (8.2095) Loss: 8.0558 (8.2095)
+2025-04-29,17:55:15 | INFO | Train Epoch: 0 [ 1232896/16056320 (8%)] Data (t): 0.436 Batch (t): 2.479, 1693.55/s, 846.774/s/gpu LR: 0.000301 Logit Scale: 14.210 Contrastive_loss: 7.9360 (8.1411) Loss: 7.9360 (8.1411)
+2025-04-29,17:59:16 | INFO | Train Epoch: 0 [ 1642496/16056320 (10%)] Data (t): 0.372 Batch (t): 2.411, 1693.62/s, 846.809/s/gpu LR: 0.000401 Logit Scale: 14.185 Contrastive_loss: 7.8394 (8.0808) Loss: 7.8394 (8.0808)
+2025-04-29,18:03:17 | INFO | Train Epoch: 0 [ 2052096/16056320 (13%)] Data (t): 0.364 Batch (t): 2.407, 1698.74/s, 849.369/s/gpu LR: 0.000500 Logit Scale: 14.182 Contrastive_loss: 7.7843 (8.0314) Loss: 7.7843 (8.0314)
+2025-04-29,18:07:18 | INFO | Train Epoch: 0 [ 2461696/16056320 (15%)] Data (t): 0.369 Batch (t): 2.413, 1692.95/s, 846.474/s/gpu LR: 0.000500 Logit Scale: 14.201 Contrastive_loss: 7.6211 (7.9728) Loss: 7.6211 (7.9728)
+2025-04-29,18:11:19 | INFO | Train Epoch: 0 [ 2871296/16056320 (18%)] Data (t): 0.365 Batch (t): 2.407, 1707.98/s, 853.988/s/gpu LR: 0.000500 Logit Scale: 14.260 Contrastive_loss: 7.5299 (7.9174) Loss: 7.5299 (7.9174)
+2025-04-29,18:15:20 | INFO | Train Epoch: 0 [ 3280896/16056320 (20%)] Data (t): 0.369 Batch (t): 2.410, 1690.42/s, 845.211/s/gpu LR: 0.000500 Logit Scale: 14.340 Contrastive_loss: 7.4407 (7.8644) Loss: 7.4407 (7.8644)
+2025-04-29,18:19:24 | INFO | Train Epoch: 0 [ 3690496/16056320 (23%)] Data (t): 0.393 Batch (t): 2.436, 1701.12/s, 850.559/s/gpu LR: 0.000500 Logit Scale: 14.441 Contrastive_loss: 7.4649 (7.8245) Loss: 7.4649 (7.8245)
+2025-04-29,18:23:25 | INFO | Train Epoch: 0 [ 4100096/16056320 (26%)] Data (t): 0.378 Batch (t): 2.417, 1712.22/s, 856.110/s/gpu LR: 0.000500 Logit Scale: 14.582 Contrastive_loss: 7.3469 (7.7811) Loss: 7.3469 (7.7811)
+2025-04-29,18:27:27 | INFO | Train Epoch: 0 [ 4509696/16056320 (28%)] Data (t): 0.372 Batch (t): 2.413, 1699.89/s, 849.943/s/gpu LR: 0.000500 Logit Scale: 14.727 Contrastive_loss: 7.2976 (7.7408) Loss: 7.2976 (7.7408)
+2025-04-29,18:31:29 | INFO | Train Epoch: 0 [ 4919296/16056320 (31%)] Data (t): 0.378 Batch (t): 2.418, 1680.76/s, 840.382/s/gpu LR: 0.000499 Logit Scale: 14.942 Contrastive_loss: 7.2187 (7.7006) Loss: 7.2187 (7.7006)
+2025-04-29,18:35:30 | INFO | Train Epoch: 0 [ 5328896/16056320 (33%)] Data (t): 0.375 Batch (t): 2.417, 1677.33/s, 838.665/s/gpu LR: 0.000499 Logit Scale: 15.128 Contrastive_loss: 7.1773 (7.6632) Loss: 7.1773 (7.6632)
+2025-04-29,18:39:32 | INFO | Train Epoch: 0 [ 5738496/16056320 (36%)] Data (t): 0.373 Batch (t): 2.416, 1698.13/s, 849.067/s/gpu LR: 0.000499 Logit Scale: 15.344 Contrastive_loss: 7.0983 (7.6256) Loss: 7.0983 (7.6256)
+2025-04-29,18:43:35 | INFO | Train Epoch: 0 [ 6148096/16056320 (38%)] Data (t): 0.390 Batch (t): 2.433, 1681.07/s, 840.537/s/gpu LR: 0.000499 Logit Scale: 15.595 Contrastive_loss: 7.0212 (7.5878) Loss: 7.0212 (7.5878)
+2025-04-29,18:47:38 | INFO | Train Epoch: 0 [ 6557696/16056320 (41%)] Data (t): 0.390 Batch (t): 2.430, 1683.60/s, 841.802/s/gpu LR: 0.000498 Logit Scale: 15.826 Contrastive_loss: 6.9587 (7.5508) Loss: 6.9587 (7.5508)
+2025-04-29,18:51:40 | INFO | Train Epoch: 0 [ 6967296/16056320 (43%)] Data (t): 0.378 Batch (t): 2.418, 1698.91/s, 849.455/s/gpu LR: 0.000498 Logit Scale: 16.115 Contrastive_loss: 6.8671 (7.5128) Loss: 6.8671 (7.5128)
+2025-04-29,18:55:42 | INFO | Train Epoch: 0 [ 7376896/16056320 (46%)] Data (t): 0.380 Batch (t): 2.423, 1679.70/s, 839.848/s/gpu LR: 0.000498 Logit Scale: 16.413 Contrastive_loss: 6.9072 (7.4809) Loss: 6.9072 (7.4809)
+2025-04-29,18:59:46 | INFO | Train Epoch: 0 [ 7786496/16056320 (48%)] Data (t): 0.390 Batch (t): 2.432, 1697.74/s, 848.868/s/gpu LR: 0.000497 Logit Scale: 16.734 Contrastive_loss: 6.8222 (7.4480) Loss: 6.8222 (7.4480)
+2025-04-29,19:03:50 | INFO | Train Epoch: 0 [ 8196096/16056320 (51%)] Data (t): 0.392 Batch (t): 2.440, 1684.68/s, 842.340/s/gpu LR: 0.000497 Logit Scale: 17.017 Contrastive_loss: 6.7487 (7.4147) Loss: 6.7487 (7.4147)
+2025-04-29,19:07:55 | INFO | Train Epoch: 0 [ 8605696/16056320 (54%)] Data (t): 0.414 Batch (t): 2.455, 1699.34/s, 849.669/s/gpu LR: 0.000497 Logit Scale: 17.283 Contrastive_loss: 6.6749 (7.3811) Loss: 6.6749 (7.3811)
+2025-04-29,19:11:58 | INFO | Train Epoch: 0 [ 9015296/16056320 (56%)] Data (t): 0.384 Batch (t): 2.425, 1704.23/s, 852.116/s/gpu LR: 0.000496 Logit Scale: 17.559 Contrastive_loss: 6.5998 (7.3471) Loss: 6.5998 (7.3471)
+2025-04-29,19:16:01 | INFO | Train Epoch: 0 [ 9424896/16056320 (59%)] Data (t): 0.387 Batch (t): 2.438, 1666.65/s, 833.323/s/gpu LR: 0.000496 Logit Scale: 17.832 Contrastive_loss: 6.5839 (7.3153) Loss: 6.5839 (7.3153)
+2025-04-29,19:20:05 | INFO | Train Epoch: 0 [ 9834496/16056320 (61%)] Data (t): 0.393 Batch (t): 2.432, 1699.78/s, 849.890/s/gpu LR: 0.000495 Logit Scale: 18.053 Contrastive_loss: 6.4805 (7.2819) Loss: 6.4805 (7.2819)
+2025-04-29,19:24:07 | INFO | Train Epoch: 0 [10244096/16056320 (64%)] Data (t): 0.382 Batch (t): 2.425, 1685.49/s, 842.747/s/gpu LR: 0.000495 Logit Scale: 18.360 Contrastive_loss: 6.5788 (7.2549) Loss: 6.5788 (7.2549)
+2025-04-29,19:28:10 | INFO | Train Epoch: 0 [10653696/16056320 (66%)] Data (t): 0.391 Batch (t): 2.431, 1693.68/s, 846.839/s/gpu LR: 0.000494 Logit Scale: 18.648 Contrastive_loss: 6.4210 (7.2240) Loss: 6.4210 (7.2240)
+2025-04-29,19:32:14 | INFO | Train Epoch: 0 [11063296/16056320 (69%)] Data (t): 0.396 Batch (t): 2.438, 1692.32/s, 846.162/s/gpu LR: 0.000494 Logit Scale: 18.913 Contrastive_loss: 6.4979 (7.1981) Loss: 6.4979 (7.1981)
+2025-04-29,19:36:17 | INFO | Train Epoch: 0 [11472896/16056320 (71%)] Data (t): 0.390 Batch (t): 2.429, 1678.49/s, 839.246/s/gpu LR: 0.000493 Logit Scale: 19.224 Contrastive_loss: 6.4005 (7.1706) Loss: 6.4005 (7.1706)
+2025-04-29,19:40:20 | INFO | Train Epoch: 0 [11882496/16056320 (74%)] Data (t): 0.387 Batch (t): 2.430, 1678.26/s, 839.130/s/gpu LR: 0.000493 Logit Scale: 19.531 Contrastive_loss: 6.3682 (7.1438) Loss: 6.3682 (7.1438)
+2025-04-29,19:44:23 | INFO | Train Epoch: 0 [12292096/16056320 (77%)] Data (t): 0.396 Batch (t): 2.436, 1677.17/s, 838.587/s/gpu LR: 0.000492 Logit Scale: 19.819 Contrastive_loss: 6.1071 (7.1104) Loss: 6.1071 (7.1104)
+2025-04-29,19:48:27 | INFO | Train Epoch: 0 [12701696/16056320 (79%)] Data (t): 0.398 Batch (t): 2.438, 1696.85/s, 848.426/s/gpu LR: 0.000491 Logit Scale: 20.042 Contrastive_loss: 6.3618 (7.0870) Loss: 6.3618 (7.0870)
+2025-04-29,19:52:31 | INFO | Train Epoch: 0 [13111296/16056320 (82%)] Data (t): 0.398 Batch (t): 2.439, 1606.99/s, 803.494/s/gpu LR: 0.000491 Logit Scale: 20.290 Contrastive_loss: 6.2285 (7.0610) Loss: 6.2285 (7.0610)
+2025-04-29,19:56:35 | INFO | Train Epoch: 0 [13520896/16056320 (84%)] Data (t): 0.402 Batch (t): 2.443, 1668.95/s, 834.473/s/gpu LR: 0.000490 Logit Scale: 20.522 Contrastive_loss: 6.0085 (7.0300) Loss: 6.0085 (7.0300)
+2025-04-29,20:00:38 | INFO | Train Epoch: 0 [13930496/16056320 (87%)] Data (t): 0.389 Batch (t): 2.429, 1705.41/s, 852.704/s/gpu LR: 0.000489 Logit Scale: 20.775 Contrastive_loss: 6.2236 (7.0070) Loss: 6.2236 (7.0070)
+2025-04-29,20:04:42 | INFO | Train Epoch: 0 [14340096/16056320 (89%)] Data (t): 0.397 Batch (t): 2.439, 1616.17/s, 808.085/s/gpu LR: 0.000488 Logit Scale: 21.015 Contrastive_loss: 6.1521 (6.9832) Loss: 6.1521 (6.9832)
+2025-04-29,20:08:46 | INFO | Train Epoch: 0 [14749696/16056320 (92%)] Data (t): 0.396 Batch (t): 2.438, 1700.78/s, 850.392/s/gpu LR: 0.000488 Logit Scale: 21.230 Contrastive_loss: 6.1918 (6.9618) Loss: 6.1918 (6.9618)
+2025-04-29,20:12:49 | INFO | Train Epoch: 0 [15159296/16056320 (94%)] Data (t): 0.390 Batch (t): 2.432, 1695.14/s, 847.571/s/gpu LR: 0.000487 Logit Scale: 21.468 Contrastive_loss: 6.1201 (6.9397) Loss: 6.1201 (6.9397)
+2025-04-29,20:16:52 | INFO | Train Epoch: 0 [15568896/16056320 (97%)] Data (t): 0.390 Batch (t): 2.432, 1704.81/s, 852.404/s/gpu LR: 0.000486 Logit Scale: 21.638 Contrastive_loss: 6.0887 (6.9179) Loss: 6.0887 (6.9179)
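
The `Train Epoch:` lines above follow a fixed console format (samples seen, data/batch time, throughput, LR, logit scale, instantaneous and running loss). A minimal sketch for pulling the loss and LR curves back out of such a log; the regex and the `parse_steps` helper are our own assumptions based only on the lines shown here, not code shipped in this repo:

```python
import re

# Matches the "Train Epoch: ..." step lines seen in out.log above.
STEP_RE = re.compile(
    r"Train Epoch: (?P<epoch>\d+) \[\s*(?P<seen>\d+)/(?P<total>\d+).*?"
    r"LR: (?P<lr>[\d.e+-]+).*?Contrastive_loss: (?P<loss>[\d.]+)"
)

def parse_steps(path):
    """Yield (epoch, samples_seen, lr, loss) for each logged training step."""
    with open(path) as f:
        for line in f:
            m = STEP_RE.search(line)
            if m:
                yield int(m["epoch"]), int(m["seen"]), float(m["lr"]), float(m["loss"])

if __name__ == "__main__":
    for epoch, seen, lr, loss in parse_steps("medium/low_inter_only/out.log"):
        print(f"epoch={epoch} seen={seen} lr={lr:.6f} loss={loss:.4f}")
```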
medium/low_inter_only/params.txt
ADDED
@@ -0,0 +1,91 @@
+accum_freq: 1
+aug_cfg: {}
+batch_size: 2048
+beta1: 0.9
+beta2: 0.98
+checkpoint_path: /mnt/personal/zhudongy/datacomp_results/medium/low_inter_only/checkpoints
+coca_caption_loss_weight: 2.0
+coca_contrastive_loss_weight: 1.0
+copy_codebase: False
+csv_caption_key: title
+csv_img_key: filepath
+csv_separator:
+dataset_resampled: True
+dataset_type: webdataset
+ddp_static_graph: True
+debug: False
+delete_previous_checkpoint: False
+device: cuda:0
+dist_backend: nccl
+dist_url: env://
+distill: False
+distill_model: None
+distill_pretrained: None
+distributed: True
+epochs: 8
+epochs_cooldown: None
+eps: 1e-06
+force_custom_text: False
+force_image_size: None
+force_patch_dropout: None
+force_quick_gelu: False
+gather_with_grad: True
+grad_checkpointing: True
+grad_clip_norm: None
+horovod: False
+image_mean: None
+image_std: None
+imagenet_v2: None
+imagenet_val: None
+local_loss: True
+local_rank: 0
+lock_image: False
+lock_image_freeze_bn_stats: False
+lock_image_unlocked_groups: 0
+lock_text: False
+lock_text_freeze_layer_norm: False
+lock_text_unlocked_layers: 0
+log_every_n_steps: 100
+log_level: 20
+log_local: False
+log_path: /mnt/personal/zhudongy/datacomp_results/medium/low_inter_only/out.log
+logs: /mnt/personal/zhudongy/datacomp_results/medium
+lr: 0.0005
+lr_cooldown_end: 0.0
+lr_cooldown_power: 1.0
+lr_scheduler: cosine
+model: ViT-B-32
+name: low_inter_only
+no_set_device_rank: False
+precision: amp_bfloat16
+pretrained:
+pretrained_image: False
+rank: 0
+remote_sync: None
+remote_sync_frequency: 300
+remote_sync_protocol: s3
+report_to:
+resume: None
+save_frequency: 0
+save_most_recent: True
+seed: 0
+skip_scheduler: False
+tensorboard: False
+tensorboard_path:
+torchscript: False
+trace: False
+train_data: /mnt/personal/zhudongy/datacomp-medium/shards/0000{0000..6126}.tar
+train_data_upsampling_factors: None
+train_num_samples: 16000000
+use_bn_sync: False
+val_data: None
+val_frequency: 1
+val_num_samples: None
+wandb: False
+wandb_notes:
+wandb_project_name: open-clip
+warmup: 500
+wd: 0.2
+workers: 16
+world_size: 2
+zeroshot_frequency: 2
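
params.txt mirrors the `Params:` block in out.log as plain `key: value` lines, so it can be read back without any training code. A small sketch assuming only the format shown above (the `load_params` helper name is ours):

```python
from ast import literal_eval

def load_params(path):
    """Parse "key: value" lines; Python literals (bools, numbers, {}) are
    evaluated, everything else (paths, names, empty values) stays a string."""
    params = {}
    with open(path) as f:
        for raw in f:
            if ":" not in raw:
                continue
            key, _, value = raw.rstrip("\n").partition(":")
            value = value.strip()
            try:
                params[key.strip()] = literal_eval(value)
            except (ValueError, SyntaxError):
                params[key.strip()] = value
    return params

cfg = load_params("medium/low_inter_only/params.txt")
assert cfg["batch_size"] == 2048 and cfg["model"] == "ViT-B-32"
```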
small/low_inter_only/checkpoints/epoch_8.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:247492627c790300dee284825b5ad2005db17d7fa70c82e8be7df2e6ef993640
+size 1815701409
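
The `.pt` and `.pkl` entries here are Git LFS pointers, not the blobs themselves; the oid/size pair is enough to check a separately downloaded file. A stdlib-only sketch using the values from the pointer above (the local filename is hypothetical):

```python
import hashlib
import os

def verify_lfs_object(local_path, oid_hex, size):
    """True iff the file matches the sha256 oid and byte size of an LFS pointer."""
    if os.path.getsize(local_path) != size:
        return False
    digest = hashlib.sha256()
    with open(local_path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            digest.update(chunk)
    return digest.hexdigest() == oid_hex

# oid/size copied from the epoch_8.pt pointer above.
print(verify_lfs_object(
    "epoch_8.pt",
    "247492627c790300dee284825b5ad2005db17d7fa70c82e8be7df2e6ef993640",
    1815701409,
))
```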
small/low_inter_only/checkpoints/epoch_latest.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:96b2e076abd25e8d3fe415b63877c2e4ffb132b86d74e96456315db9dc1151ca
+size 1815639097
small/low_inter_only/info.pkl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:aa8ebb9d239573ea811debb6029bc8b30f50b2fc463d67e42e6105093303054b
+size 382
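
Once the LFS blobs are fetched, the epoch checkpoints can be loaded back into the same ViT-B-32 architecture printed in the logs. A hedged sketch: it assumes the common open_clip checkpoint layout (a dict with a "state_dict" entry whose keys may carry a DDP "module." prefix), which has not been verified against these files:

```python
import torch
import open_clip

# Rebuild the ViT-B-32 CLIP model whose module tree appears in out.log.
model, _, preprocess = open_clip.create_model_and_transforms("ViT-B-32")

ckpt = torch.load("small/low_inter_only/checkpoints/epoch_8.pt", map_location="cpu")
state_dict = ckpt.get("state_dict", ckpt)  # assumed layout, see note above
# Strip the DDP "module." prefix if present.
state_dict = {k.removeprefix("module."): v for k, v in state_dict.items()}
model.load_state_dict(state_dict)
model.eval()
```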
small/low_inter_only/out.log
ADDED
@@ -0,0 +1,500 @@
| 1 |
+
2025-04-29,10:46:51 | INFO | No latest resume checkpoint found in /mnt/personal/zhudongy/datacomp_results/small/low_inter_only/checkpoints.
|
| 2 |
+
2025-04-29,10:46:53 | INFO | Running in distributed mode with multiple processes. Device: cuda:0.Process (global: 0, local 0), total 2.
|
| 3 |
+
2025-04-29,10:46:53 | INFO | Loaded ViT-B-32 model config.
|
| 4 |
+
2025-04-29,10:46:54 | INFO | Model:
|
| 5 |
+
2025-04-29,10:46:54 | INFO | CLIP(
|
| 6 |
+
(visual): VisionTransformer(
|
| 7 |
+
(patchnorm_pre_ln): Identity()
|
| 8 |
+
(conv1): Conv2d(3, 768, kernel_size=(32, 32), stride=(32, 32), bias=False)
|
| 9 |
+
(patch_dropout): Identity()
|
| 10 |
+
(ln_pre): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 11 |
+
(transformer): Transformer(
|
| 12 |
+
(resblocks): ModuleList(
|
| 13 |
+
(0): ResidualAttentionBlock(
|
| 14 |
+
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 15 |
+
(attn): MultiheadAttention(
|
| 16 |
+
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
|
| 17 |
+
)
|
| 18 |
+
(ls_1): Identity()
|
| 19 |
+
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 20 |
+
(mlp): Sequential(
|
| 21 |
+
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
|
| 22 |
+
(gelu): GELU(approximate='none')
|
| 23 |
+
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
|
| 24 |
+
)
|
| 25 |
+
(ls_2): Identity()
|
| 26 |
+
)
|
| 27 |
+
(1): ResidualAttentionBlock(
|
| 28 |
+
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 29 |
+
(attn): MultiheadAttention(
|
| 30 |
+
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
|
| 31 |
+
)
|
| 32 |
+
(ls_1): Identity()
|
| 33 |
+
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 34 |
+
(mlp): Sequential(
|
| 35 |
+
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
|
| 36 |
+
(gelu): GELU(approximate='none')
|
| 37 |
+
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
|
| 38 |
+
)
|
| 39 |
+
(ls_2): Identity()
|
| 40 |
+
)
|
| 41 |
+
(2): ResidualAttentionBlock(
|
| 42 |
+
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 43 |
+
(attn): MultiheadAttention(
|
| 44 |
+
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
|
| 45 |
+
)
|
| 46 |
+
(ls_1): Identity()
|
| 47 |
+
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 48 |
+
(mlp): Sequential(
|
| 49 |
+
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
|
| 50 |
+
(gelu): GELU(approximate='none')
|
| 51 |
+
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
|
| 52 |
+
)
|
| 53 |
+
(ls_2): Identity()
|
| 54 |
+
)
|
| 55 |
+
(3): ResidualAttentionBlock(
|
| 56 |
+
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 57 |
+
(attn): MultiheadAttention(
|
| 58 |
+
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
|
| 59 |
+
)
|
| 60 |
+
(ls_1): Identity()
|
| 61 |
+
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 62 |
+
(mlp): Sequential(
|
| 63 |
+
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
|
| 64 |
+
(gelu): GELU(approximate='none')
|
| 65 |
+
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
|
| 66 |
+
)
|
| 67 |
+
(ls_2): Identity()
|
| 68 |
+
)
|
| 69 |
+
(4): ResidualAttentionBlock(
|
| 70 |
+
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 71 |
+
(attn): MultiheadAttention(
|
| 72 |
+
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
|
| 73 |
+
)
|
| 74 |
+
(ls_1): Identity()
|
| 75 |
+
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 76 |
+
(mlp): Sequential(
|
| 77 |
+
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
|
| 78 |
+
(gelu): GELU(approximate='none')
|
| 79 |
+
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
|
| 80 |
+
)
|
| 81 |
+
(ls_2): Identity()
|
| 82 |
+
)
|
| 83 |
+
(5): ResidualAttentionBlock(
|
| 84 |
+
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 85 |
+
(attn): MultiheadAttention(
|
| 86 |
+
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
|
| 87 |
+
)
|
| 88 |
+
(ls_1): Identity()
|
| 89 |
+
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 90 |
+
(mlp): Sequential(
|
| 91 |
+
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
|
| 92 |
+
(gelu): GELU(approximate='none')
|
| 93 |
+
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
|
| 94 |
+
)
|
| 95 |
+
(ls_2): Identity()
|
| 96 |
+
)
|
| 97 |
+
(6): ResidualAttentionBlock(
|
| 98 |
+
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 99 |
+
(attn): MultiheadAttention(
|
| 100 |
+
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
|
| 101 |
+
)
|
| 102 |
+
(ls_1): Identity()
|
| 103 |
+
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 104 |
+
(mlp): Sequential(
|
| 105 |
+
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
|
| 106 |
+
(gelu): GELU(approximate='none')
|
| 107 |
+
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
|
| 108 |
+
)
|
| 109 |
+
(ls_2): Identity()
|
| 110 |
+
)
|
| 111 |
+
(7): ResidualAttentionBlock(
|
| 112 |
+
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 113 |
+
(attn): MultiheadAttention(
|
| 114 |
+
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
|
| 115 |
+
)
|
| 116 |
+
(ls_1): Identity()
|
| 117 |
+
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 118 |
+
(mlp): Sequential(
|
| 119 |
+
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
|
| 120 |
+
(gelu): GELU(approximate='none')
|
| 121 |
+
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
|
| 122 |
+
)
|
| 123 |
+
(ls_2): Identity()
|
| 124 |
+
)
|
| 125 |
+
(8): ResidualAttentionBlock(
|
| 126 |
+
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 127 |
+
(attn): MultiheadAttention(
|
| 128 |
+
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
|
| 129 |
+
)
|
| 130 |
+
(ls_1): Identity()
|
| 131 |
+
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 132 |
+
(mlp): Sequential(
|
| 133 |
+
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
|
| 134 |
+
(gelu): GELU(approximate='none')
|
| 135 |
+
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
|
| 136 |
+
)
|
| 137 |
+
(ls_2): Identity()
|
| 138 |
+
)
|
| 139 |
+
(9): ResidualAttentionBlock(
|
| 140 |
+
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 141 |
+
(attn): MultiheadAttention(
|
| 142 |
+
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
|
| 143 |
+
)
|
| 144 |
+
(ls_1): Identity()
|
| 145 |
+
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 146 |
+
(mlp): Sequential(
|
| 147 |
+
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
|
| 148 |
+
(gelu): GELU(approximate='none')
|
| 149 |
+
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
|
| 150 |
+
)
|
| 151 |
+
(ls_2): Identity()
|
| 152 |
+
)
|
| 153 |
+
(10): ResidualAttentionBlock(
|
| 154 |
+
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 155 |
+
(attn): MultiheadAttention(
|
| 156 |
+
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
|
| 157 |
+
)
|
| 158 |
+
(ls_1): Identity()
|
| 159 |
+
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 160 |
+
(mlp): Sequential(
|
| 161 |
+
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
|
| 162 |
+
(gelu): GELU(approximate='none')
|
| 163 |
+
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
|
| 164 |
+
)
|
| 165 |
+
(ls_2): Identity()
|
| 166 |
+
)
|
| 167 |
+
(11): ResidualAttentionBlock(
|
| 168 |
+
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 169 |
+
(attn): MultiheadAttention(
|
| 170 |
+
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
|
| 171 |
+
)
|
| 172 |
+
(ls_1): Identity()
|
| 173 |
+
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 174 |
+
(mlp): Sequential(
|
| 175 |
+
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
|
| 176 |
+
(gelu): GELU(approximate='none')
|
| 177 |
+
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
|
| 178 |
+
)
|
| 179 |
+
(ls_2): Identity()
|
| 180 |
+
)
|
| 181 |
+
)
|
| 182 |
+
)
|
| 183 |
+
(ln_post): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
| 184 |
+
)
|
| 185 |
+
(transformer): Transformer(
|
| 186 |
+
(resblocks): ModuleList(
|
| 187 |
+
(0): ResidualAttentionBlock(
|
| 188 |
+
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
|
| 189 |
+
(attn): MultiheadAttention(
|
| 190 |
+
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
|
| 191 |
+
)
|
| 192 |
+
(ls_1): Identity()
|
| 193 |
+
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
|
| 194 |
+
(mlp): Sequential(
|
| 195 |
+
(c_fc): Linear(in_features=512, out_features=2048, bias=True)
|
| 196 |
+
(gelu): GELU(approximate='none')
|
| 197 |
+
(c_proj): Linear(in_features=2048, out_features=512, bias=True)
|
| 198 |
+
)
|
| 199 |
+
(ls_2): Identity()
|
| 200 |
+
)
|
| 201 |
+
(1): ResidualAttentionBlock(
|
| 202 |
+
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
|
| 203 |
+
(attn): MultiheadAttention(
|
| 204 |
+
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
|
| 205 |
+
)
|
| 206 |
+
(ls_1): Identity()
|
| 207 |
+
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
|
| 208 |
+
(mlp): Sequential(
|
| 209 |
+
(c_fc): Linear(in_features=512, out_features=2048, bias=True)
|
| 210 |
+
(gelu): GELU(approximate='none')
|
| 211 |
+
(c_proj): Linear(in_features=2048, out_features=512, bias=True)
|
| 212 |
+
)
|
| 213 |
+
(ls_2): Identity()
|
| 214 |
+
)
|
| 215 |
+
(2): ResidualAttentionBlock(
|
| 216 |
+
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
|
| 217 |
+
(attn): MultiheadAttention(
|
| 218 |
+
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
|
| 219 |
+
)
|
| 220 |
+
(ls_1): Identity()
|
| 221 |
+
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
|
| 222 |
+
(mlp): Sequential(
|
| 223 |
+
(c_fc): Linear(in_features=512, out_features=2048, bias=True)
|
| 224 |
+
(gelu): GELU(approximate='none')
|
| 225 |
+
(c_proj): Linear(in_features=2048, out_features=512, bias=True)
|
| 226 |
+
)
|
| 227 |
+
(ls_2): Identity()
|
| 228 |
+
)
|
| 229 |
+
(3): ResidualAttentionBlock(
|
| 230 |
+
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
|
| 231 |
+
        (attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
        )
        (ls_1): Identity()
        (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=512, out_features=2048, bias=True)
          (gelu): GELU(approximate='none')
          (c_proj): Linear(in_features=2048, out_features=512, bias=True)
        )
        (ls_2): Identity()
      )
      (4): ResidualAttentionBlock(
        (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
        )
        (ls_1): Identity()
        (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=512, out_features=2048, bias=True)
          (gelu): GELU(approximate='none')
          (c_proj): Linear(in_features=2048, out_features=512, bias=True)
        )
        (ls_2): Identity()
      )
      (5): ResidualAttentionBlock(
        (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
        )
        (ls_1): Identity()
        (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=512, out_features=2048, bias=True)
          (gelu): GELU(approximate='none')
          (c_proj): Linear(in_features=2048, out_features=512, bias=True)
        )
        (ls_2): Identity()
      )
      (6): ResidualAttentionBlock(
        (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
        )
        (ls_1): Identity()
        (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=512, out_features=2048, bias=True)
          (gelu): GELU(approximate='none')
          (c_proj): Linear(in_features=2048, out_features=512, bias=True)
        )
        (ls_2): Identity()
      )
      (7): ResidualAttentionBlock(
        (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
        )
        (ls_1): Identity()
        (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=512, out_features=2048, bias=True)
          (gelu): GELU(approximate='none')
          (c_proj): Linear(in_features=2048, out_features=512, bias=True)
        )
        (ls_2): Identity()
      )
      (8): ResidualAttentionBlock(
        (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
        )
        (ls_1): Identity()
        (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=512, out_features=2048, bias=True)
          (gelu): GELU(approximate='none')
          (c_proj): Linear(in_features=2048, out_features=512, bias=True)
        )
        (ls_2): Identity()
      )
      (9): ResidualAttentionBlock(
        (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
        )
        (ls_1): Identity()
        (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=512, out_features=2048, bias=True)
          (gelu): GELU(approximate='none')
          (c_proj): Linear(in_features=2048, out_features=512, bias=True)
        )
        (ls_2): Identity()
      )
      (10): ResidualAttentionBlock(
        (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
        )
        (ls_1): Identity()
        (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=512, out_features=2048, bias=True)
          (gelu): GELU(approximate='none')
          (c_proj): Linear(in_features=2048, out_features=512, bias=True)
        )
        (ls_2): Identity()
      )
      (11): ResidualAttentionBlock(
        (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
        )
        (ls_1): Identity()
        (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=512, out_features=2048, bias=True)
          (gelu): GELU(approximate='none')
          (c_proj): Linear(in_features=2048, out_features=512, bias=True)
        )
        (ls_2): Identity()
      )
    )
  )
  (token_embedding): Embedding(49408, 512)
  (ln_final): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
)
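The text tower printed above is twelve identical pre-norm transformer blocks of width 512 with a 4x MLP (2048 hidden units) and LayerScale disabled (ls_1/ls_2 are Identity). As a reading aid, here is a minimal PyTorch sketch of the forward pass this module tree implies; it is not open_clip's actual source, and the 8 attention heads are an assumption taken from the standard ViT-B-32 text configuration (512 / 64 per head).

import torch
import torch.nn as nn

class ResidualAttentionBlockSketch(nn.Module):
    def __init__(self, d_model: int = 512, n_head: int = 8):
        super().__init__()
        self.ln_1 = nn.LayerNorm(d_model, eps=1e-5)
        self.attn = nn.MultiheadAttention(d_model, n_head)  # owns the printed out_proj
        self.ls_1 = nn.Identity()                           # LayerScale disabled
        self.ln_2 = nn.LayerNorm(d_model, eps=1e-5)
        self.mlp = nn.Sequential(
            nn.Linear(d_model, 4 * d_model),  # c_fc: 512 -> 2048
            nn.GELU(),                        # gelu
            nn.Linear(4 * d_model, d_model),  # c_proj: 2048 -> 512
        )
        self.ls_2 = nn.Identity()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Pre-norm residual attention, then pre-norm residual MLP.
        y = self.ln_1(x)
        x = x + self.ls_1(self.attn(y, y, y, need_weights=False)[0])
        x = x + self.ls_2(self.mlp(self.ln_2(x)))
        return x

x = torch.randn(77, 4, 512)  # (sequence, batch, width), as in CLIP's text tower
print(ResidualAttentionBlockSketch()(x).shape)  # torch.Size([77, 4, 512])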
2025-04-29,10:46:54 | INFO | Params:
2025-04-29,10:46:54 | INFO |   accum_freq: 1
2025-04-29,10:46:54 | INFO |   aug_cfg: {}
2025-04-29,10:46:54 | INFO |   batch_size: 2048
2025-04-29,10:46:54 | INFO |   beta1: 0.9
2025-04-29,10:46:54 | INFO |   beta2: 0.98
2025-04-29,10:46:54 | INFO |   checkpoint_path: /mnt/personal/zhudongy/datacomp_results/small/low_inter_only/checkpoints
2025-04-29,10:46:54 | INFO |   coca_caption_loss_weight: 2.0
2025-04-29,10:46:54 | INFO |   coca_contrastive_loss_weight: 1.0
2025-04-29,10:46:54 | INFO |   copy_codebase: False
2025-04-29,10:46:54 | INFO |   csv_caption_key: title
2025-04-29,10:46:54 | INFO |   csv_img_key: filepath
2025-04-29,10:46:54 | INFO |   csv_separator:
2025-04-29,10:46:54 | INFO |   dataset_resampled: True
2025-04-29,10:46:54 | INFO |   dataset_type: webdataset
2025-04-29,10:46:54 | INFO |   ddp_static_graph: True
2025-04-29,10:46:54 | INFO |   debug: False
2025-04-29,10:46:54 | INFO |   delete_previous_checkpoint: False
2025-04-29,10:46:54 | INFO |   device: cuda:0
2025-04-29,10:46:54 | INFO |   dist_backend: nccl
2025-04-29,10:46:54 | INFO |   dist_url: env://
2025-04-29,10:46:54 | INFO |   distill: False
2025-04-29,10:46:54 | INFO |   distill_model: None
2025-04-29,10:46:54 | INFO |   distill_pretrained: None
2025-04-29,10:46:54 | INFO |   distributed: True
2025-04-29,10:46:54 | INFO |   epochs: 8
2025-04-29,10:46:54 | INFO |   epochs_cooldown: None
2025-04-29,10:46:54 | INFO |   eps: 1e-06
2025-04-29,10:46:54 | INFO |   force_custom_text: False
2025-04-29,10:46:54 | INFO |   force_image_size: None
2025-04-29,10:46:54 | INFO |   force_patch_dropout: None
2025-04-29,10:46:54 | INFO |   force_quick_gelu: False
2025-04-29,10:46:54 | INFO |   gather_with_grad: True
2025-04-29,10:46:54 | INFO |   grad_checkpointing: True
2025-04-29,10:46:54 | INFO |   grad_clip_norm: None
2025-04-29,10:46:54 | INFO |   horovod: False
2025-04-29,10:46:54 | INFO |   image_mean: None
2025-04-29,10:46:54 | INFO |   image_std: None
2025-04-29,10:46:54 | INFO |   imagenet_v2: None
2025-04-29,10:46:54 | INFO |   imagenet_val: None
2025-04-29,10:46:54 | INFO |   local_loss: True
2025-04-29,10:46:54 | INFO |   local_rank: 0
2025-04-29,10:46:54 | INFO |   lock_image: False
2025-04-29,10:46:54 | INFO |   lock_image_freeze_bn_stats: False
2025-04-29,10:46:54 | INFO |   lock_image_unlocked_groups: 0
2025-04-29,10:46:54 | INFO |   lock_text: False
2025-04-29,10:46:54 | INFO |   lock_text_freeze_layer_norm: False
2025-04-29,10:46:54 | INFO |   lock_text_unlocked_layers: 0
2025-04-29,10:46:54 | INFO |   log_every_n_steps: 100
2025-04-29,10:46:54 | INFO |   log_level: 20
2025-04-29,10:46:54 | INFO |   log_local: False
2025-04-29,10:46:54 | INFO |   log_path: /mnt/personal/zhudongy/datacomp_results/small/low_inter_only/out.log
2025-04-29,10:46:54 | INFO |   logs: /mnt/personal/zhudongy/datacomp_results/small
2025-04-29,10:46:54 | INFO |   lr: 0.0005
2025-04-29,10:46:54 | INFO |   lr_cooldown_end: 0.0
2025-04-29,10:46:54 | INFO |   lr_cooldown_power: 1.0
2025-04-29,10:46:54 | INFO |   lr_scheduler: cosine
2025-04-29,10:46:54 | INFO |   model: ViT-B-32
2025-04-29,10:46:54 | INFO |   name: low_inter_only
2025-04-29,10:46:54 | INFO |   no_set_device_rank: False
2025-04-29,10:46:54 | INFO |   precision: amp_bfloat16
2025-04-29,10:46:54 | INFO |   pretrained:
2025-04-29,10:46:54 | INFO |   pretrained_image: False
2025-04-29,10:46:54 | INFO |   rank: 0
2025-04-29,10:46:54 | INFO |   remote_sync: None
2025-04-29,10:46:54 | INFO |   remote_sync_frequency: 300
2025-04-29,10:46:54 | INFO |   remote_sync_protocol: s3
2025-04-29,10:46:54 | INFO |   report_to:
2025-04-29,10:46:54 | INFO |   resume: None
2025-04-29,10:46:54 | INFO |   save_frequency: 0
2025-04-29,10:46:54 | INFO |   save_most_recent: True
2025-04-29,10:46:54 | INFO |   seed: 0
2025-04-29,10:46:54 | INFO |   skip_scheduler: False
2025-04-29,10:46:54 | INFO |   tensorboard: False
2025-04-29,10:46:54 | INFO |   tensorboard_path:
2025-04-29,10:46:54 | INFO |   torchscript: False
2025-04-29,10:46:54 | INFO |   trace: False
2025-04-29,10:46:54 | INFO |   train_data: /mnt/personal/zhudongy/datacomp-small/shards/0000{0000..1287}.tar
2025-04-29,10:46:54 | INFO |   train_data_upsampling_factors: None
2025-04-29,10:46:54 | INFO |   train_num_samples: 1600000
2025-04-29,10:46:54 | INFO |   use_bn_sync: False
2025-04-29,10:46:54 | INFO |   val_data: None
2025-04-29,10:46:54 | INFO |   val_frequency: 1
2025-04-29,10:46:54 | INFO |   val_num_samples: None
2025-04-29,10:46:54 | INFO |   wandb: False
2025-04-29,10:46:54 | INFO |   wandb_notes:
2025-04-29,10:46:54 | INFO |   wandb_project_name: open-clip
2025-04-29,10:46:54 | INFO |   warmup: 500
2025-04-29,10:46:54 | INFO |   wd: 0.2
2025-04-29,10:46:54 | INFO |   workers: 4
2025-04-29,10:46:54 | INFO |   world_size: 2
2025-04-29,10:46:54 | INFO |   zeroshot_frequency: 2
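The LR column in the training log below follows the cosine schedule configured above (lr: 0.0005, warmup: 500, lr_scheduler: cosine). With batch_size 2048 per GPU and world_size 2, the global batch is 4096, so one epoch of ~1,605,632 samples is about 392 optimizer steps and the 500-step warmup ends partway into epoch 1. A minimal sketch of that schedule follows; total_steps = 8 x 392 = 3136 is inferred from those figures, not read from the log, and this is a paraphrase of open_clip's cosine_lr behaviour rather than its exact code.

import math

def lr_at_step(step: int, base_lr: float = 5e-4, warmup: int = 500,
               total_steps: int = 3136) -> float:
    # Linear warmup for `warmup` steps, then cosine decay to 0.
    if step < warmup:
        return base_lr * (step + 1) / warmup
    progress = (step - warmup) / (total_steps - warmup)
    return 0.5 * base_lr * (1 + math.cos(math.pi * progress))

print(lr_at_step(392))  # ~0.00039, matching the LR logged at the end of epoch 0
print(lr_at_step(784))  # ~0.000486, matching the LR logged at the end of epoch 1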
2025-04-29,10:46:54 | INFO | Start epoch 0
2025-04-29,10:47:09 | INFO | Train Epoch: 0 [   4096/1605632 (0%)] Data (t): 12.367 Batch (t): 14.958, 273.829/s, 136.915/s/gpu LR: 0.000001 Logit Scale: 14.286 Contrastive_loss: 8.3764 (8.3764) Loss: 8.3764 (8.3764)
2025-04-29,10:47:11 | INFO | Reducer buckets have been rebuilt in this iteration.
2025-04-29,10:51:10 | INFO | Train Epoch: 0 [ 413696/1605632 (26%)] Data (t): 0.340 Batch (t): 2.410, 1686.32/s, 843.158/s/gpu LR: 0.000101 Logit Scale: 14.265 Contrastive_loss: 8.2427 (8.3096) Loss: 8.2427 (8.3096)
2025-04-29,10:55:13 | INFO | Train Epoch: 0 [ 823296/1605632 (51%)] Data (t): 0.296 Batch (t): 2.436, 1684.03/s, 842.017/s/gpu LR: 0.000201 Logit Scale: 14.243 Contrastive_loss: 8.1236 (8.2476) Loss: 8.1236 (8.2476)
2025-04-29,10:59:20 | INFO | Train Epoch: 0 [1232896/1605632 (77%)] Data (t): 0.357 Batch (t): 2.468, 1624.40/s, 812.201/s/gpu LR: 0.000301 Logit Scale: 14.221 Contrastive_loss: 7.9420 (8.1712) Loss: 7.9420 (8.1712)
2025-04-29,11:03:10 | INFO | Train Epoch: 0 [1605632/1605632 (100%)] Data (t): 0.417 Batch (t): 2.530, 1728.75/s, 864.375/s/gpu LR: 0.000392 Logit Scale: 14.198 Contrastive_loss: 7.8148 (8.0999) Loss: 7.8148 (8.0999)
2025-04-29,11:03:12 | INFO | Start epoch 1
2025-04-29,11:03:26 | INFO | Train Epoch: 1 [   4096/1605632 (0%)] Data (t): 11.181 Batch (t): 13.231, 309.584/s, 154.792/s/gpu LR: 0.000393 Logit Scale: 14.198 Contrastive_loss: 7.8044 (7.8044) Loss: 7.8044 (7.8044)
2025-04-29,11:07:36 | INFO | Train Epoch: 1 [ 413696/1605632 (26%)] Data (t): 0.380 Batch (t): 2.507, 1574.67/s, 787.334/s/gpu LR: 0.000493 Logit Scale: 14.191 Contrastive_loss: 7.7343 (7.7694) Loss: 7.7343 (7.7694)
2025-04-29,11:11:47 | INFO | Train Epoch: 1 [ 823296/1605632 (51%)] Data (t): 0.374 Batch (t): 2.502, 1134.29/s, 567.146/s/gpu LR: 0.000498 Logit Scale: 14.212 Contrastive_loss: 7.6846 (7.7411) Loss: 7.6846 (7.7411)
2025-04-29,11:16:16 | INFO | Train Epoch: 1 [1232896/1605632 (77%)] Data (t): 0.619 Batch (t): 2.694, 1629.58/s, 814.789/s/gpu LR: 0.000493 Logit Scale: 14.257 Contrastive_loss: 7.5485 (7.6930) Loss: 7.5485 (7.6930)
2025-04-29,11:20:17 | INFO | Train Epoch: 1 [1605632/1605632 (100%)] Data (t): 0.482 Batch (t): 2.646, 1703.80/s, 851.899/s/gpu LR: 0.000486 Logit Scale: 14.345 Contrastive_loss: 7.3599 (7.6263) Loss: 7.3599 (7.6263)
2025-04-29,11:20:19 | INFO | Start epoch 2
2025-04-29,11:20:31 | INFO | Train Epoch: 2 [   4096/1605632 (0%)] Data (t): 10.642 Batch (t): 12.727, 321.831/s, 160.916/s/gpu LR: 0.000486 Logit Scale: 14.347 Contrastive_loss: 7.2405 (7.2405) Loss: 7.2405 (7.2405)
2025-04-29,11:24:41 | INFO | Train Epoch: 2 [ 413696/1605632 (26%)] Data (t): 0.355 Batch (t): 2.492, 1714.72/s, 857.358/s/gpu LR: 0.000474 Logit Scale: 14.450 Contrastive_loss: 7.4679 (7.3542) Loss: 7.4679 (7.3542)
2025-04-29,11:28:51 | INFO | Train Epoch: 2 [ 823296/1605632 (51%)] Data (t): 0.408 Batch (t): 2.510, 1154.44/s, 577.221/s/gpu LR: 0.000460 Logit Scale: 14.545 Contrastive_loss: 7.2964 (7.3349) Loss: 7.2964 (7.3349)
2025-04-29,11:33:00 | INFO | Train Epoch: 2 [1232896/1605632 (77%)] Data (t): 0.394 Batch (t): 2.484, 1699.79/s, 849.896/s/gpu LR: 0.000442 Logit Scale: 14.677 Contrastive_loss: 7.1794 (7.2960) Loss: 7.1794 (7.2960)
2025-04-29,11:36:40 | INFO | Train Epoch: 2 [1605632/1605632 (100%)] Data (t): 0.343 Batch (t): 2.421, 1718.76/s, 859.379/s/gpu LR: 0.000423 Logit Scale: 14.812 Contrastive_loss: 7.2101 (7.2789) Loss: 7.2101 (7.2789)
2025-04-29,11:36:42 | INFO | Start epoch 3
2025-04-29,11:36:55 | INFO | Train Epoch: 3 [   4096/1605632 (0%)] Data (t): 10.902 Batch (t): 12.965, 315.925/s, 157.962/s/gpu LR: 0.000423 Logit Scale: 14.814 Contrastive_loss: 7.1536 (7.1536) Loss: 7.1536 (7.1536)
2025-04-29,11:41:22 | INFO | Train Epoch: 3 [ 413696/1605632 (26%)] Data (t): 0.491 Batch (t): 2.665, 1701.82/s, 850.912/s/gpu LR: 0.000400 Logit Scale: 14.984 Contrastive_loss: 7.1237 (7.1387) Loss: 7.1237 (7.1387)
2025-04-29,11:45:33 | INFO | Train Epoch: 3 [ 823296/1605632 (51%)] Data (t): 0.387 Batch (t): 2.511, 1014.82/s, 507.408/s/gpu LR: 0.000376 Logit Scale: 15.179 Contrastive_loss: 6.8644 (7.0472) Loss: 6.8644 (7.0472)
2025-04-29,11:49:47 | INFO | Train Epoch: 3 [1232896/1605632 (77%)] Data (t): 0.452 Batch (t): 2.541, 1693.74/s, 846.870/s/gpu LR: 0.000349 Logit Scale: 15.355 Contrastive_loss: 6.6430 (6.9462) Loss: 6.6430 (6.9462)
2025-04-29,11:53:29 | INFO | Train Epoch: 3 [1605632/1605632 (100%)] Data (t): 0.371 Batch (t): 2.446, 1680.22/s, 840.108/s/gpu LR: 0.000324 Logit Scale: 15.544 Contrastive_loss: 6.8729 (6.9315) Loss: 6.8729 (6.9315)
2025-04-29,11:53:31 | INFO | Start epoch 4
2025-04-29,11:53:45 | INFO | Train Epoch: 4 [   4096/1605632 (0%)] Data (t): 11.803 Batch (t): 13.888, 294.923/s, 147.461/s/gpu LR: 0.000323 Logit Scale: 15.546 Contrastive_loss: 6.8561 (6.8561) Loss: 6.8561 (6.8561)
2025-04-29,11:57:50 | INFO | Train Epoch: 4 [ 413696/1605632 (26%)] Data (t): 0.373 Batch (t): 2.446, 1742.13/s, 871.064/s/gpu LR: 0.000294 Logit Scale: 15.744 Contrastive_loss: 6.6737 (6.7649) Loss: 6.6737 (6.7649)
2025-04-29,12:01:49 | INFO | Train Epoch: 4 [ 823296/1605632 (51%)] Data (t): 0.263 Batch (t): 2.394, 1726.45/s, 863.225/s/gpu LR: 0.000265 Logit Scale: 15.914 Contrastive_loss: 6.7540 (6.7613) Loss: 6.7540 (6.7613)
2025-04-29,12:05:52 | INFO | Train Epoch: 4 [1232896/1605632 (77%)] Data (t): 0.278 Batch (t): 2.430, 1668.21/s, 834.103/s/gpu LR: 0.000235 Logit Scale: 16.076 Contrastive_loss: 6.7919 (6.7689) Loss: 6.7919 (6.7689)
2025-04-29,12:09:34 | INFO | Train Epoch: 4 [1605632/1605632 (100%)] Data (t): 0.279 Batch (t): 2.440, 1555.69/s, 777.845/s/gpu LR: 0.000208 Logit Scale: 16.253 Contrastive_loss: 6.6502 (6.7452) Loss: 6.6502 (6.7452)
2025-04-29,12:09:36 | INFO | Start epoch 5
2025-04-29,12:09:50 | INFO | Train Epoch: 5 [   4096/1605632 (0%)] Data (t): 11.275 Batch (t): 13.347, 306.892/s, 153.446/s/gpu LR: 0.000208 Logit Scale: 16.255 Contrastive_loss: 6.6999 (6.6999) Loss: 6.6999 (6.6999)
2025-04-29,12:14:05 | INFO | Train Epoch: 5 [ 413696/1605632 (26%)] Data (t): 0.431 Batch (t): 2.551, 1723.17/s, 861.587/s/gpu LR: 0.000179 Logit Scale: 16.426 Contrastive_loss: 6.5666 (6.6333) Loss: 6.5666 (6.6333)
2025-04-29,12:18:18 | INFO | Train Epoch: 5 [ 823296/1605632 (51%)] Data (t): 0.404 Batch (t): 2.528, 1332.54/s, 666.268/s/gpu LR: 0.000151 Logit Scale: 16.569 Contrastive_loss: 6.4771 (6.5812) Loss: 6.4771 (6.5812)
2025-04-29,12:22:19 | INFO | Train Epoch: 5 [1232896/1605632 (77%)] Data (t): 0.336 Batch (t): 2.412, 1709.31/s, 854.657/s/gpu LR: 0.000124 Logit Scale: 16.692 Contrastive_loss: 6.5475 (6.5728) Loss: 6.5475 (6.5728)
2025-04-29,12:25:58 | INFO | Train Epoch: 5 [1605632/1605632 (100%)] Data (t): 0.336 Batch (t): 2.405, 1729.43/s, 864.715/s/gpu LR: 0.000102 Logit Scale: 16.794 Contrastive_loss: 6.5236 (6.5629) Loss: 6.5236 (6.5629)
2025-04-29,12:26:00 | INFO | Start epoch 6
2025-04-29,12:26:13 | INFO | Train Epoch: 6 [   4096/1605632 (0%)] Data (t): 11.106 Batch (t): 13.199, 310.322/s, 155.161/s/gpu LR: 0.000101 Logit Scale: 16.795 Contrastive_loss: 6.4957 (6.4957) Loss: 6.4957 (6.4957)
2025-04-29,12:30:20 | INFO | Train Epoch: 6 [ 413696/1605632 (26%)] Data (t): 0.374 Batch (t): 2.472, 1508.63/s, 754.316/s/gpu LR: 0.000079 Logit Scale: 16.882 Contrastive_loss: 6.0769 (6.2863) Loss: 6.0769 (6.2863)
2025-04-29,12:34:22 | INFO | Train Epoch: 6 [ 823296/1605632 (51%)] Data (t): 0.286 Batch (t): 2.416, 1530.56/s, 765.282/s/gpu LR: 0.000058 Logit Scale: 16.958 Contrastive_loss: 6.2138 (6.2621) Loss: 6.2138 (6.2621)
2025-04-29,12:38:24 | INFO | Train Epoch: 6 [1232896/1605632 (77%)] Data (t): 0.277 Batch (t): 2.422, 1757.06/s, 878.529/s/gpu LR: 0.000040 Logit Scale: 17.014 Contrastive_loss: 6.1187 (6.2263) Loss: 6.1187 (6.2263)
2025-04-29,12:42:22 | INFO | Train Epoch: 6 [1605632/1605632 (100%)] Data (t): 0.538 Batch (t): 2.620, 1698.75/s, 849.373/s/gpu LR: 0.000027 Logit Scale: 17.051 Contrastive_loss: 6.2108 (6.2232) Loss: 6.2108 (6.2232)
2025-04-29,12:42:24 | INFO | Start epoch 7
2025-04-29,12:42:38 | INFO | Train Epoch: 7 [   4096/1605632 (0%)] Data (t): 11.138 Batch (t): 13.217, 309.905/s, 154.953/s/gpu LR: 0.000027 Logit Scale: 17.051 Contrastive_loss: 6.2225 (6.2225) Loss: 6.2225 (6.2225)
2025-04-29,12:46:42 | INFO | Train Epoch: 7 [ 413696/1605632 (26%)] Data (t): 0.329 Batch (t): 2.443, 1574.65/s, 787.324/s/gpu LR: 0.000015 Logit Scale: 17.075 Contrastive_loss: 6.2199 (6.2212) Loss: 6.2199 (6.2212)
2025-04-29,12:50:48 | INFO | Train Epoch: 7 [ 823296/1605632 (51%)] Data (t): 0.363 Batch (t): 2.459, 1682.10/s, 841.051/s/gpu LR: 0.000007 Logit Scale: 17.089 Contrastive_loss: 6.2006 (6.2143) Loss: 6.2006 (6.2143)
2025-04-29,12:54:50 | INFO | Train Epoch: 7 [1232896/1605632 (77%)] Data (t): 0.336 Batch (t): 2.418, 1614.35/s, 807.177/s/gpu LR: 0.000002 Logit Scale: 17.093 Contrastive_loss: 6.0595 (6.1756) Loss: 6.0595 (6.1756)
2025-04-29,12:58:34 | INFO | Train Epoch: 7 [1605632/1605632 (100%)] Data (t): 0.342 Batch (t): 2.461, 1719.27/s, 859.635/s/gpu LR: 0.000000 Logit Scale: 17.094 Contrastive_loss: 6.1999 (6.1805) Loss: 6.1999 (6.1805)
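A few notes for reading the lines above: throughput is samples per second over the logged step (e.g. 4096 samples / 14.958 s ≈ 273.8/s for the first batch), the value in parentheses after each loss is the running epoch average, and Logit Scale is the already-exponentiated temperature, starting at CLIP's standard 1/0.07 ≈ 14.29. Contrastive_loss is the symmetric image-text InfoNCE objective; below is a minimal single-process sketch of it (an assumption-level paraphrase, not the exact open_clip code, which here uses local_loss with gradients gathered across 2 GPUs).

import torch
import torch.nn.functional as F

def clip_contrastive_loss(image_features, text_features, logit_scale):
    # Cosine-similarity logits between every image and every text in the batch,
    # scaled by the logged (exponentiated) logit scale.
    image_features = F.normalize(image_features, dim=-1)
    text_features = F.normalize(text_features, dim=-1)
    logits = logit_scale * image_features @ text_features.t()
    labels = torch.arange(logits.shape[0], device=logits.device)
    # Symmetric cross-entropy: match image->text and text->image on the diagonal.
    return (F.cross_entropy(logits, labels) + F.cross_entropy(logits.t(), labels)) / 2

img, txt = torch.randn(8, 512), torch.randn(8, 512)
print(clip_contrastive_loss(img, txt, 14.286))  # near ln(8) ~= 2.08 for random features

With random features the loss sits near ln(global batch size); here ln(4096) ≈ 8.32, consistent with the first logged value of ~8.38 before training makes progress.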
small/low_inter_only/params.txt
ADDED
@@ -0,0 +1,91 @@
accum_freq: 1
aug_cfg: {}
batch_size: 2048
beta1: 0.9
beta2: 0.98
checkpoint_path: /mnt/personal/zhudongy/datacomp_results/small/low_inter_only/checkpoints
coca_caption_loss_weight: 2.0
coca_contrastive_loss_weight: 1.0
copy_codebase: False
csv_caption_key: title
csv_img_key: filepath
csv_separator:
dataset_resampled: True
dataset_type: webdataset
ddp_static_graph: True
debug: False
delete_previous_checkpoint: False
device: cuda:0
dist_backend: nccl
dist_url: env://
distill: False
distill_model: None
distill_pretrained: None
distributed: True
epochs: 8
epochs_cooldown: None
eps: 1e-06
force_custom_text: False
force_image_size: None
force_patch_dropout: None
force_quick_gelu: False
gather_with_grad: True
grad_checkpointing: True
grad_clip_norm: None
horovod: False
image_mean: None
image_std: None
imagenet_v2: None
imagenet_val: None
local_loss: True
local_rank: 0
lock_image: False
lock_image_freeze_bn_stats: False
lock_image_unlocked_groups: 0
lock_text: False
lock_text_freeze_layer_norm: False
lock_text_unlocked_layers: 0
log_every_n_steps: 100
log_level: 20
log_local: False
log_path: /mnt/personal/zhudongy/datacomp_results/small/low_inter_only/out.log
logs: /mnt/personal/zhudongy/datacomp_results/small
lr: 0.0005
lr_cooldown_end: 0.0
lr_cooldown_power: 1.0
lr_scheduler: cosine
model: ViT-B-32
name: low_inter_only
no_set_device_rank: False
precision: amp_bfloat16
pretrained:
pretrained_image: False
rank: 0
remote_sync: None
remote_sync_frequency: 300
remote_sync_protocol: s3
report_to:
resume: None
save_frequency: 0
save_most_recent: True
seed: 0
skip_scheduler: False
tensorboard: False
tensorboard_path:
torchscript: False
trace: False
train_data: /mnt/personal/zhudongy/datacomp-small/shards/0000{0000..1287}.tar
train_data_upsampling_factors: None
train_num_samples: 1600000
use_bn_sync: False
val_data: None
val_frequency: 1
val_num_samples: None
wandb: False
wandb_notes:
wandb_project_name: open-clip
warmup: 500
wd: 0.2
workers: 4
world_size: 2
zeroshot_frequency: 2
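params.txt mirrors the Params block from out.log as plain "key: value" lines, one per argument, written once at startup as a record of the run. A minimal sketch for reading it back into a Python dict, if needed for analysis; the literal_eval type coercion is an assumption, and open_clip itself does not reload this file.

from ast import literal_eval

def load_params(path="params.txt"):
    params = {}
    with open(path) as f:
        for line in f:
            if ":" not in line:
                continue
            key, _, value = line.partition(":")  # split on the first colon only
            value = value.strip()
            try:
                # Coerce numbers, True/False/None, and {} back to Python objects.
                params[key.strip()] = literal_eval(value)
            except (ValueError, SyntaxError):
                # Keep paths, names, and URLs (e.g. cuda:0, env://) as strings.
                params[key.strip()] = value
    return params

# e.g. load_params()["lr"] -> 0.0005, load_params()["device"] -> "cuda:0"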