add remote code + model files
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- .hydra/config.yaml +1 -1
- checkpoints/step-000000209715200.pt +2 -2
- checkpoints/step-000000419430400.pt +2 -2
- checkpoints/step-000000629145600.pt +2 -2
- checkpoints/step-000000838860800.pt +2 -2
- checkpoints/step-000001048576000.pt +2 -2
- checkpoints/step-000001258291200.pt +2 -2
- checkpoints/step-000001468006400.pt +2 -2
- checkpoints/step-000001677721600.pt +2 -2
- checkpoints/step-000001887436800.pt +2 -2
- config.yaml +1 -1
- decay_params.txt +13 -13
- logs/2025-10-26_21-16-14.log +258 -0
- metrics/jsonlines/checkpoint.jsonl +9 -9
- metrics/jsonlines/norm.jsonl +0 -0
- metrics/jsonlines/throughput.jsonl +0 -0
- metrics/jsonlines/train.jsonl +98 -98
- metrics/jsonlines/train_eval.jsonl +19 -19
- metrics/jsonlines/val.jsonl +49 -49
- metrics/npz/train_eval/step-000000104857600.npz +1 -1
- metrics/npz/train_eval/step-000000209715200.npz +1 -1
- metrics/npz/train_eval/step-000000314572800.npz +1 -1
- metrics/npz/train_eval/step-000000419430400.npz +1 -1
- metrics/npz/train_eval/step-000000524288000.npz +1 -1
- metrics/npz/train_eval/step-000000629145600.npz +1 -1
- metrics/npz/train_eval/step-000000734003200.npz +1 -1
- metrics/npz/train_eval/step-000000838860800.npz +1 -1
- metrics/npz/train_eval/step-000000943718400.npz +1 -1
- metrics/npz/train_eval/step-000001048576000.npz +1 -1
- metrics/npz/train_eval/step-000001153433600.npz +1 -1
- metrics/npz/train_eval/step-000001258291200.npz +1 -1
- metrics/npz/train_eval/step-000001363148800.npz +1 -1
- metrics/npz/train_eval/step-000001468006400.npz +1 -1
- metrics/npz/train_eval/step-000001572864000.npz +1 -1
- metrics/npz/train_eval/step-000001677721600.npz +1 -1
- metrics/npz/train_eval/step-000001782579200.npz +1 -1
- metrics/npz/train_eval/step-000001887436800.npz +1 -1
- metrics/npz/train_eval/step-000001992294400.npz +1 -1
- metrics/npz/val/step-000000041943040.npz +1 -1
- metrics/npz/val/step-000000083886080.npz +1 -1
- metrics/npz/val/step-000000125829120.npz +1 -1
- metrics/npz/val/step-000000167772160.npz +1 -1
- metrics/npz/val/step-000000209715200.npz +1 -1
- metrics/npz/val/step-000000251658240.npz +1 -1
- metrics/npz/val/step-000000293601280.npz +1 -1
- metrics/npz/val/step-000000335544320.npz +1 -1
- metrics/npz/val/step-000000377487360.npz +1 -1
- metrics/npz/val/step-000000419430400.npz +1 -1
- metrics/npz/val/step-000000461373440.npz +1 -1
- metrics/npz/val/step-000000503316480.npz +1 -1
.hydra/config.yaml
CHANGED
|
@@ -81,7 +81,7 @@ train:
|
|
| 81 |
max_tokens: 2097152000
|
| 82 |
grad_acc_tokens: 32768
|
| 83 |
max_grad_norm: 1.0
|
| 84 |
-
gradient_checkpointing:
|
| 85 |
bias_weight_decay: false
|
| 86 |
normalization_weight_decay: false
|
| 87 |
conv_weight_decay: true
|
|
|
|
| 81 |
max_tokens: 2097152000
|
| 82 |
grad_acc_tokens: 32768
|
| 83 |
max_grad_norm: 1.0
|
| 84 |
+
gradient_checkpointing: false
|
| 85 |
bias_weight_decay: false
|
| 86 |
normalization_weight_decay: false
|
| 87 |
conv_weight_decay: true
|
checkpoints/step-000000209715200.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a9e45d1cf8fcb47d3de97c6d87e5f89f3999fa51cf1153d98e06ddd01738884a
|
| 3 |
+
size 329409794
|
checkpoints/step-000000419430400.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:76200303f1e014e549314ee020fa0e5d22b5df5ab722b78939721493230cd0e9
|
| 3 |
+
size 329409794
|
checkpoints/step-000000629145600.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0edce8777efa82db6db58947a86af1e70965bcd8111d157ab482d93509e950ae
|
| 3 |
+
size 329409794
|
checkpoints/step-000000838860800.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5669d373118b51004bf8aea4fe7a13ee19dbdc68f9e312defe7b150448fe71b1
|
| 3 |
+
size 329409794
|
checkpoints/step-000001048576000.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ee4dbd5ba0786230f8dc3da5b8d9004fc0a397ba0c66cb84281ac680baecca2d
|
| 3 |
+
size 329409794
|
checkpoints/step-000001258291200.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f97eccdcb9b8e784360b41031ee52a305bbfa2f5f86aded5ce80cd7ba2f8fa26
|
| 3 |
+
size 329409794
|
checkpoints/step-000001468006400.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a9966465586a4d501839d48e7386d69869fbc6a1c9d5fb1b6f332d41f5b76b2b
|
| 3 |
+
size 329409794
|
checkpoints/step-000001677721600.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:812207ef44485fecc552a69460100dd3edeb92b60f525c24ec4731075d854566
|
| 3 |
+
size 329409794
|
checkpoints/step-000001887436800.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:12204d8faa3d1b317437b6a40475416449528599d5a3e0eabd4f077a5f4c8544
|
| 3 |
+
size 329409794
|
config.yaml
CHANGED
|
@@ -81,7 +81,7 @@ train:
|
|
| 81 |
max_tokens: 2097152000
|
| 82 |
grad_acc_tokens: 32768
|
| 83 |
max_grad_norm: 1.0
|
| 84 |
-
gradient_checkpointing:
|
| 85 |
bias_weight_decay: false
|
| 86 |
normalization_weight_decay: false
|
| 87 |
conv_weight_decay: true
|
|
|
|
| 81 |
max_tokens: 2097152000
|
| 82 |
grad_acc_tokens: 32768
|
| 83 |
max_grad_norm: 1.0
|
| 84 |
+
gradient_checkpointing: false
|
| 85 |
bias_weight_decay: false
|
| 86 |
normalization_weight_decay: false
|
| 87 |
conv_weight_decay: true
|
decay_params.txt
CHANGED
|
@@ -1,14 +1,14 @@
|
|
| 1 |
-
_forward_module._fsdp_wrapped_module.
|
| 2 |
-
_forward_module._fsdp_wrapped_module.
|
| 3 |
-
_forward_module._fsdp_wrapped_module.
|
| 4 |
-
_forward_module._fsdp_wrapped_module.
|
| 5 |
-
_forward_module._fsdp_wrapped_module.
|
| 6 |
-
_forward_module._fsdp_wrapped_module.
|
| 7 |
-
_forward_module._fsdp_wrapped_module.
|
| 8 |
-
_forward_module._fsdp_wrapped_module.
|
| 9 |
-
_forward_module._fsdp_wrapped_module.
|
| 10 |
-
_forward_module._fsdp_wrapped_module.
|
| 11 |
-
_forward_module._fsdp_wrapped_module.
|
| 12 |
-
_forward_module._fsdp_wrapped_module.
|
| 13 |
-
_forward_module._fsdp_wrapped_module.
|
| 14 |
_forward_module._fsdp_wrapped_module.lm_head.weight
|
|
|
|
| 1 |
+
_forward_module._fsdp_wrapped_module.emb.weight
|
| 2 |
+
_forward_module._fsdp_wrapped_module.layers.0.attn.q_proj.weight
|
| 3 |
+
_forward_module._fsdp_wrapped_module.layers.0.attn.k_proj.weight
|
| 4 |
+
_forward_module._fsdp_wrapped_module.layers.0.attn.v_proj.weight
|
| 5 |
+
_forward_module._fsdp_wrapped_module.layers.0.attn.o_proj.weight
|
| 6 |
+
_forward_module._fsdp_wrapped_module.layers.0.mlp.gate_proj.weight
|
| 7 |
+
_forward_module._fsdp_wrapped_module.layers.0.mlp.down_proj.weight
|
| 8 |
+
_forward_module._fsdp_wrapped_module.layers.1.attn.q_proj.weight
|
| 9 |
+
_forward_module._fsdp_wrapped_module.layers.1.attn.k_proj.weight
|
| 10 |
+
_forward_module._fsdp_wrapped_module.layers.1.attn.v_proj.weight
|
| 11 |
+
_forward_module._fsdp_wrapped_module.layers.1.attn.o_proj.weight
|
| 12 |
+
_forward_module._fsdp_wrapped_module.layers.1.mlp.gate_proj.weight
|
| 13 |
+
_forward_module._fsdp_wrapped_module.layers.1.mlp.down_proj.weight
|
| 14 |
_forward_module._fsdp_wrapped_module.lm_head.weight
|
logs/2025-10-26_21-16-14.log
ADDED
|
@@ -0,0 +1,258 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[2025-10-26 21:16:14][train:372][INFO] All outputs will be saved to `/workspace/forgetting-transformer/alibi_2_4_256`
|
| 2 |
+
[2025-10-26 21:16:14][train:375][INFO] Configuration:
|
| 3 |
+
[2025-10-26 21:16:14][train:380][INFO] Configuration saved to /workspace/forgetting-transformer/alibi_2_4_256/config.yaml.
|
| 4 |
+
[2025-10-26 21:16:14][train:387][INFO] creating datamodule
|
| 5 |
+
[2025-10-26 21:16:14][train:419][INFO] creating model
|
| 6 |
+
[2025-10-26 21:16:15][train:440][INFO] creating optimizer
|
| 7 |
+
[2025-10-26 21:16:15][checkpoint:39][INFO] Not resuming. Deleting existing checkpoints...
|
| 8 |
+
[2025-10-26 21:16:15][logger:256][INFO] Setting up wandb logger...
|
| 9 |
+
[2025-10-26 21:16:15][logger:272][INFO] Not resuming. Creating a new wandb run.
|
| 10 |
+
[2025-10-26 21:16:16][logger:288][INFO] wandb initialized. Run id: pun8f82u
|
| 11 |
+
[2025-10-26 21:16:16][logger:186][INFO] Setting up jsonlines logger...
|
| 12 |
+
[2025-10-26 21:16:16][logger:113][INFO] Setting up npz logger...
|
| 13 |
+
[2025-10-26 21:16:16][logger:171][INFO] [step: 0] [train_data_info/vocab_size: 50277] [train_data_info/global_tokens_per_batch: 2097152] [train_data_info/local_tokens_per_batch: 2097152] [train_data_info/batch_len: 2048] [train_data_info/seq_len: 2048] [train_data_info/total_tokens: 2055208960] [train_data_info/global_batch_size: 1024] [train_data_info/local_batch_size: 1024]
|
| 14 |
+
[2025-10-26 21:16:16][logger:171][INFO] [step: 0] [val_data_info/vocab_size: 50277] [val_data_info/global_tokens_per_batch: 2048] [val_data_info/local_tokens_per_batch: 2048] [val_data_info/batch_len: 2048] [val_data_info/seq_len: 2048] [val_data_info/total_tokens: 2147483648] [val_data_info/global_batch_size: 1] [val_data_info/local_batch_size: 1]
|
| 15 |
+
[2025-10-26 21:16:16][logger:171][INFO] [step: 0] [model_info/total_params: 27447040] [model_info/trainable_params: 27447040] [model_info/embedding_params: 12870912] [model_info/flops_per_token: 0] [model_info/non_embedding_params: 14576128]
|
| 16 |
+
[2025-10-26 21:17:13][utils:57][INFO] [P: 1.00%] [S: 20971520/2097152000] [T: 0:00:57] [ETA: 1:34:15] [loss: 10.077] [tokens/s: 392645.003] [batches/s: 0.187] [MFU: 0.000] [TFLOPS: 0.000]
|
| 17 |
+
[2025-10-26 21:18:06][utils:57][INFO] [P: 2.00%] [S: 41943040/2097152000] [T: 0:01:50] [ETA: 1:30:15] [loss: 8.170] [tokens/s: 392713.958] [batches/s: 0.187] [MFU: 0.000] [TFLOPS: 0.000]
|
| 18 |
+
[2025-10-26 21:18:06][train:194][INFO] Running validation...
|
| 19 |
+
[2025-10-26 21:19:46][logger:171][INFO] [step: 41943040] [val/train_token_count: 41943040] [val/train_batch_count: 20] [val/train_flop_count: 0] [val/train_total_time: 110.518] [val/train_update_time: 110.195] [val/loss: 8.073] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 99.972] [val/val_tokens_per_second: 409716.276] [val/loss_avg_len_2048: 8.073] [val/perplexity_len_2048: 3205.650] [val/loss_avg_len_1024: 8.071] [val/perplexity_len_1024: 3201.383] [val/loss_avg_len_512: 8.072] [val/perplexity_len_512: 3203.464]
|
| 20 |
+
[2025-10-26 21:20:40][utils:57][INFO] [P: 3.00%] [S: 62914560/2097152000] [T: 0:04:23] [ETA: 2:22:11] [loss: 7.760] [tokens/s: 238672.825] [batches/s: 0.114] [MFU: 0.000] [TFLOPS: 0.000]
|
| 21 |
+
[2025-10-26 21:21:33][utils:57][INFO] [P: 4.00%] [S: 83886080/2097152000] [T: 0:05:17] [ETA: 2:06:53] [loss: 7.535] [tokens/s: 265381.208] [batches/s: 0.127] [MFU: 0.000] [TFLOPS: 0.000]
|
| 22 |
+
[2025-10-26 21:21:33][train:194][INFO] Running validation...
|
| 23 |
+
[2025-10-26 21:23:13][logger:171][INFO] [step: 83886080] [val/train_token_count: 83886080] [val/train_batch_count: 40] [val/train_flop_count: 0] [val/train_total_time: 317.249] [val/train_update_time: 216.731] [val/loss: 7.520] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 99.601] [val/val_tokens_per_second: 411240.277] [val/loss_avg_len_2048: 7.520] [val/perplexity_len_2048: 1844.219] [val/loss_avg_len_1024: 7.521] [val/perplexity_len_1024: 1846.058] [val/loss_avg_len_512: 7.526] [val/perplexity_len_512: 1855.284]
|
| 24 |
+
[2025-10-26 21:24:06][utils:57][INFO] [P: 5.00%] [S: 104857600/2097152000] [T: 0:07:50] [ETA: 2:28:54] [loss: 7.356] [tokens/s: 222818.512] [batches/s: 0.106] [MFU: 0.000] [TFLOPS: 0.000]
|
| 25 |
+
[2025-10-26 21:24:06][logger:171][INFO] [step: 104857600] [train_eval/train_token_count: 104857600] [train_eval/train_batch_count: 50] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 470.240] [train_eval/train_update_time: 270.009] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 8.521] [train_eval/perplexity_len_2048: 5019.001] [train_eval/loss_avg_len_1024: 8.522] [train_eval/perplexity_len_1024: 5026.594] [train_eval/loss_avg_len_512: 8.524] [train_eval/perplexity_len_512: 5034.671]
|
| 26 |
+
[2025-10-26 21:24:59][utils:57][INFO] [P: 6.00%] [S: 125829120/2097152000] [T: 0:08:43] [ETA: 2:16:43] [loss: 7.169] [tokens/s: 240455.609] [batches/s: 0.115] [MFU: 0.000] [TFLOPS: 0.000]
|
| 27 |
+
[2025-10-26 21:24:59][train:194][INFO] Running validation...
|
| 28 |
+
[2025-10-26 21:26:39][logger:171][INFO] [step: 125829120] [val/train_token_count: 125829120] [val/train_batch_count: 60] [val/train_flop_count: 0] [val/train_total_time: 523.628] [val/train_update_time: 323.282] [val/loss: 7.165] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 99.442] [val/val_tokens_per_second: 411897.959] [val/loss_avg_len_2048: 7.165] [val/perplexity_len_2048: 1292.821] [val/loss_avg_len_1024: 7.167] [val/perplexity_len_1024: 1295.904] [val/loss_avg_len_512: 7.175] [val/perplexity_len_512: 1306.548]
|
| 29 |
+
[2025-10-26 21:27:32][utils:57][INFO] [P: 7.00%] [S: 146800640/2097152000] [T: 0:11:16] [ETA: 2:29:47] [loss: 7.043] [tokens/s: 216814.165] [batches/s: 0.103] [MFU: 0.000] [TFLOPS: 0.000]
|
| 30 |
+
[2025-10-26 21:28:26][utils:57][INFO] [P: 8.00%] [S: 167772160/2097152000] [T: 0:12:09] [ETA: 2:19:53] [loss: 6.880] [tokens/s: 229852.701] [batches/s: 0.110] [MFU: 0.000] [TFLOPS: 0.000]
|
| 31 |
+
[2025-10-26 21:28:26][train:194][INFO] Running validation...
|
| 32 |
+
[2025-10-26 21:30:06][logger:171][INFO] [step: 167772160] [val/train_token_count: 167772160] [val/train_batch_count: 80] [val/train_flop_count: 0] [val/train_total_time: 729.843] [val/train_update_time: 429.837] [val/loss: 6.866] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 100.848] [val/val_tokens_per_second: 406155.409] [val/loss_avg_len_2048: 6.866] [val/perplexity_len_2048: 959.019] [val/loss_avg_len_1024: 6.870] [val/perplexity_len_1024: 963.405] [val/loss_avg_len_512: 6.883] [val/perplexity_len_512: 975.280]
|
| 33 |
+
[2025-10-26 21:31:00][utils:57][INFO] [P: 9.00%] [S: 188743680/2097152000] [T: 0:14:44] [ETA: 2:28:59] [loss: 6.733] [tokens/s: 213304.064] [batches/s: 0.102] [MFU: 0.000] [TFLOPS: 0.000]
|
| 34 |
+
[2025-10-26 21:31:53][utils:57][INFO] [P: 10.00%] [S: 209715200/2097152000] [T: 0:15:37] [ETA: 2:20:37] [loss: 6.633] [tokens/s: 223627.540] [batches/s: 0.107] [MFU: 0.000] [TFLOPS: 0.000]
|
| 35 |
+
[2025-10-26 21:31:53][logger:171][INFO] [step: 209715200] [train_eval/train_token_count: 209715200] [train_eval/train_batch_count: 100] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 937.465] [train_eval/train_update_time: 536.388] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 6.966] [train_eval/perplexity_len_2048: 1060.422] [train_eval/loss_avg_len_1024: 6.972] [train_eval/perplexity_len_1024: 1066.603] [train_eval/loss_avg_len_512: 6.982] [train_eval/perplexity_len_512: 1077.249]
|
| 36 |
+
[2025-10-26 21:31:53][train:194][INFO] Running validation...
|
| 37 |
+
[2025-10-26 21:33:33][logger:171][INFO] [step: 209715200] [val/train_token_count: 209715200] [val/train_batch_count: 100] [val/train_flop_count: 0] [val/train_total_time: 937.465] [val/train_update_time: 536.388] [val/loss: 6.622] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 100.013] [val/val_tokens_per_second: 409544.725] [val/loss_avg_len_2048: 6.622] [val/perplexity_len_2048: 751.358] [val/loss_avg_len_1024: 6.628] [val/perplexity_len_1024: 756.021] [val/loss_avg_len_512: 6.644] [val/perplexity_len_512: 767.964]
|
| 38 |
+
[2025-10-26 21:33:33][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000000209715200.pt...
|
| 39 |
+
[2025-10-26 21:33:34][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000000209715200.pt.
|
| 40 |
+
[2025-10-26 21:33:34][logger:171][INFO] [step: 209715200] [checkpoint/checkpoint_time: 0.443]
|
| 41 |
+
[2025-10-26 21:34:27][utils:57][INFO] [P: 11.00%] [S: 230686720/2097152000] [T: 0:18:11] [ETA: 2:27:09] [loss: 6.560] [tokens/s: 201797.452] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000]
|
| 42 |
+
[2025-10-26 21:35:20][utils:57][INFO] [P: 12.00%] [S: 251658240/2097152000] [T: 0:19:04] [ETA: 2:19:54] [loss: 6.428] [tokens/s: 223522.183] [batches/s: 0.107] [MFU: 0.000] [TFLOPS: 0.000]
|
| 43 |
+
[2025-10-26 21:35:20][train:194][INFO] Running validation...
|
| 44 |
+
[2025-10-26 21:37:01][logger:171][INFO] [step: 251658240] [val/train_token_count: 251658240] [val/train_batch_count: 120] [val/train_flop_count: 0] [val/train_total_time: 1144.691] [val/train_update_time: 642.942] [val/loss: 6.437] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 100.360] [val/val_tokens_per_second: 408129.969] [val/loss_avg_len_2048: 6.437] [val/perplexity_len_2048: 624.344] [val/loss_avg_len_1024: 6.444] [val/perplexity_len_1024: 628.922] [val/loss_avg_len_512: 6.462] [val/perplexity_len_512: 640.135]
|
| 45 |
+
[2025-10-26 21:37:54][utils:57][INFO] [P: 13.00%] [S: 272629760/2097152000] [T: 0:21:38] [ETA: 2:24:49] [loss: 6.382] [tokens/s: 201724.230] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000]
|
| 46 |
+
[2025-10-26 21:38:48][utils:57][INFO] [P: 14.00%] [S: 293601280/2097152000] [T: 0:22:31] [ETA: 2:18:23] [loss: 6.308] [tokens/s: 223338.996] [batches/s: 0.106] [MFU: 0.000] [TFLOPS: 0.000]
|
| 47 |
+
[2025-10-26 21:38:48][train:194][INFO] Running validation...
|
| 48 |
+
[2025-10-26 21:40:27][logger:171][INFO] [step: 293601280] [val/train_token_count: 293601280] [val/train_batch_count: 140] [val/train_flop_count: 0] [val/train_total_time: 1351.811] [val/train_update_time: 749.497] [val/loss: 6.287] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 99.596] [val/val_tokens_per_second: 411261.156] [val/loss_avg_len_2048: 6.287] [val/perplexity_len_2048: 537.394] [val/loss_avg_len_1024: 6.295] [val/perplexity_len_1024: 541.773] [val/loss_avg_len_512: 6.314] [val/perplexity_len_512: 552.194]
|
| 49 |
+
[2025-10-26 21:41:21][utils:57][INFO] [P: 15.00%] [S: 314572800/2097152000] [T: 0:25:04] [ETA: 2:22:07] [loss: 6.193] [tokens/s: 201726.821] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000]
|
| 50 |
+
[2025-10-26 21:41:21][logger:171][INFO] [step: 314572800] [train_eval/train_token_count: 314572800] [train_eval/train_batch_count: 150] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 1504.793] [train_eval/train_update_time: 802.770] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 6.409] [train_eval/perplexity_len_2048: 607.101] [train_eval/loss_avg_len_1024: 6.417] [train_eval/perplexity_len_1024: 612.351] [train_eval/loss_avg_len_512: 6.435] [train_eval/perplexity_len_512: 623.146]
|
| 51 |
+
[2025-10-26 21:42:14][utils:57][INFO] [P: 16.00%] [S: 335544320/2097152000] [T: 0:25:58] [ETA: 2:16:20] [loss: 6.161] [tokens/s: 223306.638] [batches/s: 0.106] [MFU: 0.000] [TFLOPS: 0.000]
|
| 52 |
+
[2025-10-26 21:42:14][train:194][INFO] Running validation...
|
| 53 |
+
[2025-10-26 21:43:54][logger:171][INFO] [step: 335544320] [val/train_token_count: 335544320] [val/train_batch_count: 160] [val/train_flop_count: 0] [val/train_total_time: 1558.167] [val/train_update_time: 856.038] [val/loss: 6.162] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 99.557] [val/val_tokens_per_second: 411423.777] [val/loss_avg_len_2048: 6.162] [val/perplexity_len_2048: 474.439] [val/loss_avg_len_1024: 6.171] [val/perplexity_len_1024: 478.526] [val/loss_avg_len_512: 6.191] [val/perplexity_len_512: 488.116]
|
| 54 |
+
[2025-10-26 21:44:47][utils:57][INFO] [P: 17.00%] [S: 356515840/2097152000] [T: 0:28:31] [ETA: 2:19:14] [loss: 6.076] [tokens/s: 201707.332] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000]
|
| 55 |
+
[2025-10-26 21:45:40][utils:57][INFO] [P: 18.00%] [S: 377487360/2097152000] [T: 0:29:24] [ETA: 2:13:58] [loss: 6.050] [tokens/s: 223617.598] [batches/s: 0.107] [MFU: 0.000] [TFLOPS: 0.000]
|
| 56 |
+
[2025-10-26 21:45:40][train:194][INFO] Running validation...
|
| 57 |
+
[2025-10-26 21:47:20][logger:171][INFO] [step: 377487360] [val/train_token_count: 377487360] [val/train_batch_count: 180] [val/train_flop_count: 0] [val/train_total_time: 1764.496] [val/train_update_time: 962.595] [val/loss: 6.044] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 99.502] [val/val_tokens_per_second: 411648.137] [val/loss_avg_len_2048: 6.044] [val/perplexity_len_2048: 421.379] [val/loss_avg_len_1024: 6.053] [val/perplexity_len_1024: 425.264] [val/loss_avg_len_512: 6.074] [val/perplexity_len_512: 434.297]
|
| 58 |
+
[2025-10-26 21:48:13][utils:57][INFO] [P: 19.00%] [S: 398458880/2097152000] [T: 0:31:57] [ETA: 2:16:14] [loss: 6.026] [tokens/s: 201973.076] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000]
|
| 59 |
+
[2025-10-26 21:49:07][utils:57][INFO] [P: 20.00%] [S: 419430400/2097152000] [T: 0:32:50] [ETA: 2:11:23] [loss: 5.940] [tokens/s: 223854.498] [batches/s: 0.107] [MFU: 0.000] [TFLOPS: 0.000]
|
| 60 |
+
[2025-10-26 21:49:07][logger:171][INFO] [step: 419430400] [train_eval/train_token_count: 419430400] [train_eval/train_batch_count: 200] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 1970.751] [train_eval/train_update_time: 1069.133] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 6.078] [train_eval/perplexity_len_2048: 436.312] [train_eval/loss_avg_len_1024: 6.089] [train_eval/perplexity_len_1024: 440.880] [train_eval/loss_avg_len_512: 6.108] [train_eval/perplexity_len_512: 449.289]
|
| 61 |
+
[2025-10-26 21:49:07][train:194][INFO] Running validation...
|
| 62 |
+
[2025-10-26 21:50:46][logger:171][INFO] [step: 419430400] [val/train_token_count: 419430400] [val/train_batch_count: 200] [val/train_flop_count: 0] [val/train_total_time: 1970.751] [val/train_update_time: 1069.133] [val/loss: 5.947] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 99.669] [val/val_tokens_per_second: 410960.915] [val/loss_avg_len_2048: 5.947] [val/perplexity_len_2048: 382.423] [val/loss_avg_len_1024: 5.956] [val/perplexity_len_1024: 385.960] [val/loss_avg_len_512: 5.977] [val/perplexity_len_512: 394.296]
|
| 63 |
+
[2025-10-26 21:50:46][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000000419430400.pt...
|
| 64 |
+
[2025-10-26 21:50:47][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000000419430400.pt.
|
| 65 |
+
[2025-10-26 21:50:47][logger:171][INFO] [step: 419430400] [checkpoint/checkpoint_time: 0.435]
|
| 66 |
+
[2025-10-26 21:51:40][utils:57][INFO] [P: 21.00%] [S: 440401920/2097152000] [T: 0:35:24] [ETA: 2:13:11] [loss: 5.902] [tokens/s: 202045.969] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000]
|
| 67 |
+
[2025-10-26 21:52:33][utils:57][INFO] [P: 22.00%] [S: 461373440/2097152000] [T: 0:36:17] [ETA: 2:08:40] [loss: 5.880] [tokens/s: 223919.256] [batches/s: 0.107] [MFU: 0.000] [TFLOPS: 0.000]
|
| 68 |
+
[2025-10-26 21:52:33][train:194][INFO] Running validation...
|
| 69 |
+
[2025-10-26 21:54:13][logger:171][INFO] [step: 461373440] [val/train_token_count: 461373440] [val/train_batch_count: 220] [val/train_flop_count: 0] [val/train_total_time: 2177.606] [val/train_update_time: 1175.684] [val/loss: 5.860] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 99.702] [val/val_tokens_per_second: 410824.908] [val/loss_avg_len_2048: 5.860] [val/perplexity_len_2048: 350.594] [val/loss_avg_len_1024: 5.869] [val/perplexity_len_1024: 354.067] [val/loss_avg_len_512: 5.892] [val/perplexity_len_512: 362.086]
|
| 70 |
+
[2025-10-26 21:55:06][utils:57][INFO] [P: 23.00%] [S: 482344960/2097152000] [T: 0:38:50] [ETA: 2:10:02] [loss: 5.811] [tokens/s: 202177.051] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000]
|
| 71 |
+
[2025-10-26 21:56:00][utils:57][INFO] [P: 24.00%] [S: 503316480/2097152000] [T: 0:39:44] [ETA: 2:05:49] [loss: 5.756] [tokens/s: 223897.508] [batches/s: 0.107] [MFU: 0.000] [TFLOPS: 0.000]
|
| 72 |
+
[2025-10-26 21:56:00][train:194][INFO] Running validation...
|
| 73 |
+
[2025-10-26 21:57:39][logger:171][INFO] [step: 503316480] [val/train_token_count: 503316480] [val/train_batch_count: 240] [val/train_flop_count: 0] [val/train_total_time: 2384.052] [val/train_update_time: 1282.224] [val/loss: 5.783] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 99.661] [val/val_tokens_per_second: 410995.065] [val/loss_avg_len_2048: 5.783] [val/perplexity_len_2048: 324.736] [val/loss_avg_len_1024: 5.794] [val/perplexity_len_1024: 328.183] [val/loss_avg_len_512: 5.817] [val/perplexity_len_512: 335.952]
|
| 74 |
+
[2025-10-26 21:58:33][utils:57][INFO] [P: 25.00%] [S: 524288000/2097152000] [T: 0:42:17] [ETA: 2:06:51] [loss: 5.763] [tokens/s: 202168.583] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000]
|
| 75 |
+
[2025-10-26 21:58:33][logger:171][INFO] [step: 524288000] [train_eval/train_token_count: 524288000] [train_eval/train_batch_count: 250] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 2537.099] [train_eval/train_update_time: 1335.492] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.844] [train_eval/perplexity_len_2048: 345.304] [train_eval/loss_avg_len_1024: 5.854] [train_eval/perplexity_len_1024: 348.747] [train_eval/loss_avg_len_512: 5.874] [train_eval/perplexity_len_512: 355.783]
|
| 76 |
+
[2025-10-26 21:59:26][utils:57][INFO] [P: 26.00%] [S: 545259520/2097152000] [T: 0:43:10] [ETA: 2:02:52] [loss: 5.718] [tokens/s: 223871.889] [batches/s: 0.107] [MFU: 0.000] [TFLOPS: 0.000]
|
| 77 |
+
[2025-10-26 21:59:26][train:194][INFO] Running validation...
|
| 78 |
+
[2025-10-26 22:01:06][logger:171][INFO] [step: 545259520] [val/train_token_count: 545259520] [val/train_batch_count: 260] [val/train_flop_count: 0] [val/train_total_time: 2590.474] [val/train_update_time: 1388.760] [val/loss: 5.716] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 99.564] [val/val_tokens_per_second: 411393.353] [val/loss_avg_len_2048: 5.716] [val/perplexity_len_2048: 303.822] [val/loss_avg_len_1024: 5.727] [val/perplexity_len_1024: 307.171] [val/loss_avg_len_512: 5.751] [val/perplexity_len_512: 314.628]
|
| 79 |
+
[2025-10-26 22:01:59][utils:57][INFO] [P: 27.00%] [S: 566231040/2097152000] [T: 0:45:43] [ETA: 2:03:37] [loss: 5.693] [tokens/s: 202168.506] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000]
|
| 80 |
+
[2025-10-26 22:02:53][utils:57][INFO] [P: 28.00%] [S: 587202560/2097152000] [T: 0:46:36] [ETA: 1:59:51] [loss: 5.643] [tokens/s: 223861.520] [batches/s: 0.107] [MFU: 0.000] [TFLOPS: 0.000]
|
| 81 |
+
[2025-10-26 22:02:53][train:194][INFO] Running validation...
|
| 82 |
+
[2025-10-26 22:04:32][logger:171][INFO] [step: 587202560] [val/train_token_count: 587202560] [val/train_batch_count: 280] [val/train_flop_count: 0] [val/train_total_time: 2796.791] [val/train_update_time: 1495.303] [val/loss: 5.650] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 99.505] [val/val_tokens_per_second: 411637.736] [val/loss_avg_len_2048: 5.650] [val/perplexity_len_2048: 284.241] [val/loss_avg_len_1024: 5.661] [val/perplexity_len_1024: 287.545] [val/loss_avg_len_512: 5.686] [val/perplexity_len_512: 294.724]
|
| 83 |
+
[2025-10-26 22:05:25][utils:57][INFO] [P: 29.00%] [S: 608174080/2097152000] [T: 0:49:09] [ETA: 2:00:21] [loss: 5.611] [tokens/s: 202167.574] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000]
|
| 84 |
+
[2025-10-26 22:06:19][utils:57][INFO] [P: 30.00%] [S: 629145600/2097152000] [T: 0:50:03] [ETA: 1:56:47] [loss: 5.585] [tokens/s: 224004.908] [batches/s: 0.107] [MFU: 0.000] [TFLOPS: 0.000]
|
| 85 |
+
[2025-10-26 22:06:19][logger:171][INFO] [step: 629145600] [train_eval/train_token_count: 629145600] [train_eval/train_batch_count: 300] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 3003.058] [train_eval/train_update_time: 1601.853] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.678] [train_eval/perplexity_len_2048: 292.453] [train_eval/loss_avg_len_1024: 5.688] [train_eval/perplexity_len_1024: 295.384] [train_eval/loss_avg_len_512: 5.709] [train_eval/perplexity_len_512: 301.718]
|
| 86 |
+
[2025-10-26 22:06:19][train:194][INFO] Running validation...
|
| 87 |
+
[2025-10-26 22:07:58][logger:171][INFO] [step: 629145600] [val/train_token_count: 629145600] [val/train_batch_count: 300] [val/train_flop_count: 0] [val/train_total_time: 3003.058] [val/train_update_time: 1601.853] [val/loss: 5.596] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 99.440] [val/val_tokens_per_second: 411904.901] [val/loss_avg_len_2048: 5.596] [val/perplexity_len_2048: 269.218] [val/loss_avg_len_1024: 5.607] [val/perplexity_len_1024: 272.364] [val/loss_avg_len_512: 5.632] [val/perplexity_len_512: 279.226]
|
| 88 |
+
[2025-10-26 22:07:58][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000000629145600.pt...
|
| 89 |
+
[2025-10-26 22:07:59][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000000629145600.pt.
|
| 90 |
+
[2025-10-26 22:07:59][logger:171][INFO] [step: 629145600] [checkpoint/checkpoint_time: 0.449]
|
| 91 |
+
[2025-10-26 22:08:52][utils:57][INFO] [P: 31.00%] [S: 650117120/2097152000] [T: 0:52:36] [ETA: 1:57:05] [loss: 5.594] [tokens/s: 202209.727] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000]
|
| 92 |
+
[2025-10-26 22:09:45][utils:57][INFO] [P: 32.00%] [S: 671088640/2097152000] [T: 0:53:29] [ETA: 1:53:40] [loss: 5.528] [tokens/s: 223957.251] [batches/s: 0.107] [MFU: 0.000] [TFLOPS: 0.000]
|
| 93 |
+
[2025-10-26 22:09:45][train:194][INFO] Running validation...
|
| 94 |
+
[2025-10-26 22:11:25][logger:171][INFO] [step: 671088640] [val/train_token_count: 671088640] [val/train_batch_count: 320] [val/train_flop_count: 0] [val/train_total_time: 3209.704] [val/train_update_time: 1708.403] [val/loss: 5.543] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 99.426] [val/val_tokens_per_second: 411964.367] [val/loss_avg_len_2048: 5.543] [val/perplexity_len_2048: 255.348] [val/loss_avg_len_1024: 5.554] [val/perplexity_len_1024: 258.396] [val/loss_avg_len_512: 5.580] [val/perplexity_len_512: 264.968]
|
| 95 |
+
[2025-10-26 22:12:18][utils:57][INFO] [P: 33.00%] [S: 692060160/2097152000] [T: 0:56:02] [ETA: 1:53:46] [loss: 5.541] [tokens/s: 202263.397] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000]
|
| 96 |
+
[2025-10-26 22:13:12][utils:57][INFO] [P: 34.00%] [S: 713031680/2097152000] [T: 0:56:55] [ETA: 1:50:30] [loss: 5.502] [tokens/s: 224011.145] [batches/s: 0.107] [MFU: 0.000] [TFLOPS: 0.000]
|
| 97 |
+
[2025-10-26 22:13:12][train:194][INFO] Running validation...
|
| 98 |
+
[2025-10-26 22:14:53][logger:171][INFO] [step: 713031680] [val/train_token_count: 713031680] [val/train_batch_count: 340] [val/train_flop_count: 0] [val/train_total_time: 3415.885] [val/train_update_time: 1814.959] [val/loss: 5.496] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 101.346] [val/val_tokens_per_second: 404158.233] [val/loss_avg_len_2048: 5.496] [val/perplexity_len_2048: 243.690] [val/loss_avg_len_1024: 5.508] [val/perplexity_len_1024: 246.707] [val/loss_avg_len_512: 5.534] [val/perplexity_len_512: 253.158]
|
| 99 |
+
[2025-10-26 22:15:46][utils:57][INFO] [P: 35.00%] [S: 734003200/2097152000] [T: 0:59:30] [ETA: 1:50:31] [loss: 5.483] [tokens/s: 201930.682] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000]
|
| 100 |
+
[2025-10-26 22:15:46][logger:171][INFO] [step: 734003200] [train_eval/train_token_count: 734003200] [train_eval/train_batch_count: 350] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 3570.609] [train_eval/train_update_time: 1868.230] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.536] [train_eval/perplexity_len_2048: 253.699] [train_eval/loss_avg_len_1024: 5.548] [train_eval/perplexity_len_1024: 256.749] [train_eval/loss_avg_len_512: 5.571] [train_eval/perplexity_len_512: 262.706]
|
| 101 |
+
[2025-10-26 22:16:40][utils:57][INFO] [P: 36.00%] [S: 754974720/2097152000] [T: 1:00:23] [ETA: 1:47:22] [loss: 5.410] [tokens/s: 223584.636] [batches/s: 0.107] [MFU: 0.000] [TFLOPS: 0.000]
|
| 102 |
+
[2025-10-26 22:16:40][train:194][INFO] Running validation...
|
| 103 |
+
[2025-10-26 22:18:21][logger:171][INFO] [step: 754974720] [val/train_token_count: 754974720] [val/train_batch_count: 360] [val/train_flop_count: 0] [val/train_total_time: 3623.981] [val/train_update_time: 1921.499] [val/loss: 5.454] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 101.680] [val/val_tokens_per_second: 402830.694] [val/loss_avg_len_2048: 5.454] [val/perplexity_len_2048: 233.736] [val/loss_avg_len_1024: 5.467] [val/perplexity_len_1024: 236.697] [val/loss_avg_len_512: 5.493] [val/perplexity_len_512: 242.938]
|
| 104 |
+
[2025-10-26 22:19:15][utils:57][INFO] [P: 37.00%] [S: 775946240/2097152000] [T: 1:02:59] [ETA: 1:47:14] [loss: 5.437] [tokens/s: 201513.247] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000]
|
| 105 |
+
[2025-10-26 22:20:08][utils:57][INFO] [P: 38.00%] [S: 796917760/2097152000] [T: 1:03:52] [ETA: 1:44:12] [loss: 5.413] [tokens/s: 223056.690] [batches/s: 0.106] [MFU: 0.000] [TFLOPS: 0.000]
|
| 106 |
+
[2025-10-26 22:20:08][train:194][INFO] Running validation...
|
| 107 |
+
[2025-10-26 22:21:49][logger:171][INFO] [step: 796917760] [val/train_token_count: 796917760] [val/train_batch_count: 380] [val/train_flop_count: 0] [val/train_total_time: 3832.435] [val/train_update_time: 2028.044] [val/loss: 5.414] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 100.824] [val/val_tokens_per_second: 406251.122] [val/loss_avg_len_2048: 5.414] [val/perplexity_len_2048: 224.640] [val/loss_avg_len_1024: 5.427] [val/perplexity_len_1024: 227.492] [val/loss_avg_len_512: 5.453] [val/perplexity_len_512: 233.492]
|
| 108 |
+
[2025-10-26 22:22:42][utils:57][INFO] [P: 39.00%] [S: 817889280/2097152000] [T: 1:06:26] [ETA: 1:43:55] [loss: 5.414] [tokens/s: 201250.400] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000]
|
| 109 |
+
[2025-10-26 22:23:36][utils:57][INFO] [P: 40.00%] [S: 838860800/2097152000] [T: 1:07:20] [ETA: 1:41:00] [loss: 5.335] [tokens/s: 222827.518] [batches/s: 0.106] [MFU: 0.000] [TFLOPS: 0.000]
|
| 110 |
+
[2025-10-26 22:23:36][logger:171][INFO] [step: 838860800] [train_eval/train_token_count: 838860800] [train_eval/train_batch_count: 400] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 4040.047] [train_eval/train_update_time: 2134.612] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.429] [train_eval/perplexity_len_2048: 227.924] [train_eval/loss_avg_len_1024: 5.441] [train_eval/perplexity_len_1024: 230.568] [train_eval/loss_avg_len_512: 5.464] [train_eval/perplexity_len_512: 235.990]
|
| 111 |
+
[2025-10-26 22:23:36][train:194][INFO] Running validation...
|
| 112 |
+
[2025-10-26 22:25:15][logger:171][INFO] [step: 838860800] [val/train_token_count: 838860800] [val/train_batch_count: 400] [val/train_flop_count: 0] [val/train_total_time: 4040.047] [val/train_update_time: 2134.612] [val/loss: 5.380] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 99.629] [val/val_tokens_per_second: 411125.218] [val/loss_avg_len_2048: 5.380] [val/perplexity_len_2048: 217.034] [val/loss_avg_len_1024: 5.393] [val/perplexity_len_1024: 219.868] [val/loss_avg_len_512: 5.419] [val/perplexity_len_512: 225.741]
|
| 113 |
+
[2025-10-26 22:25:15][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000000838860800.pt...
|
| 114 |
+
[2025-10-26 22:25:16][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000000838860800.pt.
|
| 115 |
+
[2025-10-26 22:25:16][logger:171][INFO] [step: 838860800] [checkpoint/checkpoint_time: 0.453]
|
| 116 |
+
[2025-10-26 22:26:09][utils:57][INFO] [P: 41.00%] [S: 859832320/2097152000] [T: 1:09:53] [ETA: 1:40:34] [loss: 5.345] [tokens/s: 201209.907] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000]
|
| 117 |
+
[2025-10-26 22:27:03][utils:57][INFO] [P: 42.00%] [S: 880803840/2097152000] [T: 1:10:46] [ETA: 1:37:44] [loss: 5.346] [tokens/s: 222662.081] [batches/s: 0.106] [MFU: 0.000] [TFLOPS: 0.000]
|
| 118 |
+
[2025-10-26 22:27:03][train:194][INFO] Running validation...
|
| 119 |
+
[2025-10-26 22:28:43][logger:171][INFO] [step: 880803840] [val/train_token_count: 880803840] [val/train_batch_count: 420] [val/train_flop_count: 0] [val/train_total_time: 4246.918] [val/train_update_time: 2241.157] [val/loss: 5.350] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 100.115] [val/val_tokens_per_second: 409127.671] [val/loss_avg_len_2048: 5.350] [val/perplexity_len_2048: 210.547] [val/loss_avg_len_1024: 5.363] [val/perplexity_len_1024: 213.278] [val/loss_avg_len_512: 5.389] [val/perplexity_len_512: 218.952]
|
| 120 |
+
[2025-10-26 22:29:36][utils:57][INFO] [P: 43.00%] [S: 901775360/2097152000] [T: 1:13:20] [ETA: 1:37:13] [loss: 5.333] [tokens/s: 201068.550] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000]
|
| 121 |
+
[2025-10-26 22:30:30][utils:57][INFO] [P: 44.00%] [S: 922746880/2097152000] [T: 1:14:13] [ETA: 1:34:28] [loss: 5.353] [tokens/s: 222950.672] [batches/s: 0.106] [MFU: 0.000] [TFLOPS: 0.000]
|
| 122 |
+
[2025-10-26 22:30:30][train:194][INFO] Running validation...
|
| 123 |
+
[2025-10-26 22:32:11][logger:171][INFO] [step: 922746880] [val/train_token_count: 922746880] [val/train_batch_count: 440] [val/train_flop_count: 0] [val/train_total_time: 4453.812] [val/train_update_time: 2347.708] [val/loss: 5.318] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 101.091] [val/val_tokens_per_second: 405178.307] [val/loss_avg_len_2048: 5.318] [val/perplexity_len_2048: 203.957] [val/loss_avg_len_1024: 5.331] [val/perplexity_len_1024: 206.669] [val/loss_avg_len_512: 5.358] [val/perplexity_len_512: 212.257]
|
| 124 |
+
[2025-10-26 22:33:04][utils:57][INFO] [P: 45.00%] [S: 943718400/2097152000] [T: 1:16:48] [ETA: 1:33:52] [loss: 5.297] [tokens/s: 201114.718] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000]
|
| 125 |
+
[2025-10-26 22:33:04][logger:171][INFO] [step: 943718400] [train_eval/train_token_count: 943718400] [train_eval/train_batch_count: 450] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 4608.292] [train_eval/train_update_time: 2400.978] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.341] [train_eval/perplexity_len_2048: 208.800] [train_eval/loss_avg_len_1024: 5.355] [train_eval/perplexity_len_1024: 211.619] [train_eval/loss_avg_len_512: 5.380] [train_eval/perplexity_len_512: 217.063]
|
| 126 |
+
[2025-10-26 22:33:57][utils:57][INFO] [P: 46.00%] [S: 964689920/2097152000] [T: 1:17:41] [ETA: 1:31:12] [loss: 5.275] [tokens/s: 223089.747] [batches/s: 0.106] [MFU: 0.000] [TFLOPS: 0.000]
|
| 127 |
+
[2025-10-26 22:33:57][train:194][INFO] Running validation...
|
| 128 |
+
[2025-10-26 22:35:39][logger:171][INFO] [step: 964689920] [val/train_token_count: 964689920] [val/train_batch_count: 460] [val/train_flop_count: 0] [val/train_total_time: 4661.669] [val/train_update_time: 2454.251] [val/loss: 5.292] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 101.327] [val/val_tokens_per_second: 404234.005] [val/loss_avg_len_2048: 5.292] [val/perplexity_len_2048: 198.670] [val/loss_avg_len_1024: 5.305] [val/perplexity_len_1024: 201.322] [val/loss_avg_len_512: 5.332] [val/perplexity_len_512: 206.791]
|
| 129 |
+
[2025-10-26 22:36:32][utils:57][INFO] [P: 47.00%] [S: 985661440/2097152000] [T: 1:20:16] [ETA: 1:30:31] [loss: 5.291] [tokens/s: 201181.011] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000]
|
| 130 |
+
[2025-10-26 22:37:26][utils:57][INFO] [P: 48.00%] [S: 1006632960/2097152000] [T: 1:21:09] [ETA: 1:27:55] [loss: 5.264] [tokens/s: 222967.653] [batches/s: 0.106] [MFU: 0.000] [TFLOPS: 0.000]
|
| 131 |
+
[2025-10-26 22:37:26][train:194][INFO] Running validation...
|
| 132 |
+
[2025-10-26 22:39:07][logger:171][INFO] [step: 1006632960] [val/train_token_count: 1006632960] [val/train_batch_count: 480] [val/train_flop_count: 0] [val/train_total_time: 4869.772] [val/train_update_time: 2560.803] [val/loss: 5.267] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 101.225] [val/val_tokens_per_second: 404642.453] [val/loss_avg_len_2048: 5.267] [val/perplexity_len_2048: 193.870] [val/loss_avg_len_1024: 5.281] [val/perplexity_len_1024: 196.476] [val/loss_avg_len_512: 5.308] [val/perplexity_len_512: 201.870]
|
| 133 |
+
[2025-10-26 22:40:00][utils:57][INFO] [P: 49.00%] [S: 1027604480/2097152000] [T: 1:23:44] [ETA: 1:27:09] [loss: 5.269] [tokens/s: 201109.121] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000]
|
| 134 |
+
[2025-10-26 22:40:54][utils:57][INFO] [P: 50.00%] [S: 1048576000/2097152000] [T: 1:24:37] [ETA: 1:24:37] [loss: 5.245] [tokens/s: 222703.660] [batches/s: 0.106] [MFU: 0.000] [TFLOPS: 0.000]
|
| 135 |
+
[2025-10-26 22:40:54][logger:171][INFO] [step: 1048576000] [train_eval/train_token_count: 1048576000] [train_eval/train_batch_count: 500] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 5077.749] [train_eval/train_update_time: 2667.341] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.274] [train_eval/perplexity_len_2048: 195.220] [train_eval/loss_avg_len_1024: 5.287] [train_eval/perplexity_len_1024: 197.736] [train_eval/loss_avg_len_512: 5.313] [train_eval/perplexity_len_512: 203.031]
|
| 136 |
+
[2025-10-26 22:40:54][train:194][INFO] Running validation...
|
| 137 |
+
[2025-10-26 22:42:35][logger:171][INFO] [step: 1048576000] [val/train_token_count: 1048576000] [val/train_batch_count: 500] [val/train_flop_count: 0] [val/train_total_time: 5077.749] [val/train_update_time: 2667.341] [val/loss: 5.244] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 101.128] [val/val_tokens_per_second: 405031.633] [val/loss_avg_len_2048: 5.244] [val/perplexity_len_2048: 189.484] [val/loss_avg_len_1024: 5.258] [val/perplexity_len_1024: 192.096] [val/loss_avg_len_512: 5.286] [val/perplexity_len_512: 197.462]
|
| 138 |
+
[2025-10-26 22:42:35][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000001048576000.pt...
|
| 139 |
+
[2025-10-26 22:42:35][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000001048576000.pt.
|
| 140 |
+
[2025-10-26 22:42:35][logger:171][INFO] [step: 1048576000] [checkpoint/checkpoint_time: 0.443]
|
| 141 |
+
[2025-10-26 22:43:28][utils:57][INFO] [P: 51.00%] [S: 1069547520/2097152000] [T: 1:27:12] [ETA: 1:23:47] [loss: 5.242] [tokens/s: 200822.919] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000]
|
| 142 |
+
[2025-10-26 22:44:22][utils:57][INFO] [P: 52.00%] [S: 1090519040/2097152000] [T: 1:28:06] [ETA: 1:21:19] [loss: 5.235] [tokens/s: 222358.851] [batches/s: 0.106] [MFU: 0.000] [TFLOPS: 0.000]
|
| 143 |
+
[2025-10-26 22:44:22][train:194][INFO] Running validation...
|
| 144 |
+
[2025-10-26 22:46:04][logger:171][INFO] [step: 1090519040] [val/train_token_count: 1090519040] [val/train_batch_count: 520] [val/train_flop_count: 0] [val/train_total_time: 5286.093] [val/train_update_time: 2773.886] [val/loss: 5.224] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 101.726] [val/val_tokens_per_second: 402649.191] [val/loss_avg_len_2048: 5.224] [val/perplexity_len_2048: 185.696] [val/loss_avg_len_1024: 5.238] [val/perplexity_len_1024: 188.252] [val/loss_avg_len_512: 5.265] [val/perplexity_len_512: 193.484]
|
| 145 |
+
[2025-10-26 22:46:57][utils:57][INFO] [P: 53.00%] [S: 1111490560/2097152000] [T: 1:30:41] [ETA: 1:20:25] [loss: 5.193] [tokens/s: 200512.953] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000]
|
| 146 |
+
[2025-10-26 22:47:50][utils:57][INFO] [P: 54.00%] [S: 1132462080/2097152000] [T: 1:31:34] [ETA: 1:18:00] [loss: 5.209] [tokens/s: 222204.047] [batches/s: 0.106] [MFU: 0.000] [TFLOPS: 0.000]
|
| 147 |
+
[2025-10-26 22:47:50][train:194][INFO] Running validation...
|
| 148 |
+
[2025-10-26 22:49:32][logger:171][INFO] [step: 1132462080] [val/train_token_count: 1132462080] [val/train_batch_count: 540] [val/train_flop_count: 0] [val/train_total_time: 5494.619] [val/train_update_time: 2880.433] [val/loss: 5.204] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 101.620] [val/val_tokens_per_second: 403068.537] [val/loss_avg_len_2048: 5.204] [val/perplexity_len_2048: 182.063] [val/loss_avg_len_1024: 5.218] [val/perplexity_len_1024: 184.597] [val/loss_avg_len_512: 5.246] [val/perplexity_len_512: 189.768]
|
| 149 |
+
[2025-10-26 22:50:25][utils:57][INFO] [P: 55.00%] [S: 1153433600/2097152000] [T: 1:34:09] [ETA: 1:17:02] [loss: 5.156] [tokens/s: 200408.069] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000]
|
| 150 |
+
[2025-10-26 22:50:25][logger:171][INFO] [step: 1153433600] [train_eval/train_token_count: 1153433600] [train_eval/train_batch_count: 550] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 5649.622] [train_eval/train_update_time: 2933.706] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.217] [train_eval/perplexity_len_2048: 184.366] [train_eval/loss_avg_len_1024: 5.227] [train_eval/perplexity_len_1024: 186.300] [train_eval/loss_avg_len_512: 5.251] [train_eval/perplexity_len_512: 190.837]
|
| 151 |
+
[2025-10-26 22:51:19][utils:57][INFO] [P: 56.00%] [S: 1174405120/2097152000] [T: 1:35:03] [ETA: 1:14:40] [loss: 5.195] [tokens/s: 222130.660] [batches/s: 0.106] [MFU: 0.000] [TFLOPS: 0.000]
|
| 152 |
+
[2025-10-26 22:51:19][train:194][INFO] Running validation...
|
| 153 |
+
[2025-10-26 22:52:59][logger:171][INFO] [step: 1174405120] [val/train_token_count: 1174405120] [val/train_batch_count: 560] [val/train_flop_count: 0] [val/train_total_time: 5703.017] [val/train_update_time: 2986.992] [val/loss: 5.188] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 100.531] [val/val_tokens_per_second: 407436.220] [val/loss_avg_len_2048: 5.188] [val/perplexity_len_2048: 179.085] [val/loss_avg_len_1024: 5.202] [val/perplexity_len_1024: 181.607] [val/loss_avg_len_512: 5.230] [val/perplexity_len_512: 186.735]
|
| 154 |
+
[2025-10-26 22:53:53][utils:57][INFO] [P: 57.00%] [S: 1195376640/2097152000] [T: 1:37:36] [ETA: 1:13:38] [loss: 5.166] [tokens/s: 200559.622] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000]
|
| 155 |
+
[2025-10-26 22:54:46][utils:57][INFO] [P: 58.00%] [S: 1216348160/2097152000] [T: 1:38:30] [ETA: 1:11:19] [loss: 5.197] [tokens/s: 222297.302] [batches/s: 0.106] [MFU: 0.000] [TFLOPS: 0.000]
|
| 156 |
+
[2025-10-26 22:54:46][train:194][INFO] Running validation...
|
| 157 |
+
[2025-10-26 22:56:26][logger:171][INFO] [step: 1216348160] [val/train_token_count: 1216348160] [val/train_batch_count: 580] [val/train_flop_count: 0] [val/train_total_time: 5910.319] [val/train_update_time: 3093.534] [val/loss: 5.172] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 100.040] [val/val_tokens_per_second: 409434.268] [val/loss_avg_len_2048: 5.172] [val/perplexity_len_2048: 176.189] [val/loss_avg_len_1024: 5.186] [val/perplexity_len_1024: 178.683] [val/loss_avg_len_512: 5.214] [val/perplexity_len_512: 183.758]
|
| 158 |
+
[2025-10-26 22:57:20][utils:57][INFO] [P: 59.00%] [S: 1237319680/2097152000] [T: 1:41:03] [ETA: 1:10:13] [loss: 5.171] [tokens/s: 200785.912] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000]
|
| 159 |
+
[2025-10-26 22:58:13][utils:57][INFO] [P: 60.00%] [S: 1258291200/2097152000] [T: 1:41:57] [ETA: 1:07:58] [loss: 5.175] [tokens/s: 222658.190] [batches/s: 0.106] [MFU: 0.000] [TFLOPS: 0.000]
|
| 160 |
+
[2025-10-26 22:58:13][logger:171][INFO] [step: 1258291200] [train_eval/train_token_count: 1258291200] [train_eval/train_batch_count: 600] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 6117.131] [train_eval/train_update_time: 3200.081] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.172] [train_eval/perplexity_len_2048: 176.260] [train_eval/loss_avg_len_1024: 5.182] [train_eval/perplexity_len_1024: 178.088] [train_eval/loss_avg_len_512: 5.207] [train_eval/perplexity_len_512: 182.607]
|
| 161 |
+
[2025-10-26 22:58:13][train:194][INFO] Running validation...
|
| 162 |
+
[2025-10-26 22:59:53][logger:171][INFO] [step: 1258291200] [val/train_token_count: 1258291200] [val/train_batch_count: 600] [val/train_flop_count: 0] [val/train_total_time: 6117.131] [val/train_update_time: 3200.081] [val/loss: 5.157] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 99.893] [val/val_tokens_per_second: 410039.290] [val/loss_avg_len_2048: 5.157] [val/perplexity_len_2048: 173.723] [val/loss_avg_len_1024: 5.172] [val/perplexity_len_1024: 176.211] [val/loss_avg_len_512: 5.200] [val/perplexity_len_512: 181.238]
|
| 163 |
+
[2025-10-26 22:59:53][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000001258291200.pt...
|
| 164 |
+
[2025-10-26 22:59:53][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000001258291200.pt.
|
| 165 |
+
[2025-10-26 22:59:53][logger:171][INFO] [step: 1258291200] [checkpoint/checkpoint_time: 0.449]
|
| 166 |
+
[2025-10-26 23:00:47][utils:57][INFO] [P: 61.00%] [S: 1279262720/2097152000] [T: 1:44:30] [ETA: 1:06:49] [loss: 5.161] [tokens/s: 201017.630] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000]
|
| 167 |
+
[2025-10-26 23:01:40][utils:57][INFO] [P: 62.00%] [S: 1300234240/2097152000] [T: 1:45:24] [ETA: 1:04:36] [loss: 5.136] [tokens/s: 222982.480] [batches/s: 0.106] [MFU: 0.000] [TFLOPS: 0.000]
|
| 168 |
+
[2025-10-26 23:01:40][train:194][INFO] Running validation...
|
| 169 |
+
[2025-10-26 23:03:20][logger:171][INFO] [step: 1300234240] [val/train_token_count: 1300234240] [val/train_batch_count: 620] [val/train_flop_count: 0] [val/train_total_time: 6324.271] [val/train_update_time: 3306.647] [val/loss: 5.144] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 100.145] [val/val_tokens_per_second: 409008.036] [val/loss_avg_len_2048: 5.144] [val/perplexity_len_2048: 171.381] [val/loss_avg_len_1024: 5.158] [val/perplexity_len_1024: 173.834] [val/loss_avg_len_512: 5.186] [val/perplexity_len_512: 178.815]
|
| 170 |
+
[2025-10-26 23:04:14][utils:57][INFO] [P: 63.00%] [S: 1321205760/2097152000] [T: 1:47:57] [ETA: 1:03:24] [loss: 5.146] [tokens/s: 201329.619] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000]
|
| 171 |
+
[2025-10-26 23:05:07][utils:57][INFO] [P: 64.00%] [S: 1342177280/2097152000] [T: 1:48:51] [ETA: 1:01:13] [loss: 5.153] [tokens/s: 223342.819] [batches/s: 0.106] [MFU: 0.000] [TFLOPS: 0.000]
|
| 172 |
+
[2025-10-26 23:05:07][train:194][INFO] Running validation...
|
| 173 |
+
[2025-10-26 23:06:47][logger:171][INFO] [step: 1342177280] [val/train_token_count: 1342177280] [val/train_batch_count: 640] [val/train_flop_count: 0] [val/train_total_time: 6531.186] [val/train_update_time: 3413.191] [val/loss: 5.131] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 99.948] [val/val_tokens_per_second: 409812.777] [val/loss_avg_len_2048: 5.131] [val/perplexity_len_2048: 169.188] [val/loss_avg_len_1024: 5.145] [val/perplexity_len_1024: 171.645] [val/loss_avg_len_512: 5.174] [val/perplexity_len_512: 176.611]
|
| 174 |
+
[2025-10-26 23:07:40][utils:57][INFO] [P: 65.00%] [S: 1363148800/2097152000] [T: 1:51:24] [ETA: 0:59:59] [loss: 5.122] [tokens/s: 201658.399] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000]
|
| 175 |
+
[2025-10-26 23:07:40][logger:171][INFO] [step: 1363148800] [train_eval/train_token_count: 1363148800] [train_eval/train_batch_count: 650] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 6684.524] [train_eval/train_update_time: 3466.462] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.145] [train_eval/perplexity_len_2048: 171.577] [train_eval/loss_avg_len_1024: 5.160] [train_eval/perplexity_len_1024: 174.117] [train_eval/loss_avg_len_512: 5.185] [train_eval/perplexity_len_512: 178.545]
|
| 176 |
+
[2025-10-26 23:08:34][utils:57][INFO] [P: 66.00%] [S: 1384120320/2097152000] [T: 1:52:17] [ETA: 0:57:51] [loss: 5.144] [tokens/s: 223482.439] [batches/s: 0.107] [MFU: 0.000] [TFLOPS: 0.000]
|
| 177 |
+
[2025-10-26 23:08:34][train:194][INFO] Running validation...
|
| 178 |
+
[2025-10-26 23:10:13][logger:171][INFO] [step: 1384120320] [val/train_token_count: 1384120320] [val/train_batch_count: 660] [val/train_flop_count: 0] [val/train_total_time: 6737.915] [val/train_update_time: 3519.732] [val/loss: 5.120] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 99.735] [val/val_tokens_per_second: 410687.485] [val/loss_avg_len_2048: 5.120] [val/perplexity_len_2048: 167.401] [val/loss_avg_len_1024: 5.135] [val/perplexity_len_1024: 169.836] [val/loss_avg_len_512: 5.163] [val/perplexity_len_512: 174.755]
|
| 179 |
+
[2025-10-26 23:11:07][utils:57][INFO] [P: 67.00%] [S: 1405091840/2097152000] [T: 1:54:51] [ETA: 0:56:34] [loss: 5.096] [tokens/s: 201810.947] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000]
|
| 180 |
+
[2025-10-26 23:12:00][utils:57][INFO] [P: 68.00%] [S: 1426063360/2097152000] [T: 1:55:44] [ETA: 0:54:27] [loss: 5.104] [tokens/s: 223547.884] [batches/s: 0.107] [MFU: 0.000] [TFLOPS: 0.000]
|
| 181 |
+
[2025-10-26 23:12:00][train:194][INFO] Running validation...
|
| 182 |
+
[2025-10-26 23:13:40][logger:171][INFO] [step: 1426063360] [val/train_token_count: 1426063360] [val/train_batch_count: 680] [val/train_flop_count: 0] [val/train_total_time: 6944.454] [val/train_update_time: 3626.288] [val/loss: 5.111] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 99.924] [val/val_tokens_per_second: 409912.887] [val/loss_avg_len_2048: 5.111] [val/perplexity_len_2048: 165.787] [val/loss_avg_len_1024: 5.125] [val/perplexity_len_1024: 168.201] [val/loss_avg_len_512: 5.154] [val/perplexity_len_512: 173.115]
|
| 183 |
+
[2025-10-26 23:14:34][utils:57][INFO] [P: 69.00%] [S: 1447034880/2097152000] [T: 1:58:17] [ETA: 0:53:08] [loss: 5.114] [tokens/s: 201831.092] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000]
|
| 184 |
+
[2025-10-26 23:15:27][utils:57][INFO] [P: 70.00%] [S: 1468006400/2097152000] [T: 1:59:11] [ETA: 0:51:04] [loss: 5.101] [tokens/s: 223651.229] [batches/s: 0.107] [MFU: 0.000] [TFLOPS: 0.000]
|
| 185 |
+
[2025-10-26 23:15:27][logger:171][INFO] [step: 1468006400] [train_eval/train_token_count: 1468006400] [train_eval/train_batch_count: 700] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 7151.144] [train_eval/train_update_time: 3732.833] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.112] [train_eval/perplexity_len_2048: 166.072] [train_eval/loss_avg_len_1024: 5.126] [train_eval/perplexity_len_1024: 168.424] [train_eval/loss_avg_len_512: 5.155] [train_eval/perplexity_len_512: 173.216]
|
| 186 |
+
[2025-10-26 23:15:27][train:194][INFO] Running validation...
|
| 187 |
+
[2025-10-26 23:17:07][logger:171][INFO] [step: 1468006400] [val/train_token_count: 1468006400] [val/train_batch_count: 700] [val/train_flop_count: 0] [val/train_total_time: 7151.144] [val/train_update_time: 3732.833] [val/loss: 5.101] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 99.963] [val/val_tokens_per_second: 409749.591] [val/loss_avg_len_2048: 5.101] [val/perplexity_len_2048: 164.254] [val/loss_avg_len_1024: 5.116] [val/perplexity_len_1024: 166.669] [val/loss_avg_len_512: 5.145] [val/perplexity_len_512: 171.560]
|
| 188 |
+
[2025-10-26 23:17:07][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000001468006400.pt...
|
| 189 |
+
[2025-10-26 23:17:07][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000001468006400.pt.
|
| 190 |
+
[2025-10-26 23:17:07][logger:171][INFO] [step: 1468006400] [checkpoint/checkpoint_time: 0.452]
|
| 191 |
+
[2025-10-26 23:18:01][utils:57][INFO] [P: 71.00%] [S: 1488977920/2097152000] [T: 2:01:44] [ETA: 0:49:43] [loss: 5.092] [tokens/s: 201822.272] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000]
|
| 192 |
+
[2025-10-26 23:18:54][utils:57][INFO] [P: 72.00%] [S: 1509949440/2097152000] [T: 2:02:38] [ETA: 0:47:41] [loss: 5.101] [tokens/s: 223589.767] [batches/s: 0.107] [MFU: 0.000] [TFLOPS: 0.000]
|
| 193 |
+
[2025-10-26 23:18:54][train:194][INFO] Running validation...
|
| 194 |
+
[2025-10-26 23:20:34][logger:171][INFO] [step: 1509949440] [val/train_token_count: 1509949440] [val/train_batch_count: 720] [val/train_flop_count: 0] [val/train_total_time: 7358.337] [val/train_update_time: 3839.386] [val/loss: 5.093] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 99.891] [val/val_tokens_per_second: 410045.618] [val/loss_avg_len_2048: 5.093] [val/perplexity_len_2048: 162.938] [val/loss_avg_len_1024: 5.108] [val/perplexity_len_1024: 165.350] [val/loss_avg_len_512: 5.137] [val/perplexity_len_512: 170.232]
|
| 195 |
+
[2025-10-26 23:21:27][utils:57][INFO] [P: 73.00%] [S: 1530920960/2097152000] [T: 2:05:11] [ETA: 0:46:18] [loss: 5.109] [tokens/s: 201870.189] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000]
|
| 196 |
+
[2025-10-26 23:22:21][utils:57][INFO] [P: 74.00%] [S: 1551892480/2097152000] [T: 2:06:05] [ETA: 0:44:17] [loss: 5.105] [tokens/s: 223595.012] [batches/s: 0.107] [MFU: 0.000] [TFLOPS: 0.000]
|
| 197 |
+
[2025-10-26 23:22:21][train:194][INFO] Running validation...
|
| 198 |
+
[2025-10-26 23:24:01][logger:171][INFO] [step: 1551892480] [val/train_token_count: 1551892480] [val/train_batch_count: 740] [val/train_flop_count: 0] [val/train_total_time: 7565.031] [val/train_update_time: 3945.941] [val/loss: 5.086] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 99.777] [val/val_tokens_per_second: 410516.517] [val/loss_avg_len_2048: 5.086] [val/perplexity_len_2048: 161.804] [val/loss_avg_len_1024: 5.101] [val/perplexity_len_1024: 164.202] [val/loss_avg_len_512: 5.130] [val/perplexity_len_512: 169.052]
|
| 199 |
+
[2025-10-26 23:24:54][utils:57][INFO] [P: 75.00%] [S: 1572864000/2097152000] [T: 2:08:38] [ETA: 0:42:52] [loss: 5.097] [tokens/s: 201891.999] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000]
|
| 200 |
+
[2025-10-26 23:24:54][logger:171][INFO] [step: 1572864000] [train_eval/train_token_count: 1572864000] [train_eval/train_batch_count: 750] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 7718.234] [train_eval/train_update_time: 3999.225] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.092] [train_eval/perplexity_len_2048: 162.718] [train_eval/loss_avg_len_1024: 5.107] [train_eval/perplexity_len_1024: 165.165] [train_eval/loss_avg_len_512: 5.134] [train_eval/perplexity_len_512: 169.733]
|
| 201 |
+
[2025-10-26 23:25:47][utils:57][INFO] [P: 76.00%] [S: 1593835520/2097152000] [T: 2:09:31] [ETA: 0:40:54] [loss: 5.053] [tokens/s: 223573.837] [batches/s: 0.107] [MFU: 0.000] [TFLOPS: 0.000]
|
| 202 |
+
[2025-10-26 23:25:47][train:194][INFO] Running validation...
|
| 203 |
+
[2025-10-26 23:27:27][logger:171][INFO] [step: 1593835520] [val/train_token_count: 1593835520] [val/train_batch_count: 760] [val/train_flop_count: 0] [val/train_total_time: 7771.637] [val/train_update_time: 4052.504] [val/loss: 5.080] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 99.857] [val/val_tokens_per_second: 410185.320] [val/loss_avg_len_2048: 5.080] [val/perplexity_len_2048: 160.844] [val/loss_avg_len_1024: 5.095] [val/perplexity_len_1024: 163.238] [val/loss_avg_len_512: 5.124] [val/perplexity_len_512: 168.074]
|
| 204 |
+
[2025-10-26 23:28:21][utils:57][INFO] [P: 77.00%] [S: 1614807040/2097152000] [T: 2:12:04] [ETA: 0:39:27] [loss: 5.113] [tokens/s: 201867.208] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000]
|
| 205 |
+
[2025-10-26 23:29:14][utils:57][INFO] [P: 78.00%] [S: 1635778560/2097152000] [T: 2:12:58] [ETA: 0:37:30] [loss: 5.037] [tokens/s: 223591.734] [batches/s: 0.107] [MFU: 0.000] [TFLOPS: 0.000]
|
| 206 |
+
[2025-10-26 23:29:14][train:194][INFO] Running validation...
|
| 207 |
+
[2025-10-26 23:30:56][logger:171][INFO] [step: 1635778560] [val/train_token_count: 1635778560] [val/train_batch_count: 780] [val/train_flop_count: 0] [val/train_total_time: 7978.290] [val/train_update_time: 4159.058] [val/loss: 5.075] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 101.504] [val/val_tokens_per_second: 403532.450] [val/loss_avg_len_2048: 5.075] [val/perplexity_len_2048: 159.990] [val/loss_avg_len_1024: 5.090] [val/perplexity_len_1024: 162.369] [val/loss_avg_len_512: 5.119] [val/perplexity_len_512: 167.195]
|
| 208 |
+
[2025-10-26 23:31:49][utils:57][INFO] [P: 79.00%] [S: 1656750080/2097152000] [T: 2:15:33] [ETA: 0:36:01] [loss: 5.071] [tokens/s: 201551.794] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000]
|
| 209 |
+
[2025-10-26 23:32:42][utils:57][INFO] [P: 80.00%] [S: 1677721600/2097152000] [T: 2:16:26] [ETA: 0:34:06] [loss: 5.039] [tokens/s: 223322.014] [batches/s: 0.106] [MFU: 0.000] [TFLOPS: 0.000]
|
| 210 |
+
[2025-10-26 23:32:42][logger:171][INFO] [step: 1677721600] [train_eval/train_token_count: 1677721600] [train_eval/train_batch_count: 800] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 8186.597] [train_eval/train_update_time: 4265.605] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.077] [train_eval/perplexity_len_2048: 160.213] [train_eval/loss_avg_len_1024: 5.092] [train_eval/perplexity_len_1024: 162.774] [train_eval/loss_avg_len_512: 5.120] [train_eval/perplexity_len_512: 167.277]
|
| 211 |
+
[2025-10-26 23:32:42][train:194][INFO] Running validation...
|
| 212 |
+
[2025-10-26 23:34:24][logger:171][INFO] [step: 1677721600] [val/train_token_count: 1677721600] [val/train_batch_count: 800] [val/train_flop_count: 0] [val/train_total_time: 8186.597] [val/train_update_time: 4265.605] [val/loss: 5.070] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 101.590] [val/val_tokens_per_second: 403191.092] [val/loss_avg_len_2048: 5.070] [val/perplexity_len_2048: 159.243] [val/loss_avg_len_1024: 5.085] [val/perplexity_len_1024: 161.622] [val/loss_avg_len_512: 5.115] [val/perplexity_len_512: 166.439]
|
| 213 |
+
[2025-10-26 23:34:24][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000001677721600.pt...
|
| 214 |
+
[2025-10-26 23:34:24][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000001677721600.pt.
|
| 215 |
+
[2025-10-26 23:34:24][logger:171][INFO] [step: 1677721600] [checkpoint/checkpoint_time: 0.434]
|
| 216 |
+
[2025-10-26 23:35:18][utils:57][INFO] [P: 81.00%] [S: 1698693120/2097152000] [T: 2:19:02] [ETA: 0:32:36] [loss: 5.020] [tokens/s: 201236.804] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000]
|
| 217 |
+
[2025-10-26 23:36:11][utils:57][INFO] [P: 82.00%] [S: 1719664640/2097152000] [T: 2:19:55] [ETA: 0:30:42] [loss: 5.034] [tokens/s: 222808.847] [batches/s: 0.106] [MFU: 0.000] [TFLOPS: 0.000]
|
| 218 |
+
[2025-10-26 23:36:11][train:194][INFO] Running validation...
|
| 219 |
+
[2025-10-26 23:37:53][logger:171][INFO] [step: 1719664640] [val/train_token_count: 1719664640] [val/train_batch_count: 820] [val/train_flop_count: 0] [val/train_total_time: 8395.407] [val/train_update_time: 4372.160] [val/loss: 5.066] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 101.431] [val/val_tokens_per_second: 403819.955] [val/loss_avg_len_2048: 5.066] [val/perplexity_len_2048: 158.613] [val/loss_avg_len_1024: 5.081] [val/perplexity_len_1024: 160.994] [val/loss_avg_len_512: 5.111] [val/perplexity_len_512: 165.808]
|
| 220 |
+
[2025-10-26 23:38:46][utils:57][INFO] [P: 83.00%] [S: 1740636160/2097152000] [T: 2:22:30] [ETA: 0:29:11] [loss: 5.062] [tokens/s: 200933.563] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000]
|
| 221 |
+
[2025-10-26 23:39:39][utils:57][INFO] [P: 84.00%] [S: 1761607680/2097152000] [T: 2:23:23] [ETA: 0:27:18] [loss: 5.039] [tokens/s: 222414.750] [batches/s: 0.106] [MFU: 0.000] [TFLOPS: 0.000]
|
| 222 |
+
[2025-10-26 23:39:39][train:194][INFO] Running validation...
|
| 223 |
+
[2025-10-26 23:41:19][logger:171][INFO] [step: 1761607680] [val/train_token_count: 1761607680] [val/train_batch_count: 840] [val/train_flop_count: 0] [val/train_total_time: 8603.637] [val/train_update_time: 4478.716] [val/loss: 5.063] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 99.801] [val/val_tokens_per_second: 410416.185] [val/loss_avg_len_2048: 5.063] [val/perplexity_len_2048: 158.094] [val/loss_avg_len_1024: 5.078] [val/perplexity_len_1024: 160.470] [val/loss_avg_len_512: 5.108] [val/perplexity_len_512: 165.272]
|
| 224 |
+
[2025-10-26 23:42:13][utils:57][INFO] [P: 85.00%] [S: 1782579200/2097152000] [T: 2:25:56] [ETA: 0:25:45] [loss: 5.099] [tokens/s: 200938.684] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000]
|
| 225 |
+
[2025-10-26 23:42:13][logger:171][INFO] [step: 1782579200] [train_eval/train_token_count: 1782579200] [train_eval/train_batch_count: 850] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 8756.834] [train_eval/train_update_time: 4531.993] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.061] [train_eval/perplexity_len_2048: 157.686] [train_eval/loss_avg_len_1024: 5.071] [train_eval/perplexity_len_1024: 159.400] [train_eval/loss_avg_len_512: 5.099] [train_eval/perplexity_len_512: 163.821]
|
| 226 |
+
[2025-10-26 23:43:06][utils:57][INFO] [P: 86.00%] [S: 1803550720/2097152000] [T: 2:26:50] [ETA: 0:23:54] [loss: 5.075] [tokens/s: 222436.658] [batches/s: 0.106] [MFU: 0.000] [TFLOPS: 0.000]
|
| 227 |
+
[2025-10-26 23:43:06][train:194][INFO] Running validation...
|
| 228 |
+
[2025-10-26 23:44:46][logger:171][INFO] [step: 1803550720] [val/train_token_count: 1803550720] [val/train_batch_count: 860] [val/train_flop_count: 0] [val/train_total_time: 8810.230] [val/train_update_time: 4585.267] [val/loss: 5.061] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 99.596] [val/val_tokens_per_second: 411261.912] [val/loss_avg_len_2048: 5.061] [val/perplexity_len_2048: 157.677] [val/loss_avg_len_1024: 5.075] [val/perplexity_len_1024: 160.050] [val/loss_avg_len_512: 5.105] [val/perplexity_len_512: 164.844]
|
| 229 |
+
[2025-10-26 23:45:39][utils:57][INFO] [P: 87.00%] [S: 1824522240/2097152000] [T: 2:29:23] [ETA: 0:22:19] [loss: 5.019] [tokens/s: 200990.678] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000]
|
| 230 |
+
[2025-10-26 23:46:32][utils:57][INFO] [P: 88.00%] [S: 1845493760/2097152000] [T: 2:30:16] [ETA: 0:20:29] [loss: 5.056] [tokens/s: 222896.705] [batches/s: 0.106] [MFU: 0.000] [TFLOPS: 0.000]
|
| 231 |
+
[2025-10-26 23:46:32][train:194][INFO] Running validation...
|
| 232 |
+
[2025-10-26 23:48:12][logger:171][INFO] [step: 1845493760] [val/train_token_count: 1845493760] [val/train_batch_count: 880] [val/train_flop_count: 0] [val/train_total_time: 9016.607] [val/train_update_time: 4691.817] [val/loss: 5.058] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 99.554] [val/val_tokens_per_second: 411433.791] [val/loss_avg_len_2048: 5.058] [val/perplexity_len_2048: 157.352] [val/loss_avg_len_1024: 5.073] [val/perplexity_len_1024: 159.725] [val/loss_avg_len_512: 5.103] [val/perplexity_len_512: 164.520]
|
| 233 |
+
[2025-10-26 23:49:05][utils:57][INFO] [P: 89.00%] [S: 1866465280/2097152000] [T: 2:32:49] [ETA: 0:18:53] [loss: 5.094] [tokens/s: 201373.463] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000]
|
| 234 |
+
[2025-10-26 23:49:59][utils:57][INFO] [P: 90.00%] [S: 1887436800/2097152000] [T: 2:33:42] [ETA: 0:17:04] [loss: 5.027] [tokens/s: 223490.653] [batches/s: 0.107] [MFU: 0.000] [TFLOPS: 0.000]
|
| 235 |
+
[2025-10-26 23:49:59][logger:171][INFO] [step: 1887436800] [train_eval/train_token_count: 1887436800] [train_eval/train_batch_count: 900] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 9222.956] [train_eval/train_update_time: 4798.368] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.061] [train_eval/perplexity_len_2048: 157.725] [train_eval/loss_avg_len_1024: 5.075] [train_eval/perplexity_len_1024: 160.002] [train_eval/loss_avg_len_512: 5.102] [train_eval/perplexity_len_512: 164.418]
|
| 236 |
+
[2025-10-26 23:49:59][train:194][INFO] Running validation...
|
| 237 |
+
[2025-10-26 23:51:39][logger:171][INFO] [step: 1887436800] [val/train_token_count: 1887436800] [val/train_batch_count: 900] [val/train_flop_count: 0] [val/train_total_time: 9222.956] [val/train_update_time: 4798.368] [val/loss: 5.057] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 100.031] [val/val_tokens_per_second: 409473.595] [val/loss_avg_len_2048: 5.057] [val/perplexity_len_2048: 157.117] [val/loss_avg_len_1024: 5.072] [val/perplexity_len_1024: 159.489] [val/loss_avg_len_512: 5.102] [val/perplexity_len_512: 164.282]
|
| 238 |
+
[2025-10-26 23:51:39][checkpoint:111][INFO] Saving checkpoint to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000001887436800.pt...
|
| 239 |
+
[2025-10-26 23:51:39][checkpoint:128][INFO] Checkpoint saved to /workspace/forgetting-transformer/alibi_2_4_256/checkpoints/step-000001887436800.pt.
|
| 240 |
+
[2025-10-26 23:51:39][logger:171][INFO] [step: 1887436800] [checkpoint/checkpoint_time: 0.442]
|
| 241 |
+
[2025-10-26 23:52:33][utils:57][INFO] [P: 91.00%] [S: 1908408320/2097152000] [T: 2:36:16] [ETA: 0:15:27] [loss: 5.049] [tokens/s: 201672.909] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000]
|
| 242 |
+
[2025-10-26 23:53:26][utils:57][INFO] [P: 92.00%] [S: 1929379840/2097152000] [T: 2:37:10] [ETA: 0:13:40] [loss: 5.089] [tokens/s: 223717.151] [batches/s: 0.107] [MFU: 0.000] [TFLOPS: 0.000]
|
| 243 |
+
[2025-10-26 23:53:26][train:194][INFO] Running validation...
|
| 244 |
+
[2025-10-26 23:55:06][logger:171][INFO] [step: 1929379840] [val/train_token_count: 1929379840] [val/train_batch_count: 920] [val/train_flop_count: 0] [val/train_total_time: 9430.235] [val/train_update_time: 4904.913] [val/loss: 5.056] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 100.070] [val/val_tokens_per_second: 409313.027] [val/loss_avg_len_2048: 5.056] [val/perplexity_len_2048: 156.924] [val/loss_avg_len_1024: 5.071] [val/perplexity_len_1024: 159.296] [val/loss_avg_len_512: 5.100] [val/perplexity_len_512: 164.083]
|
| 245 |
+
[2025-10-26 23:55:59][utils:57][INFO] [P: 93.00%] [S: 1950351360/2097152000] [T: 2:39:43] [ETA: 0:12:01] [loss: 5.060] [tokens/s: 201942.074] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000]
|
| 246 |
+
[2025-10-26 23:56:53][utils:57][INFO] [P: 94.00%] [S: 1971322880/2097152000] [T: 2:40:37] [ETA: 0:10:15] [loss: 5.021] [tokens/s: 223658.926] [batches/s: 0.107] [MFU: 0.000] [TFLOPS: 0.000]
|
| 247 |
+
[2025-10-26 23:56:53][train:194][INFO] Running validation...
|
| 248 |
+
[2025-10-26 23:58:33][logger:171][INFO] [step: 1971322880] [val/train_token_count: 1971322880] [val/train_batch_count: 940] [val/train_flop_count: 0] [val/train_total_time: 9637.073] [val/train_update_time: 5011.441] [val/loss: 5.055] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 100.583] [val/val_tokens_per_second: 407224.794] [val/loss_avg_len_2048: 5.055] [val/perplexity_len_2048: 156.817] [val/loss_avg_len_1024: 5.070] [val/perplexity_len_1024: 159.188] [val/loss_avg_len_512: 5.100] [val/perplexity_len_512: 163.978]
|
| 249 |
+
[2025-10-26 23:59:27][utils:57][INFO] [P: 95.00%] [S: 1992294400/2097152000] [T: 2:43:11] [ETA: 0:08:35] [loss: 5.055] [tokens/s: 201792.021] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000]
|
| 250 |
+
[2025-10-26 23:59:27][logger:171][INFO] [step: 1992294400] [train_eval/train_token_count: 1992294400] [train_eval/train_batch_count: 950] [train_eval/train_flop_count: 0] [train_eval/train_total_time: 9791.052] [train_eval/train_update_time: 5064.716] [train_eval/window_seq_count: 51200] [train_eval/window_token_count: 104857600] [train_eval/loss_avg_len_2048: 5.052] [train_eval/perplexity_len_2048: 156.309] [train_eval/loss_avg_len_1024: 5.068] [train_eval/perplexity_len_1024: 158.884] [train_eval/loss_avg_len_512: 5.095] [train_eval/perplexity_len_512: 163.199]
|
| 251 |
+
[2025-10-27 00:00:20][utils:57][INFO] [P: 96.00%] [S: 2013265920/2097152000] [T: 2:44:04] [ETA: 0:06:50] [loss: 5.061] [tokens/s: 223422.133] [batches/s: 0.107] [MFU: 0.000] [TFLOPS: 0.000]
|
| 252 |
+
[2025-10-27 00:00:20][train:194][INFO] Running validation...
|
| 253 |
+
[2025-10-27 00:02:00][logger:171][INFO] [step: 2013265920] [val/train_token_count: 2013265920] [val/train_batch_count: 960] [val/train_flop_count: 0] [val/train_total_time: 9844.444] [val/train_update_time: 5117.988] [val/loss: 5.055] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 99.617] [val/val_tokens_per_second: 411173.096] [val/loss_avg_len_2048: 5.055] [val/perplexity_len_2048: 156.755] [val/loss_avg_len_1024: 5.070] [val/perplexity_len_1024: 159.125] [val/loss_avg_len_512: 5.099] [val/perplexity_len_512: 163.911]
|
| 254 |
+
[2025-10-27 00:02:53][utils:57][INFO] [P: 97.00%] [S: 2034237440/2097152000] [T: 2:46:37] [ETA: 0:05:09] [loss: 5.073] [tokens/s: 201787.348] [batches/s: 0.096] [MFU: 0.000] [TFLOPS: 0.000]
|
| 255 |
+
[2025-10-27 00:03:47][utils:57][INFO] [P: 98.00%] [S: 2055208960/2097152000] [T: 2:47:30] [ETA: 0:03:25] [loss: 5.058] [tokens/s: 223406.851] [batches/s: 0.107] [MFU: 0.000] [TFLOPS: 0.000]
|
| 256 |
+
[2025-10-27 00:03:47][train:194][INFO] Running validation...
|
| 257 |
+
[2025-10-27 00:05:26][logger:171][INFO] [step: 2055208960] [val/train_token_count: 2055208960] [val/train_batch_count: 980] [val/train_flop_count: 0] [val/train_total_time: 10050.842] [val/train_update_time: 5224.531] [val/loss: 5.055] [val/val_token_count: 40960000] [val/val_seq_count: 20000] [val/val_time: 99.766] [val/val_tokens_per_second: 410561.508] [val/loss_avg_len_2048: 5.055] [val/perplexity_len_2048: 156.726] [val/loss_avg_len_1024: 5.070] [val/perplexity_len_1024: 159.097] [val/loss_avg_len_512: 5.099] [val/perplexity_len_512: 163.883]
|
| 258 |
+
[2025-10-27 00:05:26][train:854][INFO] Training finished with 2055208960 tokens!
|
metrics/jsonlines/checkpoint.jsonl
CHANGED
|
@@ -1,9 +1,9 @@
|
|
| 1 |
-
{"step": 209715200, "checkpoint/checkpoint_time": 0.
|
| 2 |
-
{"step": 419430400, "checkpoint/checkpoint_time": 0.
|
| 3 |
-
{"step": 629145600, "checkpoint/checkpoint_time": 0.
|
| 4 |
-
{"step": 838860800, "checkpoint/checkpoint_time": 0.
|
| 5 |
-
{"step": 1048576000, "checkpoint/checkpoint_time": 0.
|
| 6 |
-
{"step": 1258291200, "checkpoint/checkpoint_time": 0.
|
| 7 |
-
{"step": 1468006400, "checkpoint/checkpoint_time": 0.
|
| 8 |
-
{"step": 1677721600, "checkpoint/checkpoint_time": 0.
|
| 9 |
-
{"step": 1887436800, "checkpoint/checkpoint_time": 0.
|
|
|
|
| 1 |
+
{"step": 209715200, "checkpoint/checkpoint_time": 0.44336505798855796}
|
| 2 |
+
{"step": 419430400, "checkpoint/checkpoint_time": 0.43483636603923514}
|
| 3 |
+
{"step": 629145600, "checkpoint/checkpoint_time": 0.44907815201440826}
|
| 4 |
+
{"step": 838860800, "checkpoint/checkpoint_time": 0.45288487296784297}
|
| 5 |
+
{"step": 1048576000, "checkpoint/checkpoint_time": 0.442782363970764}
|
| 6 |
+
{"step": 1258291200, "checkpoint/checkpoint_time": 0.4494084370089695}
|
| 7 |
+
{"step": 1468006400, "checkpoint/checkpoint_time": 0.4516124309739098}
|
| 8 |
+
{"step": 1677721600, "checkpoint/checkpoint_time": 0.43384581699501723}
|
| 9 |
+
{"step": 1887436800, "checkpoint/checkpoint_time": 0.4421905800118111}
|
metrics/jsonlines/norm.jsonl
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
metrics/jsonlines/throughput.jsonl
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
metrics/jsonlines/train.jsonl
CHANGED
|
@@ -1,98 +1,98 @@
|
|
| 1 |
-
{"step": 20971520, "train/token_count": 20971520, "train/batch_count": 10, "train/flop_count": 0, "train/total_time":
|
| 2 |
-
{"step": 41943040, "train/token_count": 41943040, "train/batch_count": 20, "train/flop_count": 0, "train/total_time":
|
| 3 |
-
{"step": 62914560, "train/token_count": 62914560, "train/batch_count": 30, "train/flop_count": 0, "train/total_time":
|
| 4 |
-
{"step": 83886080, "train/token_count": 83886080, "train/batch_count": 40, "train/flop_count": 0, "train/total_time":
|
| 5 |
-
{"step": 104857600, "train/token_count": 104857600, "train/batch_count": 50, "train/flop_count": 0, "train/total_time":
|
| 6 |
-
{"step": 125829120, "train/token_count": 125829120, "train/batch_count": 60, "train/flop_count": 0, "train/total_time":
|
| 7 |
-
{"step": 146800640, "train/token_count": 146800640, "train/batch_count": 70, "train/flop_count": 0, "train/total_time":
|
| 8 |
-
{"step": 167772160, "train/token_count": 167772160, "train/batch_count": 80, "train/flop_count": 0, "train/total_time":
|
| 9 |
-
{"step": 188743680, "train/token_count": 188743680, "train/batch_count": 90, "train/flop_count": 0, "train/total_time":
|
| 10 |
-
{"step": 209715200, "train/token_count": 209715200, "train/batch_count": 100, "train/flop_count": 0, "train/total_time":
|
| 11 |
-
{"step": 230686720, "train/token_count": 230686720, "train/batch_count": 110, "train/flop_count": 0, "train/total_time":
|
| 12 |
-
{"step": 251658240, "train/token_count": 251658240, "train/batch_count": 120, "train/flop_count": 0, "train/total_time":
|
| 13 |
-
{"step": 272629760, "train/token_count": 272629760, "train/batch_count": 130, "train/flop_count": 0, "train/total_time":
|
| 14 |
-
{"step": 293601280, "train/token_count": 293601280, "train/batch_count": 140, "train/flop_count": 0, "train/total_time":
|
| 15 |
-
{"step": 314572800, "train/token_count": 314572800, "train/batch_count": 150, "train/flop_count": 0, "train/total_time":
|
| 16 |
-
{"step": 335544320, "train/token_count": 335544320, "train/batch_count": 160, "train/flop_count": 0, "train/total_time":
|
| 17 |
-
{"step": 356515840, "train/token_count": 356515840, "train/batch_count": 170, "train/flop_count": 0, "train/total_time":
|
| 18 |
-
{"step": 377487360, "train/token_count": 377487360, "train/batch_count": 180, "train/flop_count": 0, "train/total_time":
|
| 19 |
-
{"step": 398458880, "train/token_count": 398458880, "train/batch_count": 190, "train/flop_count": 0, "train/total_time":
|
| 20 |
-
{"step": 419430400, "train/token_count": 419430400, "train/batch_count": 200, "train/flop_count": 0, "train/total_time":
|
| 21 |
-
{"step": 440401920, "train/token_count": 440401920, "train/batch_count": 210, "train/flop_count": 0, "train/total_time":
|
| 22 |
-
{"step": 461373440, "train/token_count": 461373440, "train/batch_count": 220, "train/flop_count": 0, "train/total_time":
|
| 23 |
-
{"step": 482344960, "train/token_count": 482344960, "train/batch_count": 230, "train/flop_count": 0, "train/total_time":
|
| 24 |
-
{"step": 503316480, "train/token_count": 503316480, "train/batch_count": 240, "train/flop_count": 0, "train/total_time":
|
| 25 |
-
{"step": 524288000, "train/token_count": 524288000, "train/batch_count": 250, "train/flop_count": 0, "train/total_time":
|
| 26 |
-
{"step": 545259520, "train/token_count": 545259520, "train/batch_count": 260, "train/flop_count": 0, "train/total_time":
|
| 27 |
-
{"step": 566231040, "train/token_count": 566231040, "train/batch_count": 270, "train/flop_count": 0, "train/total_time":
|
| 28 |
-
{"step": 587202560, "train/token_count": 587202560, "train/batch_count": 280, "train/flop_count": 0, "train/total_time":
|
| 29 |
-
{"step": 608174080, "train/token_count": 608174080, "train/batch_count": 290, "train/flop_count": 0, "train/total_time":
|
| 30 |
-
{"step": 629145600, "train/token_count": 629145600, "train/batch_count": 300, "train/flop_count": 0, "train/total_time":
|
| 31 |
-
{"step": 650117120, "train/token_count": 650117120, "train/batch_count": 310, "train/flop_count": 0, "train/total_time":
|
| 32 |
-
{"step": 671088640, "train/token_count": 671088640, "train/batch_count": 320, "train/flop_count": 0, "train/total_time":
|
| 33 |
-
{"step": 692060160, "train/token_count": 692060160, "train/batch_count": 330, "train/flop_count": 0, "train/total_time":
|
| 34 |
-
{"step": 713031680, "train/token_count": 713031680, "train/batch_count": 340, "train/flop_count": 0, "train/total_time":
|
| 35 |
-
{"step": 734003200, "train/token_count": 734003200, "train/batch_count": 350, "train/flop_count": 0, "train/total_time":
|
| 36 |
-
{"step": 754974720, "train/token_count": 754974720, "train/batch_count": 360, "train/flop_count": 0, "train/total_time":
|
| 37 |
-
{"step": 775946240, "train/token_count": 775946240, "train/batch_count": 370, "train/flop_count": 0, "train/total_time":
|
| 38 |
-
{"step": 796917760, "train/token_count": 796917760, "train/batch_count": 380, "train/flop_count": 0, "train/total_time":
|
| 39 |
-
{"step": 817889280, "train/token_count": 817889280, "train/batch_count": 390, "train/flop_count": 0, "train/total_time":
|
| 40 |
-
{"step": 838860800, "train/token_count": 838860800, "train/batch_count": 400, "train/flop_count": 0, "train/total_time":
|
| 41 |
-
{"step": 859832320, "train/token_count": 859832320, "train/batch_count": 410, "train/flop_count": 0, "train/total_time":
|
| 42 |
-
{"step": 880803840, "train/token_count": 880803840, "train/batch_count": 420, "train/flop_count": 0, "train/total_time":
|
| 43 |
-
{"step": 901775360, "train/token_count": 901775360, "train/batch_count": 430, "train/flop_count": 0, "train/total_time":
|
| 44 |
-
{"step": 922746880, "train/token_count": 922746880, "train/batch_count": 440, "train/flop_count": 0, "train/total_time":
|
| 45 |
-
{"step": 943718400, "train/token_count": 943718400, "train/batch_count": 450, "train/flop_count": 0, "train/total_time":
|
| 46 |
-
{"step": 964689920, "train/token_count": 964689920, "train/batch_count": 460, "train/flop_count": 0, "train/total_time":
|
| 47 |
-
{"step": 985661440, "train/token_count": 985661440, "train/batch_count": 470, "train/flop_count": 0, "train/total_time":
|
| 48 |
-
{"step": 1006632960, "train/token_count": 1006632960, "train/batch_count": 480, "train/flop_count": 0, "train/total_time":
|
| 49 |
-
{"step": 1027604480, "train/token_count": 1027604480, "train/batch_count": 490, "train/flop_count": 0, "train/total_time":
|
| 50 |
-
{"step": 1048576000, "train/token_count": 1048576000, "train/batch_count": 500, "train/flop_count": 0, "train/total_time":
|
| 51 |
-
{"step": 1069547520, "train/token_count": 1069547520, "train/batch_count": 510, "train/flop_count": 0, "train/total_time":
|
| 52 |
-
{"step": 1090519040, "train/token_count": 1090519040, "train/batch_count": 520, "train/flop_count": 0, "train/total_time":
|
| 53 |
-
{"step": 1111490560, "train/token_count": 1111490560, "train/batch_count": 530, "train/flop_count": 0, "train/total_time":
|
| 54 |
-
{"step": 1132462080, "train/token_count": 1132462080, "train/batch_count": 540, "train/flop_count": 0, "train/total_time":
|
| 55 |
-
{"step": 1153433600, "train/token_count": 1153433600, "train/batch_count": 550, "train/flop_count": 0, "train/total_time":
|
| 56 |
-
{"step": 1174405120, "train/token_count": 1174405120, "train/batch_count": 560, "train/flop_count": 0, "train/total_time":
|
| 57 |
-
{"step": 1195376640, "train/token_count": 1195376640, "train/batch_count": 570, "train/flop_count": 0, "train/total_time":
|
| 58 |
-
{"step": 1216348160, "train/token_count": 1216348160, "train/batch_count": 580, "train/flop_count": 0, "train/total_time":
|
| 59 |
-
{"step": 1237319680, "train/token_count": 1237319680, "train/batch_count": 590, "train/flop_count": 0, "train/total_time":
|
| 60 |
-
{"step": 1258291200, "train/token_count": 1258291200, "train/batch_count": 600, "train/flop_count": 0, "train/total_time":
|
| 61 |
-
{"step": 1279262720, "train/token_count": 1279262720, "train/batch_count": 610, "train/flop_count": 0, "train/total_time":
|
| 62 |
-
{"step": 1300234240, "train/token_count": 1300234240, "train/batch_count": 620, "train/flop_count": 0, "train/total_time":
|
| 63 |
-
{"step": 1321205760, "train/token_count": 1321205760, "train/batch_count": 630, "train/flop_count": 0, "train/total_time":
|
| 64 |
-
{"step": 1342177280, "train/token_count": 1342177280, "train/batch_count": 640, "train/flop_count": 0, "train/total_time":
|
| 65 |
-
{"step": 1363148800, "train/token_count": 1363148800, "train/batch_count": 650, "train/flop_count": 0, "train/total_time":
|
| 66 |
-
{"step": 1384120320, "train/token_count": 1384120320, "train/batch_count": 660, "train/flop_count": 0, "train/total_time":
|
| 67 |
-
{"step": 1405091840, "train/token_count": 1405091840, "train/batch_count": 670, "train/flop_count": 0, "train/total_time":
|
| 68 |
-
{"step": 1426063360, "train/token_count": 1426063360, "train/batch_count": 680, "train/flop_count": 0, "train/total_time":
|
| 69 |
-
{"step": 1447034880, "train/token_count": 1447034880, "train/batch_count": 690, "train/flop_count": 0, "train/total_time":
|
| 70 |
-
{"step": 1468006400, "train/token_count": 1468006400, "train/batch_count": 700, "train/flop_count": 0, "train/total_time":
|
| 71 |
-
{"step": 1488977920, "train/token_count": 1488977920, "train/batch_count": 710, "train/flop_count": 0, "train/total_time":
|
| 72 |
-
{"step": 1509949440, "train/token_count": 1509949440, "train/batch_count": 720, "train/flop_count": 0, "train/total_time":
|
| 73 |
-
{"step": 1530920960, "train/token_count": 1530920960, "train/batch_count": 730, "train/flop_count": 0, "train/total_time":
|
| 74 |
-
{"step": 1551892480, "train/token_count": 1551892480, "train/batch_count": 740, "train/flop_count": 0, "train/total_time":
|
| 75 |
-
{"step": 1572864000, "train/token_count": 1572864000, "train/batch_count": 750, "train/flop_count": 0, "train/total_time":
|
| 76 |
-
{"step": 1593835520, "train/token_count": 1593835520, "train/batch_count": 760, "train/flop_count": 0, "train/total_time":
|
| 77 |
-
{"step": 1614807040, "train/token_count": 1614807040, "train/batch_count": 770, "train/flop_count": 0, "train/total_time":
|
| 78 |
-
{"step": 1635778560, "train/token_count": 1635778560, "train/batch_count": 780, "train/flop_count": 0, "train/total_time":
|
| 79 |
-
{"step": 1656750080, "train/token_count": 1656750080, "train/batch_count": 790, "train/flop_count": 0, "train/total_time":
|
| 80 |
-
{"step": 1677721600, "train/token_count": 1677721600, "train/batch_count": 800, "train/flop_count": 0, "train/total_time":
|
| 81 |
-
{"step": 1698693120, "train/token_count": 1698693120, "train/batch_count": 810, "train/flop_count": 0, "train/total_time":
|
| 82 |
-
{"step": 1719664640, "train/token_count": 1719664640, "train/batch_count": 820, "train/flop_count": 0, "train/total_time":
|
| 83 |
-
{"step": 1740636160, "train/token_count": 1740636160, "train/batch_count": 830, "train/flop_count": 0, "train/total_time":
|
| 84 |
-
{"step": 1761607680, "train/token_count": 1761607680, "train/batch_count": 840, "train/flop_count": 0, "train/total_time":
|
| 85 |
-
{"step": 1782579200, "train/token_count": 1782579200, "train/batch_count": 850, "train/flop_count": 0, "train/total_time":
|
| 86 |
-
{"step": 1803550720, "train/token_count": 1803550720, "train/batch_count": 860, "train/flop_count": 0, "train/total_time":
|
| 87 |
-
{"step": 1824522240, "train/token_count": 1824522240, "train/batch_count": 870, "train/flop_count": 0, "train/total_time":
|
| 88 |
-
{"step": 1845493760, "train/token_count": 1845493760, "train/batch_count": 880, "train/flop_count": 0, "train/total_time":
|
| 89 |
-
{"step": 1866465280, "train/token_count": 1866465280, "train/batch_count": 890, "train/flop_count": 0, "train/total_time":
|
| 90 |
-
{"step": 1887436800, "train/token_count": 1887436800, "train/batch_count": 900, "train/flop_count": 0, "train/total_time":
|
| 91 |
-
{"step": 1908408320, "train/token_count": 1908408320, "train/batch_count": 910, "train/flop_count": 0, "train/total_time":
|
| 92 |
-
{"step": 1929379840, "train/token_count": 1929379840, "train/batch_count": 920, "train/flop_count": 0, "train/total_time":
|
| 93 |
-
{"step": 1950351360, "train/token_count": 1950351360, "train/batch_count": 930, "train/flop_count": 0, "train/total_time":
|
| 94 |
-
{"step": 1971322880, "train/token_count": 1971322880, "train/batch_count": 940, "train/flop_count": 0, "train/total_time":
|
| 95 |
-
{"step": 1992294400, "train/token_count": 1992294400, "train/batch_count": 950, "train/flop_count": 0, "train/total_time":
|
| 96 |
-
{"step": 2013265920, "train/token_count": 2013265920, "train/batch_count": 960, "train/flop_count": 0, "train/total_time":
|
| 97 |
-
{"step": 2034237440, "train/token_count": 2034237440, "train/batch_count": 970, "train/flop_count": 0, "train/total_time":
|
| 98 |
-
{"step": 2055208960, "train/token_count": 2055208960, "train/batch_count": 980, "train/flop_count": 0, "train/total_time":
|
|
|
|
| 1 |
+
{"step": 20971520, "train/token_count": 20971520, "train/batch_count": 10, "train/flop_count": 0, "train/total_time": 57.12486812804127, "train/update_time": 56.932655182085, "train/lr": 0.0009000000000000001, "train/loss": 10.077424049377441, "train/global_grad_norm": 1.0569149255752563}
|
| 2 |
+
{"step": 41943040, "train/token_count": 41943040, "train/batch_count": 20, "train/flop_count": 0, "train/total_time": 110.51794190204237, "train/update_time": 110.19485108501976, "train/lr": 0.0009997960964140947, "train/loss": 8.169595718383789, "train/global_grad_norm": 0.6573789119720459}
|
| 3 |
+
{"step": 62914560, "train/token_count": 62914560, "train/batch_count": 30, "train/flop_count": 0, "train/total_time": 263.8700318510528, "train/update_time": 163.45965232816525, "train/lr": 0.0009990914580222257, "train/loss": 7.759955406188965, "train/global_grad_norm": 0.28603488206863403}
|
| 4 |
+
{"step": 83886080, "train/token_count": 83886080, "train/batch_count": 40, "train/flop_count": 0, "train/total_time": 317.24918574804906, "train/update_time": 216.73109497816768, "train/lr": 0.0009978842768382998, "train/loss": 7.5351409912109375, "train/global_grad_norm": 0.2373751848936081}
|
| 5 |
+
{"step": 104857600, "train/token_count": 104857600, "train/batch_count": 50, "train/flop_count": 0, "train/total_time": 470.2395834400086, "train/update_time": 270.0089025082416, "train/lr": 0.0009961757683914405, "train/loss": 7.356375694274902, "train/global_grad_norm": 0.25599583983421326}
|
| 6 |
+
{"step": 125829120, "train/token_count": 125829120, "train/batch_count": 60, "train/flop_count": 0, "train/total_time": 523.6280801940011, "train/update_time": 323.28231063415296, "train/lr": 0.00099396765300483, "train/loss": 7.169342041015625, "train/global_grad_norm": 0.21762162446975708}
|
| 7 |
+
{"step": 146800640, "train/token_count": 146800640, "train/batch_count": 70, "train/flop_count": 0, "train/total_time": 676.4629730410525, "train/update_time": 376.5575643811608, "train/lr": 0.0009912621540634887, "train/loss": 7.04250955581665, "train/global_grad_norm": 0.17491649091243744}
|
| 8 |
+
{"step": 167772160, "train/token_count": 167772160, "train/batch_count": 80, "train/flop_count": 0, "train/total_time": 729.8428020050051, "train/update_time": 429.83724463917315, "train/lr": 0.000988061995775515, "train/loss": 6.879690647125244, "train/global_grad_norm": 0.17261892557144165}
|
| 9 |
+
{"step": 188743680, "train/token_count": 188743680, "train/batch_count": 90, "train/flop_count": 0, "train/total_time": 884.0807184120058, "train/update_time": 483.11551754607353, "train/lr": 0.0009843704004290394, "train/loss": 6.732751369476318, "train/global_grad_norm": 0.32638832926750183}
|
| 10 |
+
{"step": 209715200, "train/token_count": 209715200, "train/batch_count": 100, "train/flop_count": 0, "train/total_time": 937.4650811910396, "train/update_time": 536.3882422860479, "train/lr": 0.0009801910851476522, "train/loss": 6.633055210113525, "train/global_grad_norm": 0.18298597633838654}
|
| 11 |
+
{"step": 230686720, "train/token_count": 230686720, "train/batch_count": 110, "train/flop_count": 0, "train/total_time": 1091.3179251340334, "train/update_time": 589.6708546730806, "train/lr": 0.0009755282581475768, "train/loss": 6.5601725578308105, "train/global_grad_norm": 0.7863500714302063}
|
| 12 |
+
{"step": 251658240, "train/token_count": 251658240, "train/batch_count": 120, "train/flop_count": 0, "train/total_time": 1144.6911778100184, "train/update_time": 642.9422557381331, "train/lr": 0.0009703866145003512, "train/loss": 6.427718162536621, "train/global_grad_norm": 0.2532098889350891}
|
| 13 |
+
{"step": 272629760, "train/token_count": 272629760, "train/batch_count": 130, "train/flop_count": 0, "train/total_time": 1298.4373718530405, "train/update_time": 696.2163101581973, "train/lr": 0.0009647713314052896, "train/loss": 6.381680488586426, "train/global_grad_norm": 0.1815725564956665}
|
| 14 |
+
{"step": 293601280, "train/token_count": 293601280, "train/batch_count": 140, "train/flop_count": 0, "train/total_time": 1351.811079526029, "train/update_time": 749.4968144011218, "train/lr": 0.0009586880629764817, "train/loss": 6.308362007141113, "train/global_grad_norm": 0.25452741980552673}
|
| 15 |
+
{"step": 314572800, "train/token_count": 314572800, "train/batch_count": 150, "train/flop_count": 0, "train/total_time": 1504.7933711430524, "train/update_time": 802.7697926640394, "train/lr": 0.0009521429345495787, "train/loss": 6.192831039428711, "train/global_grad_norm": 0.2731724679470062}
|
| 16 |
+
{"step": 335544320, "train/token_count": 335544320, "train/batch_count": 160, "train/flop_count": 0, "train/total_time": 1558.167400816048, "train/update_time": 856.0381595880608, "train/lr": 0.0009451425365140996, "train/loss": 6.160712242126465, "train/global_grad_norm": 0.25031647086143494}
|
| 17 |
+
{"step": 356515840, "train/token_count": 356515840, "train/batch_count": 170, "train/flop_count": 0, "train/total_time": 1711.111571622023, "train/update_time": 909.3183224739623, "train/lr": 0.000937693917677468, "train/loss": 6.076303005218506, "train/global_grad_norm": 0.22126874327659607}
|
| 18 |
+
{"step": 377487360, "train/token_count": 377487360, "train/batch_count": 180, "train/flop_count": 0, "train/total_time": 1764.4956908360473, "train/update_time": 962.5948072728934, "train/lr": 0.0009298045781674596, "train/loss": 6.05035400390625, "train/global_grad_norm": 0.20042574405670166}
|
| 19 |
+
{"step": 398458880, "train/token_count": 398458880, "train/batch_count": 190, "train/flop_count": 0, "train/total_time": 1917.3773277360015, "train/update_time": 1015.8676893726224, "train/lr": 0.0009214824618802108, "train/loss": 6.025995254516602, "train/global_grad_norm": 0.4825673997402191}
|
| 20 |
+
{"step": 419430400, "train/token_count": 419430400, "train/batch_count": 200, "train/flop_count": 0, "train/total_time": 1970.7506705410196, "train/update_time": 1069.132912081608, "train/lr": 0.000912735948481387, "train/loss": 5.939733505249023, "train/global_grad_norm": 0.201382115483284}
|
| 21 |
+
{"step": 440401920, "train/token_count": 440401920, "train/batch_count": 210, "train/flop_count": 0, "train/total_time": 2124.242168805038, "train/update_time": 1122.4119238386047, "train/lr": 0.0009035738449685707, "train/loss": 5.90223503112793, "train/global_grad_norm": 0.44680795073509216}
|
| 22 |
+
{"step": 461373440, "train/token_count": 461373440, "train/batch_count": 220, "train/flop_count": 0, "train/total_time": 2177.6055200890405, "train/update_time": 1175.6836349036312, "train/lr": 0.0008940053768033609, "train/loss": 5.879762172698975, "train/global_grad_norm": 0.24954979121685028}
|
| 23 |
+
{"step": 482344960, "train/token_count": 482344960, "train/batch_count": 230, "train/flop_count": 0, "train/total_time": 2330.6941529700416, "train/update_time": 1228.951100654609, "train/lr": 0.0008840401786221159, "train/loss": 5.81107234954834, "train/global_grad_norm": 0.2971765398979187}
|
| 24 |
+
{"step": 503316480, "train/token_count": 503316480, "train/batch_count": 240, "train/flop_count": 0, "train/total_time": 2384.0519924180117, "train/update_time": 1282.2238651026273, "train/lr": 0.0008736882845346905, "train/loss": 5.7556939125061035, "train/global_grad_norm": 0.2694259583950043}
|
| 25 |
+
{"step": 524288000, "train/token_count": 524288000, "train/batch_count": 250, "train/flop_count": 0, "train/total_time": 2537.098761650035, "train/update_time": 1335.4923671315191, "train/lr": 0.0008629601180209381, "train/loss": 5.76292610168457, "train/global_grad_norm": 0.3520471751689911}
|
| 26 |
+
{"step": 545259520, "train/token_count": 545259520, "train/batch_count": 260, "train/flop_count": 0, "train/total_time": 2590.4739345350536, "train/update_time": 1388.759745098534, "train/lr": 0.0008518664814351503, "train/loss": 5.717782974243164, "train/global_grad_norm": 0.4163813591003418}
|
| 27 |
+
{"step": 566231040, "train/token_count": 566231040, "train/batch_count": 270, "train/flop_count": 0, "train/total_time": 2743.4162192750373, "train/update_time": 1442.0279306704178, "train/lr": 0.0008404185451290017, "train/loss": 5.692546844482422, "train/global_grad_norm": 0.21434997022151947}
|
| 28 |
+
{"step": 587202560, "train/token_count": 587202560, "train/batch_count": 280, "train/flop_count": 0, "train/total_time": 2796.791148387012, "train/update_time": 1495.3029013883206, "train/lr": 0.0008286278362039527, "train/loss": 5.643004894256592, "train/global_grad_norm": 0.2754496932029724}
|
| 29 |
+
{"step": 608174080, "train/token_count": 608174080, "train/batch_count": 290, "train/flop_count": 0, "train/total_time": 2949.6854955510353, "train/update_time": 1548.5792963503627, "train/lr": 0.0008165062269044352, "train/loss": 5.610556125640869, "train/global_grad_norm": 0.2890426218509674}
|
| 30 |
+
{"step": 629145600, "train/token_count": 629145600, "train/batch_count": 300, "train/flop_count": 0, "train/total_time": 3003.058338998002, "train/update_time": 1601.8528411513544, "train/lr": 0.0008040659226635089, "train/loss": 5.58476448059082, "train/global_grad_norm": 0.380667507648468}
|
| 31 |
+
{"step": 650117120, "train/token_count": 650117120, "train/batch_count": 310, "train/flop_count": 0, "train/total_time": 3156.3314061540295, "train/update_time": 1655.1285377033055, "train/lr": 0.0007913194498130252, "train/loss": 5.5936055183410645, "train/global_grad_norm": 0.2591659426689148}
|
| 32 |
+
{"step": 671088640, "train/token_count": 671088640, "train/batch_count": 320, "train/flop_count": 0, "train/total_time": 3209.7035325120087, "train/update_time": 1708.4027871834696, "train/lr": 0.000778279642970672, "train/loss": 5.527544021606445, "train/global_grad_norm": 0.22509922087192535}
|
| 33 |
+
{"step": 692060160, "train/token_count": 692060160, "train/batch_count": 330, "train/flop_count": 0, "train/total_time": 3362.512193232018, "train/update_time": 1761.6806279715383, "train/lr": 0.0007649596321166025, "train/loss": 5.541203022003174, "train/global_grad_norm": 0.4492305815219879}
|
| 34 |
+
{"step": 713031680, "train/token_count": 713031680, "train/batch_count": 340, "train/flop_count": 0, "train/total_time": 3415.8845459170407, "train/update_time": 1814.9587236176012, "train/lr": 0.0007513728293726579, "train/loss": 5.501528263092041, "train/global_grad_norm": 0.3490087687969208}
|
| 35 |
+
{"step": 734003200, "train/token_count": 734003200, "train/batch_count": 350, "train/flop_count": 0, "train/total_time": 3570.6092982320115, "train/update_time": 1868.2295263125561, "train/lr": 0.0007375329154974975, "train/loss": 5.4834885597229, "train/global_grad_norm": 0.3601242005825043}
|
| 36 |
+
{"step": 754974720, "train/token_count": 754974720, "train/batch_count": 360, "train/flop_count": 0, "train/total_time": 3623.980893074011, "train/update_time": 1921.4990626386134, "train/lr": 0.0007234538261112341, "train/loss": 5.410107612609863, "train/global_grad_norm": 0.4656950533390045}
|
| 37 |
+
{"step": 775946240, "train/token_count": 775946240, "train/batch_count": 370, "train/flop_count": 0, "train/total_time": 3779.0529326410033, "train/update_time": 1974.7656041345908, "train/lr": 0.0007091497376634464, "train/loss": 5.43698263168335, "train/global_grad_norm": 0.4004105031490326}
|
| 38 |
+
{"step": 796917760, "train/token_count": 796917760, "train/batch_count": 380, "train/flop_count": 0, "train/total_time": 3832.4348147350247, "train/update_time": 2028.0436954226461, "train/lr": 0.0006946350531586958, "train/loss": 5.412634372711182, "train/global_grad_norm": 0.29114505648612976}
|
| 39 |
+
{"step": 817889280, "train/token_count": 817889280, "train/batch_count": 390, "train/flop_count": 0, "train/total_time": 3986.6730023160344, "train/update_time": 2081.339985151775, "train/lr": 0.0006799243876539214, "train/loss": 5.414259910583496, "train/global_grad_norm": 0.3105607330799103}
|
| 40 |
+
{"step": 838860800, "train/token_count": 838860800, "train/batch_count": 400, "train/flop_count": 0, "train/total_time": 4040.0474286440294, "train/update_time": 2134.611642122676, "train/lr": 0.0006650325535423166, "train/loss": 5.334737300872803, "train/global_grad_norm": 0.30427536368370056}
|
| 41 |
+
{"step": 859832320, "train/token_count": 859832320, "train/batch_count": 410, "train/flop_count": 0, "train/total_time": 4193.527929550037, "train/update_time": 2187.884143058851, "train/lr": 0.0006499745456385053, "train/loss": 5.344532489776611, "train/global_grad_norm": 0.33702728152275085}
|
| 42 |
+
{"step": 880803840, "train/token_count": 880803840, "train/batch_count": 420, "train/flop_count": 0, "train/total_time": 4246.9178847110015, "train/update_time": 2241.15703301772, "train/lr": 0.0006347655260800339, "train/loss": 5.3456807136535645, "train/global_grad_norm": 0.31774818897247314}
|
| 43 |
+
{"step": 901775360, "train/token_count": 901775360, "train/batch_count": 430, "train/flop_count": 0, "train/total_time": 4400.437362540048, "train/update_time": 2294.435530113755, "train/lr": 0.0006194208090603844, "train/loss": 5.333301544189453, "train/global_grad_norm": 0.3481753468513489}
|
| 44 |
+
{"step": 922746880, "train/token_count": 922746880, "train/batch_count": 440, "train/flop_count": 0, "train/total_time": 4453.811728182016, "train/update_time": 2347.708159423666, "train/lr": 0.0006039558454088796, "train/loss": 5.352600574493408, "train/global_grad_norm": 0.25959765911102295}
|
| 45 |
+
{"step": 943718400, "train/token_count": 943718400, "train/batch_count": 450, "train/flop_count": 0, "train/total_time": 4608.291631601052, "train/update_time": 2400.977511668694, "train/lr": 0.0005883862070330078, "train/loss": 5.296506881713867, "train/global_grad_norm": 0.3832894265651703}
|
| 46 |
+
{"step": 964689920, "train/token_count": 964689920, "train/batch_count": 460, "train/flop_count": 0, "train/total_time": 4661.668510478048, "train/update_time": 2454.2510100168292, "train/lr": 0.0005727275712388317, "train/loss": 5.275446891784668, "train/global_grad_norm": 0.42716965079307556}
|
| 47 |
+
{"step": 985661440, "train/token_count": 985661440, "train/batch_count": 470, "train/flop_count": 0, "train/total_time": 4816.396098136029, "train/update_time": 2507.5231073708273, "train/lr": 0.0005569957049452703, "train/loss": 5.291362285614014, "train/global_grad_norm": 0.382432758808136}
|
| 48 |
+
{"step": 1006632960, "train/token_count": 1006632960, "train/batch_count": 480, "train/flop_count": 0, "train/total_time": 4869.772359199007, "train/update_time": 2560.802637038869, "train/lr": 0.0005412064488081482, "train/loss": 5.263927936553955, "train/global_grad_norm": 0.2995292842388153}
|
| 49 |
+
{"step": 1027604480, "train/token_count": 1027604480, "train/batch_count": 490, "train/flop_count": 0, "train/total_time": 5024.381550224032, "train/update_time": 2614.0767399497563, "train/lr": 0.0005253757012699972, "train/loss": 5.269484043121338, "train/global_grad_norm": 0.32839062809944153}
|
| 50 |
+
{"step": 1048576000, "train/token_count": 1048576000, "train/batch_count": 500, "train/flop_count": 0, "train/total_time": 5077.749329403043, "train/update_time": 2667.341171991662, "train/lr": 0.0005095194025516734, "train/loss": 5.244964122772217, "train/global_grad_norm": 0.28380733728408813}
|
| 51 |
+
{"step": 1069547520, "train/token_count": 1069547520, "train/batch_count": 510, "train/flop_count": 0, "train/total_time": 5232.712933773, "train/update_time": 2720.6185927585466, "train/lr": 0.0004936535186019053, "train/loss": 5.242177963256836, "train/global_grad_norm": 0.3184642493724823}
|
| 52 |
+
{"step": 1090519040, "train/token_count": 1090519040, "train/batch_count": 520, "train/flop_count": 0, "train/total_time": 5286.093386778026, "train/update_time": 2773.886181908485, "train/lr": 0.00047779402502093696, "train/loss": 5.2353105545043945, "train/global_grad_norm": 0.39006081223487854}
|
| 53 |
+
{"step": 1111490560, "train/token_count": 1111490560, "train/batch_count": 530, "train/flop_count": 0, "train/total_time": 5441.220508124039, "train/update_time": 2827.1616227625636, "train/lr": 0.0004619568909744525, "train/loss": 5.193413734436035, "train/global_grad_norm": 0.33428674936294556}
|
| 54 |
+
{"step": 1132462080, "train/token_count": 1132462080, "train/batch_count": 540, "train/flop_count": 0, "train/total_time": 5494.618768353015, "train/update_time": 2880.4332658784697, "train/lr": 0.00044615806311398067, "train/loss": 5.208868503570557, "train/global_grad_norm": 0.331386536359787}
|
| 55 |
+
{"step": 1153433600, "train/token_count": 1153433600, "train/batch_count": 550, "train/flop_count": 0, "train/total_time": 5649.622147237009, "train/update_time": 2933.705731303431, "train/lr": 0.0004304134495199673, "train/loss": 5.155785083770752, "train/global_grad_norm": 0.31903383135795593}
|
| 56 |
+
{"step": 1174405120, "train/token_count": 1174405120, "train/batch_count": 560, "train/flop_count": 0, "train/total_time": 5703.016606429999, "train/update_time": 2986.9915024373913, "train/lr": 0.0004147389036836882, "train/loss": 5.194952011108398, "train/global_grad_norm": 0.2875197231769562}
|
| 57 |
+
{"step": 1195376640, "train/token_count": 1195376640, "train/batch_count": 570, "train/flop_count": 0, "train/total_time": 5856.936125319044, "train/update_time": 3040.2605143213877, "train/lr": 0.0003991502085441259, "train/loss": 5.166281223297119, "train/global_grad_norm": 0.2693222165107727}
|
| 58 |
+
{"step": 1216348160, "train/token_count": 1216348160, "train/batch_count": 580, "train/flop_count": 0, "train/total_time": 5910.318671693036, "train/update_time": 3093.5336661074543, "train/lr": 0.0003836630605958888, "train/loss": 5.196907043457031, "train/global_grad_norm": 0.28884732723236084}
|
| 59 |
+
{"step": 1237319680, "train/token_count": 1237319680, "train/batch_count": 590, "train/flop_count": 0, "train/total_time": 6063.751631618012, "train/update_time": 3146.8101018704474, "train/lr": 0.00036829305408417155, "train/loss": 5.1710686683654785, "train/global_grad_norm": 0.2542065382003784}
|
| 60 |
+
{"step": 1258291200, "train/token_count": 1258291200, "train/batch_count": 600, "train/flop_count": 0, "train/total_time": 6117.130997166038, "train/update_time": 3200.0810677845147, "train/lr": 0.000353055665302672, "train/loss": 5.1745829582214355, "train/global_grad_norm": 0.2485460489988327}
|
| 61 |
+
{"step": 1279262720, "train/token_count": 1279262720, "train/batch_count": 610, "train/flop_count": 0, "train/total_time": 6270.89233304502, "train/update_time": 3253.376109398436, "train/lr": 0.0003379662370102746, "train/loss": 5.1606316566467285, "train/global_grad_norm": 0.24228136241436005}
|
| 62 |
+
{"step": 1300234240, "train/token_count": 1300234240, "train/batch_count": 620, "train/flop_count": 0, "train/total_time": 6324.271360302053, "train/update_time": 3306.647373120475, "train/lr": 0.00032303996298219405, "train/loss": 5.135626316070557, "train/global_grad_norm": 0.297547847032547}
|
| 63 |
+
{"step": 1321205760, "train/token_count": 1321205760, "train/batch_count": 630, "train/flop_count": 0, "train/total_time": 6477.804279661039, "train/update_time": 3359.92066662648, "train/lr": 0.00030829187271113034, "train/loss": 5.145682334899902, "train/global_grad_norm": 0.2230217158794403}
|
| 64 |
+
{"step": 1342177280, "train/token_count": 1342177280, "train/batch_count": 640, "train/flop_count": 0, "train/total_time": 6531.185974933032, "train/update_time": 3413.19108713849, "train/lr": 0.0002937368162738445, "train/loss": 5.152654647827148, "train/global_grad_norm": 0.2998616695404053}
|
| 65 |
+
{"step": 1363148800, "train/token_count": 1363148800, "train/batch_count": 650, "train/flop_count": 0, "train/total_time": 6684.524030425004, "train/update_time": 3466.4617998044705, "train/lr": 0.0002793894493783894, "train/loss": 5.121702194213867, "train/global_grad_norm": 0.24779871106147766}
|
| 66 |
+
{"step": 1384120320, "train/token_count": 1384120320, "train/batch_count": 660, "train/flop_count": 0, "train/total_time": 6737.915407613036, "train/update_time": 3519.7321525084553, "train/lr": 0.00026526421860705474, "train/loss": 5.143693447113037, "train/global_grad_norm": 0.25348708033561707}
|
| 67 |
+
{"step": 1405091840, "train/token_count": 1405091840, "train/batch_count": 670, "train/flop_count": 0, "train/total_time": 6891.055202779011, "train/update_time": 3573.0133047814597, "train/lr": 0.0002513753468698824, "train/loss": 5.096343040466309, "train/global_grad_norm": 0.23598778247833252}
|
| 68 |
+
{"step": 1426063360, "train/token_count": 1426063360, "train/batch_count": 680, "train/flop_count": 0, "train/total_time": 6944.4543808570015, "train/update_time": 3626.2884129853337, "train/lr": 0.00023773681908340283, "train/loss": 5.104409217834473, "train/global_grad_norm": 0.25248244404792786}
|
| 69 |
+
{"step": 1447034880, "train/token_count": 1447034880, "train/batch_count": 690, "train/flop_count": 0, "train/total_time": 7097.769050646049, "train/update_time": 3679.5599112784257, "train/lr": 0.00022436236808900823, "train/loss": 5.114173889160156, "train/global_grad_norm": 0.1796308010816574}
|
| 70 |
+
{"step": 1468006400, "train/token_count": 1468006400, "train/batch_count": 700, "train/flop_count": 0, "train/total_time": 7151.144483595039, "train/update_time": 3732.833151328552, "train/lr": 0.00021126546082514682, "train/loss": 5.100642681121826, "train/global_grad_norm": 0.21071678400039673}
|
| 71 |
+
{"step": 1488977920, "train/token_count": 1488977920, "train/batch_count": 710, "train/flop_count": 0, "train/total_time": 7304.954160230001, "train/update_time": 3786.1120017025387, "train/lr": 0.00019845928476725522, "train/loss": 5.092477321624756, "train/global_grad_norm": 0.2642405927181244}
|
| 72 |
+
{"step": 1509949440, "train/token_count": 1509949440, "train/batch_count": 720, "train/flop_count": 0, "train/total_time": 7358.337214609026, "train/update_time": 3839.3859579174896, "train/lr": 0.0001859567346490913, "train/loss": 5.1006293296813965, "train/global_grad_norm": 0.2527276873588562}
|
| 73 |
+
{"step": 1530920960, "train/token_count": 1530920960, "train/batch_count": 730, "train/flop_count": 0, "train/total_time": 7511.621099121054, "train/update_time": 3892.6592194504337, "train/lr": 0.00017377039947882782, "train/loss": 5.108747959136963, "train/global_grad_norm": 0.18992801010608673}
|
| 74 |
+
{"step": 1551892480, "train/token_count": 1551892480, "train/batch_count": 740, "train/flop_count": 0, "train/total_time": 7565.03086849401, "train/update_time": 3945.9412919793394, "train/lr": 0.00016191254986299043, "train/loss": 5.105123043060303, "train/global_grad_norm": 0.19203054904937744}
|
| 75 |
+
{"step": 1572864000, "train/token_count": 1572864000, "train/batch_count": 750, "train/flop_count": 0, "train/total_time": 7718.234012908011, "train/update_time": 3999.225145033328, "train/lr": 0.00015039512565099468, "train/loss": 5.096941947937012, "train/global_grad_norm": 0.18585249781608582}
|
| 76 |
+
{"step": 1593835520, "train/token_count": 1593835520, "train/batch_count": 760, "train/flop_count": 0, "train/total_time": 7771.636799781001, "train/update_time": 4052.503600837372, "train/lr": 0.00013922972391273224, "train/loss": 5.052731990814209, "train/global_grad_norm": 0.20207689702510834}
|
| 77 |
+
{"step": 1614807040, "train/token_count": 1614807040, "train/batch_count": 770, "train/flop_count": 0, "train/total_time": 7924.894128009037, "train/update_time": 4105.784444000397, "train/lr": 0.00012842758726130281, "train/loss": 5.112724304199219, "train/global_grad_norm": 0.20703168213367462}
|
| 78 |
+
{"step": 1635778560, "train/token_count": 1635778560, "train/batch_count": 780, "train/flop_count": 0, "train/total_time": 7978.290488699044, "train/update_time": 4159.058140857378, "train/lr": 0.00011799959253265679, "train/loss": 5.0365166664123535, "train/global_grad_norm": 0.19214113056659698}
|
| 79 |
+
{"step": 1656750080, "train/token_count": 1656750080, "train/batch_count": 790, "train/flop_count": 0, "train/total_time": 8133.211715042999, "train/update_time": 4212.332782215439, "train/lr": 0.00010795623983354214, "train/loss": 5.071290016174316, "train/global_grad_norm": 0.18038234114646912}
|
| 80 |
+
{"step": 1677721600, "train/token_count": 1677721600, "train/batch_count": 800, "train/flop_count": 0, "train/total_time": 8186.5969784220215, "train/update_time": 4265.60545003548, "train/lr": 9.830764196878872e-05, "train/loss": 5.038881301879883, "train/global_grad_norm": 0.21081118285655975}
|
| 81 |
+
{"step": 1698693120, "train/token_count": 1698693120, "train/batch_count": 810, "train/flop_count": 0, "train/total_time": 8342.015340365004, "train/update_time": 4318.878572056361, "train/lr": 8.906351425856951e-05, "train/loss": 5.020392894744873, "train/global_grad_norm": 0.14397121965885162}
|
| 82 |
+
{"step": 1719664640, "train/token_count": 1719664640, "train/batch_count": 820, "train/flop_count": 0, "train/total_time": 8395.40680388402, "train/update_time": 4372.159924876352, "train/lr": 8.02331647558977e-05, "train/loss": 5.033623695373535, "train/global_grad_norm": 0.20169667899608612}
|
| 83 |
+
{"step": 1740636160, "train/token_count": 1740636160, "train/batch_count": 830, "train/flop_count": 0, "train/total_time": 8550.23874416505, "train/update_time": 4425.43352887634, "train/lr": 7.182548487420554e-05, "train/loss": 5.0619354248046875, "train/global_grad_norm": 0.1709950715303421}
|
| 84 |
+
{"step": 1761607680, "train/token_count": 1761607680, "train/batch_count": 840, "train/flop_count": 0, "train/total_time": 8603.637448858004, "train/update_time": 4478.715910169412, "train/lr": 6.384894043444556e-05, "train/loss": 5.039224147796631, "train/global_grad_norm": 0.14815856516361237}
|
| 85 |
+
{"step": 1782579200, "train/token_count": 1782579200, "train/batch_count": 850, "train/flop_count": 0, "train/total_time": 8756.833522661007, "train/update_time": 4531.9930339534185, "train/lr": 5.6311563140726166e-05, "train/loss": 5.098855495452881, "train/global_grad_norm": 0.14111174643039703}
|
| 86 |
+
{"step": 1803550720, "train/token_count": 1803550720, "train/batch_count": 860, "train/flop_count": 0, "train/total_time": 8810.230298971, "train/update_time": 4585.266904477379, "train/lr": 4.922094249306547e-05, "train/loss": 5.075422286987305, "train/global_grad_norm": 0.1590386927127838}
|
| 87 |
+
{"step": 1824522240, "train/token_count": 1824522240, "train/batch_count": 870, "train/flop_count": 0, "train/total_time": 8963.212115761009, "train/update_time": 4638.541050948435, "train/lr": 4.2584218145409916e-05, "train/loss": 5.018501281738281, "train/global_grad_norm": 0.14633402228355408}
|
| 88 |
+
{"step": 1845493760, "train/token_count": 1845493760, "train/batch_count": 880, "train/flop_count": 0, "train/total_time": 9016.607044072007, "train/update_time": 4691.8174923404, "train/lr": 3.6408072716606236e-05, "train/loss": 5.05587911605835, "train/global_grad_norm": 0.1401192992925644}
|
| 89 |
+
{"step": 1866465280, "train/token_count": 1866465280, "train/batch_count": 890, "train/flop_count": 0, "train/total_time": 9169.565102360037, "train/update_time": 4745.092234898475, "train/lr": 3.069872506157217e-05, "train/loss": 5.094274044036865, "train/global_grad_norm": 0.14165186882019043}
|
| 90 |
+
{"step": 1887436800, "train/token_count": 1887436800, "train/batch_count": 900, "train/flop_count": 0, "train/total_time": 9222.955616571999, "train/update_time": 4798.3678903255495, "train/lr": 2.5461924009435368e-05, "train/loss": 5.02672815322876, "train/global_grad_norm": 0.1492566168308258}
|
| 91 |
+
{"step": 1908408320, "train/token_count": 1908408320, "train/batch_count": 910, "train/flop_count": 0, "train/total_time": 9376.841395194002, "train/update_time": 4851.6424762834795, "train/lr": 2.0702942574950812e-05, "train/loss": 5.048681735992432, "train/global_grad_norm": 0.1326991319656372}
|
| 92 |
+
{"step": 1929379840, "train/token_count": 1929379840, "train/batch_count": 920, "train/flop_count": 0, "train/total_time": 9430.23481100105, "train/update_time": 4904.91280556639, "train/lr": 1.642657264902142e-05, "train/loss": 5.089313983917236, "train/global_grad_norm": 0.1482771933078766}
|
| 93 |
+
{"step": 1950351360, "train/token_count": 1950351360, "train/batch_count": 930, "train/flop_count": 0, "train/total_time": 9583.694742296997, "train/update_time": 4958.171936342376, "train/lr": 1.2637120173670358e-05, "train/loss": 5.060367584228516, "train/global_grad_norm": 0.13109560310840607}
|
| 94 |
+
{"step": 1971322880, "train/token_count": 1971322880, "train/batch_count": 940, "train/flop_count": 0, "train/total_time": 9637.072636537021, "train/update_time": 5011.441395019239, "train/lr": 9.338400806321978e-06, "train/loss": 5.0208845138549805, "train/global_grad_norm": 0.12541547417640686}
|
| 95 |
+
{"step": 1992294400, "train/token_count": 1992294400, "train/batch_count": 950, "train/flop_count": 0, "train/total_time": 9791.05162328505, "train/update_time": 5064.716082916246, "train/lr": 6.533736077758867e-06, "train/loss": 5.054846286773682, "train/global_grad_norm": 0.12279438227415085}
|
| 96 |
+
{"step": 2013265920, "train/token_count": 2013265920, "train/batch_count": 960, "train/flop_count": 0, "train/total_time": 9844.444105764036, "train/update_time": 5117.988080174255, "train/lr": 4.2259500476214406e-06, "train/loss": 5.061497211456299, "train/global_grad_norm": 0.11868078261613846}
|
| 97 |
+
{"step": 2034237440, "train/token_count": 2034237440, "train/batch_count": 970, "train/flop_count": 0, "train/total_time": 9997.457464576, "train/update_time": 5171.263283661159, "train/lr": 2.417366460819359e-06, "train/loss": 5.073247909545898, "train/global_grad_norm": 0.12584054470062256}
|
| 98 |
+
{"step": 2055208960, "train/token_count": 2055208960, "train/batch_count": 980, "train/flop_count": 0, "train/total_time": 10050.842236216005, "train/update_time": 5224.530718925176, "train/lr": 1.1098064077174619e-06, "train/loss": 5.0577826499938965, "train/global_grad_norm": 0.11671043187379837}
|
metrics/jsonlines/train_eval.jsonl
CHANGED
|
@@ -1,19 +1,19 @@
|
|
| 1 |
-
{"step": 104857600, "train_eval/train_token_count": 104857600, "train_eval/train_batch_count": 50, "train_eval/train_flop_count": 0, "train_eval/train_total_time":
|
| 2 |
-
{"step": 209715200, "train_eval/train_token_count": 209715200, "train_eval/train_batch_count": 100, "train_eval/train_flop_count": 0, "train_eval/train_total_time":
|
| 3 |
-
{"step": 314572800, "train_eval/train_token_count": 314572800, "train_eval/train_batch_count": 150, "train_eval/train_flop_count": 0, "train_eval/train_total_time":
|
| 4 |
-
{"step": 419430400, "train_eval/train_token_count": 419430400, "train_eval/train_batch_count": 200, "train_eval/train_flop_count": 0, "train_eval/train_total_time":
|
| 5 |
-
{"step": 524288000, "train_eval/train_token_count": 524288000, "train_eval/train_batch_count": 250, "train_eval/train_flop_count": 0, "train_eval/train_total_time":
|
| 6 |
-
{"step": 629145600, "train_eval/train_token_count": 629145600, "train_eval/train_batch_count": 300, "train_eval/train_flop_count": 0, "train_eval/train_total_time":
|
| 7 |
-
{"step": 734003200, "train_eval/train_token_count": 734003200, "train_eval/train_batch_count": 350, "train_eval/train_flop_count": 0, "train_eval/train_total_time":
|
| 8 |
-
{"step": 838860800, "train_eval/train_token_count": 838860800, "train_eval/train_batch_count": 400, "train_eval/train_flop_count": 0, "train_eval/train_total_time":
|
| 9 |
-
{"step": 943718400, "train_eval/train_token_count": 943718400, "train_eval/train_batch_count": 450, "train_eval/train_flop_count": 0, "train_eval/train_total_time":
|
| 10 |
-
{"step": 1048576000, "train_eval/train_token_count": 1048576000, "train_eval/train_batch_count": 500, "train_eval/train_flop_count": 0, "train_eval/train_total_time":
|
| 11 |
-
{"step": 1153433600, "train_eval/train_token_count": 1153433600, "train_eval/train_batch_count": 550, "train_eval/train_flop_count": 0, "train_eval/train_total_time":
|
| 12 |
-
{"step": 1258291200, "train_eval/train_token_count": 1258291200, "train_eval/train_batch_count": 600, "train_eval/train_flop_count": 0, "train_eval/train_total_time":
|
| 13 |
-
{"step": 1363148800, "train_eval/train_token_count": 1363148800, "train_eval/train_batch_count": 650, "train_eval/train_flop_count": 0, "train_eval/train_total_time":
|
| 14 |
-
{"step": 1468006400, "train_eval/train_token_count": 1468006400, "train_eval/train_batch_count": 700, "train_eval/train_flop_count": 0, "train_eval/train_total_time":
|
| 15 |
-
{"step": 1572864000, "train_eval/train_token_count": 1572864000, "train_eval/train_batch_count": 750, "train_eval/train_flop_count": 0, "train_eval/train_total_time":
|
| 16 |
-
{"step": 1677721600, "train_eval/train_token_count": 1677721600, "train_eval/train_batch_count": 800, "train_eval/train_flop_count": 0, "train_eval/train_total_time":
|
| 17 |
-
{"step": 1782579200, "train_eval/train_token_count": 1782579200, "train_eval/train_batch_count": 850, "train_eval/train_flop_count": 0, "train_eval/train_total_time":
|
| 18 |
-
{"step": 1887436800, "train_eval/train_token_count": 1887436800, "train_eval/train_batch_count": 900, "train_eval/train_flop_count": 0, "train_eval/train_total_time":
|
| 19 |
-
{"step": 1992294400, "train_eval/train_token_count": 1992294400, "train_eval/train_batch_count": 950, "train_eval/train_flop_count": 0, "train_eval/train_total_time":
|
|
|
|
| 1 |
+
{"step": 104857600, "train_eval/train_token_count": 104857600, "train_eval/train_batch_count": 50, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 470.2395834400086, "train_eval/train_update_time": 270.0089025082416, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 8.520986258850897, "train_eval/perplexity_len_2048": 5019.001351094721, "train_eval/loss_avg_len_1024": 8.522497836221008, "train_eval/perplexity_len_1024": 5026.593696720888, "train_eval/loss_avg_len_512": 8.524103445280343, "train_eval/perplexity_len_512": 5034.6709237971745}
|
| 2 |
+
{"step": 209715200, "train_eval/train_token_count": 209715200, "train_eval/train_batch_count": 100, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 937.4650811910396, "train_eval/train_update_time": 536.3882422860479, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 6.966422270688636, "train_eval/perplexity_len_2048": 1060.4220525980056, "train_eval/loss_avg_len_1024": 6.9722344631998565, "train_eval/perplexity_len_1024": 1066.6033758299184, "train_eval/loss_avg_len_512": 6.982165958659024, "train_eval/perplexity_len_512": 1077.249118985535}
|
| 3 |
+
{"step": 314572800, "train_eval/train_token_count": 314572800, "train_eval/train_batch_count": 150, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 1504.7933711430524, "train_eval/train_update_time": 802.7697926640394, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 6.408694662143243, "train_eval/perplexity_len_2048": 607.100692098556, "train_eval/loss_avg_len_1024": 6.417305716021801, "train_eval/perplexity_len_1024": 612.3510419477873, "train_eval/loss_avg_len_512": 6.434781106839073, "train_eval/perplexity_len_512": 623.1461654998286}
|
| 4 |
+
{"step": 419430400, "train_eval/train_token_count": 419430400, "train_eval/train_batch_count": 200, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 1970.7506705410196, "train_eval/train_update_time": 1069.132912081608, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 6.078357431389886, "train_eval/perplexity_len_2048": 436.31193351807616, "train_eval/loss_avg_len_1024": 6.088771626625675, "train_eval/perplexity_len_1024": 440.87951373197876, "train_eval/loss_avg_len_512": 6.107667324735084, "train_eval/perplexity_len_512": 449.28944546444586}
|
| 5 |
+
{"step": 524288000, "train_eval/train_token_count": 524288000, "train_eval/train_batch_count": 250, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 2537.098761650035, "train_eval/train_update_time": 1335.4923671315191, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.844426090481538, "train_eval/perplexity_len_2048": 345.3043114722714, "train_eval/loss_avg_len_1024": 5.8543462534093855, "train_eval/perplexity_len_1024": 348.746833459867, "train_eval/loss_avg_len_512": 5.874322384678671, "train_eval/perplexity_len_512": 355.78349462406976}
|
| 6 |
+
{"step": 629145600, "train_eval/train_token_count": 629145600, "train_eval/train_batch_count": 300, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 3003.058338998002, "train_eval/train_update_time": 1601.8528411513544, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.678305028288451, "train_eval/perplexity_len_2048": 292.4533094997349, "train_eval/loss_avg_len_1024": 5.688276190591132, "train_eval/perplexity_len_1024": 295.38399580743766, "train_eval/loss_avg_len_512": 5.709491532435932, "train_eval/perplexity_len_512": 301.71761565218657}
|
| 7 |
+
{"step": 734003200, "train_eval/train_token_count": 734003200, "train_eval/train_batch_count": 350, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 3570.6092982320115, "train_eval/train_update_time": 1868.2295263125561, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.536149771192977, "train_eval/perplexity_len_2048": 253.69931617483144, "train_eval/loss_avg_len_1024": 5.548100212041172, "train_eval/perplexity_len_1024": 256.74932301083805, "train_eval/loss_avg_len_512": 5.5710370129648075, "train_eval/perplexity_len_512": 262.7063879626648}
|
| 8 |
+
{"step": 838860800, "train_eval/train_token_count": 838860800, "train_eval/train_batch_count": 400, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 4040.0474286440294, "train_eval/train_update_time": 2134.611642122676, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.429012276458179, "train_eval/perplexity_len_2048": 227.92400829756792, "train_eval/loss_avg_len_1024": 5.44054473506887, "train_eval/perplexity_len_1024": 230.56774759580898, "train_eval/loss_avg_len_512": 5.463790042017644, "train_eval/perplexity_len_512": 235.9901441359267}
|
| 9 |
+
{"step": 943718400, "train_eval/train_token_count": 943718400, "train_eval/train_batch_count": 450, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 4608.291631601052, "train_eval/train_update_time": 2400.977511668694, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.341378729817461, "train_eval/perplexity_len_2048": 208.8003912516811, "train_eval/loss_avg_len_1024": 5.3547878493463585, "train_eval/perplexity_len_1024": 211.61907646489718, "train_eval/loss_avg_len_512": 5.3801865651574925, "train_eval/perplexity_len_512": 217.0627679970756}
|
| 10 |
+
{"step": 1048576000, "train_eval/train_token_count": 1048576000, "train_eval/train_batch_count": 500, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 5077.749329403043, "train_eval/train_update_time": 2667.341171991662, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.274126975246473, "train_eval/perplexity_len_2048": 195.21997022888468, "train_eval/loss_avg_len_1024": 5.286933790515396, "train_eval/perplexity_len_1024": 197.73619434175876, "train_eval/loss_avg_len_512": 5.313358115418087, "train_eval/perplexity_len_512": 203.0308860337738}
|
| 11 |
+
{"step": 1153433600, "train_eval/train_token_count": 1153433600, "train_eval/train_batch_count": 550, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 5649.622147237009, "train_eval/train_update_time": 2933.705731303431, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.216920477526092, "train_eval/perplexity_len_2048": 184.36555110302936, "train_eval/loss_avg_len_1024": 5.227359136996128, "train_eval/perplexity_len_1024": 186.30016010416225, "train_eval/loss_avg_len_512": 5.251418407165692, "train_eval/perplexity_len_512": 190.83676080814456}
|
| 12 |
+
{"step": 1258291200, "train_eval/train_token_count": 1258291200, "train_eval/train_batch_count": 600, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 6117.130997166038, "train_eval/train_update_time": 3200.0810677845147, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.171960423406071, "train_eval/perplexity_len_2048": 176.26004331317833, "train_eval/loss_avg_len_1024": 5.182277223829406, "train_eval/perplexity_len_1024": 178.0878955837198, "train_eval/loss_avg_len_512": 5.207336967919982, "train_eval/perplexity_len_512": 182.60712149488162}
|
| 13 |
+
{"step": 1363148800, "train_eval/train_token_count": 1363148800, "train_eval/train_batch_count": 650, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 6684.524030425004, "train_eval/train_update_time": 3466.4617998044705, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.14503268874938, "train_eval/perplexity_len_2048": 171.57709321999303, "train_eval/loss_avg_len_1024": 5.159728246359883, "train_eval/perplexity_len_1024": 174.11713221076468, "train_eval/loss_avg_len_512": 5.184838373988023, "train_eval/perplexity_len_512": 178.54459000759974}
|
| 14 |
+
{"step": 1468006400, "train_eval/train_token_count": 1468006400, "train_eval/train_batch_count": 700, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 7151.144483595039, "train_eval/train_update_time": 3732.833151328552, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.112421451262962, "train_eval/perplexity_len_2048": 166.07200365399407, "train_eval/loss_avg_len_1024": 5.126486331900814, "train_eval/perplexity_len_1024": 168.4242900992313, "train_eval/loss_avg_len_512": 5.1545387507501434, "train_eval/perplexity_len_512": 173.21589262959722}
|
| 15 |
+
{"step": 1572864000, "train_eval/train_token_count": 1572864000, "train_eval/train_batch_count": 750, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 7718.234012908011, "train_eval/train_update_time": 3999.225145033328, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.092018521135159, "train_eval/perplexity_len_2048": 162.71798046774887, "train_eval/loss_avg_len_1024": 5.106942968778494, "train_eval/perplexity_len_1024": 165.16466876932896, "train_eval/loss_avg_len_512": 5.13422550390591, "train_eval/perplexity_len_512": 169.73281155532834}
|
| 16 |
+
{"step": 1677721600, "train_eval/train_token_count": 1677721600, "train_eval/train_batch_count": 800, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 8186.5969784220215, "train_eval/train_update_time": 4265.60545003548, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.076504996041095, "train_eval/perplexity_len_2048": 160.21313075547536, "train_eval/loss_avg_len_1024": 5.0923651870700635, "train_eval/perplexity_len_1024": 162.77439902720477, "train_eval/loss_avg_len_512": 5.119650275185922, "train_eval/perplexity_len_512": 167.27685852206267}
|
| 17 |
+
{"step": 1782579200, "train_eval/train_token_count": 1782579200, "train_eval/train_batch_count": 850, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 8756.833522661007, "train_eval/train_update_time": 4531.9930339534185, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.060602921242007, "train_eval/perplexity_len_2048": 157.68555964213598, "train_eval/loss_avg_len_1024": 5.0714183489211795, "train_eval/perplexity_len_1024": 159.40025226107093, "train_eval/loss_avg_len_512": 5.098776889980873, "train_eval/perplexity_len_512": 163.82141309995419}
|
| 18 |
+
{"step": 1887436800, "train_eval/train_token_count": 1887436800, "train_eval/train_batch_count": 900, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 9222.955616571999, "train_eval/train_update_time": 4798.3678903255495, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.060850694276305, "train_eval/perplexity_len_2048": 157.7246347123879, "train_eval/loss_avg_len_1024": 5.075187498527157, "train_eval/perplexity_len_1024": 160.00218934191147, "train_eval/loss_avg_len_512": 5.102414626430763, "train_eval/perplexity_len_512": 164.4184374759526}
|
| 19 |
+
{"step": 1992294400, "train_eval/train_token_count": 1992294400, "train_eval/train_batch_count": 950, "train_eval/train_flop_count": 0, "train_eval/train_total_time": 9791.05162328505, "train_eval/train_update_time": 5064.716082916246, "train_eval/window_seq_count": 51200, "train_eval/window_token_count": 104857600, "train_eval/loss_avg_len_2048": 5.051834876534377, "train_eval/perplexity_len_2048": 156.3090092520558, "train_eval/loss_avg_len_1024": 5.0681735769458465, "train_eval/perplexity_len_1024": 158.88387300919587, "train_eval/loss_avg_len_512": 5.094969020260178, "train_eval/perplexity_len_512": 163.19878869005936}
|
metrics/jsonlines/val.jsonl
CHANGED
|
@@ -1,49 +1,49 @@
|
|
| 1 |
-
{"step": 41943040, "val/train_token_count": 41943040, "val/train_batch_count": 20, "val/train_flop_count": 0, "val/train_total_time":
|
| 2 |
-
{"step": 83886080, "val/train_token_count": 83886080, "val/train_batch_count": 40, "val/train_flop_count": 0, "val/train_total_time":
|
| 3 |
-
{"step": 125829120, "val/train_token_count": 125829120, "val/train_batch_count": 60, "val/train_flop_count": 0, "val/train_total_time":
|
| 4 |
-
{"step": 167772160, "val/train_token_count": 167772160, "val/train_batch_count": 80, "val/train_flop_count": 0, "val/train_total_time":
|
| 5 |
-
{"step": 209715200, "val/train_token_count": 209715200, "val/train_batch_count": 100, "val/train_flop_count": 0, "val/train_total_time":
|
| 6 |
-
{"step": 251658240, "val/train_token_count": 251658240, "val/train_batch_count": 120, "val/train_flop_count": 0, "val/train_total_time":
|
| 7 |
-
{"step": 293601280, "val/train_token_count": 293601280, "val/train_batch_count": 140, "val/train_flop_count": 0, "val/train_total_time":
|
| 8 |
-
{"step": 335544320, "val/train_token_count": 335544320, "val/train_batch_count": 160, "val/train_flop_count": 0, "val/train_total_time":
|
| 9 |
-
{"step": 377487360, "val/train_token_count": 377487360, "val/train_batch_count": 180, "val/train_flop_count": 0, "val/train_total_time":
|
| 10 |
-
{"step": 419430400, "val/train_token_count": 419430400, "val/train_batch_count": 200, "val/train_flop_count": 0, "val/train_total_time":
|
| 11 |
-
{"step": 461373440, "val/train_token_count": 461373440, "val/train_batch_count": 220, "val/train_flop_count": 0, "val/train_total_time":
|
| 12 |
-
{"step": 503316480, "val/train_token_count": 503316480, "val/train_batch_count": 240, "val/train_flop_count": 0, "val/train_total_time":
|
| 13 |
-
{"step": 545259520, "val/train_token_count": 545259520, "val/train_batch_count": 260, "val/train_flop_count": 0, "val/train_total_time":
|
| 14 |
-
{"step": 587202560, "val/train_token_count": 587202560, "val/train_batch_count": 280, "val/train_flop_count": 0, "val/train_total_time":
|
| 15 |
-
{"step": 629145600, "val/train_token_count": 629145600, "val/train_batch_count": 300, "val/train_flop_count": 0, "val/train_total_time":
|
| 16 |
-
{"step": 671088640, "val/train_token_count": 671088640, "val/train_batch_count": 320, "val/train_flop_count": 0, "val/train_total_time":
|
| 17 |
-
{"step": 713031680, "val/train_token_count": 713031680, "val/train_batch_count": 340, "val/train_flop_count": 0, "val/train_total_time":
|
| 18 |
-
{"step": 754974720, "val/train_token_count": 754974720, "val/train_batch_count": 360, "val/train_flop_count": 0, "val/train_total_time":
|
| 19 |
-
{"step": 796917760, "val/train_token_count": 796917760, "val/train_batch_count": 380, "val/train_flop_count": 0, "val/train_total_time":
|
| 20 |
-
{"step": 838860800, "val/train_token_count": 838860800, "val/train_batch_count": 400, "val/train_flop_count": 0, "val/train_total_time":
|
| 21 |
-
{"step": 880803840, "val/train_token_count": 880803840, "val/train_batch_count": 420, "val/train_flop_count": 0, "val/train_total_time":
|
| 22 |
-
{"step": 922746880, "val/train_token_count": 922746880, "val/train_batch_count": 440, "val/train_flop_count": 0, "val/train_total_time":
|
| 23 |
-
{"step": 964689920, "val/train_token_count": 964689920, "val/train_batch_count": 460, "val/train_flop_count": 0, "val/train_total_time":
|
| 24 |
-
{"step": 1006632960, "val/train_token_count": 1006632960, "val/train_batch_count": 480, "val/train_flop_count": 0, "val/train_total_time":
|
| 25 |
-
{"step": 1048576000, "val/train_token_count": 1048576000, "val/train_batch_count": 500, "val/train_flop_count": 0, "val/train_total_time":
|
| 26 |
-
{"step": 1090519040, "val/train_token_count": 1090519040, "val/train_batch_count": 520, "val/train_flop_count": 0, "val/train_total_time":
|
| 27 |
-
{"step": 1132462080, "val/train_token_count": 1132462080, "val/train_batch_count": 540, "val/train_flop_count": 0, "val/train_total_time":
|
| 28 |
-
{"step": 1174405120, "val/train_token_count": 1174405120, "val/train_batch_count": 560, "val/train_flop_count": 0, "val/train_total_time":
|
| 29 |
-
{"step": 1216348160, "val/train_token_count": 1216348160, "val/train_batch_count": 580, "val/train_flop_count": 0, "val/train_total_time":
|
| 30 |
-
{"step": 1258291200, "val/train_token_count": 1258291200, "val/train_batch_count": 600, "val/train_flop_count": 0, "val/train_total_time":
|
| 31 |
-
{"step": 1300234240, "val/train_token_count": 1300234240, "val/train_batch_count": 620, "val/train_flop_count": 0, "val/train_total_time":
|
| 32 |
-
{"step": 1342177280, "val/train_token_count": 1342177280, "val/train_batch_count": 640, "val/train_flop_count": 0, "val/train_total_time":
|
| 33 |
-
{"step": 1384120320, "val/train_token_count": 1384120320, "val/train_batch_count": 660, "val/train_flop_count": 0, "val/train_total_time":
|
| 34 |
-
{"step": 1426063360, "val/train_token_count": 1426063360, "val/train_batch_count": 680, "val/train_flop_count": 0, "val/train_total_time":
|
| 35 |
-
{"step": 1468006400, "val/train_token_count": 1468006400, "val/train_batch_count": 700, "val/train_flop_count": 0, "val/train_total_time":
|
| 36 |
-
{"step": 1509949440, "val/train_token_count": 1509949440, "val/train_batch_count": 720, "val/train_flop_count": 0, "val/train_total_time":
|
| 37 |
-
{"step": 1551892480, "val/train_token_count": 1551892480, "val/train_batch_count": 740, "val/train_flop_count": 0, "val/train_total_time":
|
| 38 |
-
{"step": 1593835520, "val/train_token_count": 1593835520, "val/train_batch_count": 760, "val/train_flop_count": 0, "val/train_total_time":
|
| 39 |
-
{"step": 1635778560, "val/train_token_count": 1635778560, "val/train_batch_count": 780, "val/train_flop_count": 0, "val/train_total_time":
|
| 40 |
-
{"step": 1677721600, "val/train_token_count": 1677721600, "val/train_batch_count": 800, "val/train_flop_count": 0, "val/train_total_time":
|
| 41 |
-
{"step": 1719664640, "val/train_token_count": 1719664640, "val/train_batch_count": 820, "val/train_flop_count": 0, "val/train_total_time":
|
| 42 |
-
{"step": 1761607680, "val/train_token_count": 1761607680, "val/train_batch_count": 840, "val/train_flop_count": 0, "val/train_total_time":
|
| 43 |
-
{"step": 1803550720, "val/train_token_count": 1803550720, "val/train_batch_count": 860, "val/train_flop_count": 0, "val/train_total_time":
|
| 44 |
-
{"step": 1845493760, "val/train_token_count": 1845493760, "val/train_batch_count": 880, "val/train_flop_count": 0, "val/train_total_time":
|
| 45 |
-
{"step": 1887436800, "val/train_token_count": 1887436800, "val/train_batch_count": 900, "val/train_flop_count": 0, "val/train_total_time":
|
| 46 |
-
{"step": 1929379840, "val/train_token_count": 1929379840, "val/train_batch_count": 920, "val/train_flop_count": 0, "val/train_total_time":
|
| 47 |
-
{"step": 1971322880, "val/train_token_count": 1971322880, "val/train_batch_count": 940, "val/train_flop_count": 0, "val/train_total_time":
|
| 48 |
-
{"step": 2013265920, "val/train_token_count": 2013265920, "val/train_batch_count": 960, "val/train_flop_count": 0, "val/train_total_time":
|
| 49 |
-
{"step": 2055208960, "val/train_token_count": 2055208960, "val/train_batch_count": 980, "val/train_flop_count": 0, "val/train_total_time":
|
|
|
|
| 1 |
+
{"step": 41943040, "val/train_token_count": 41943040, "val/train_batch_count": 20, "val/train_flop_count": 0, "val/train_total_time": 110.51794190204237, "val/train_update_time": 110.19485108501976, "val/loss": 8.072670181155205, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 99.9716204389697, "val/val_tokens_per_second": 409716.2756805078, "val/loss_avg_len_2048": 8.072670181155205, "val/perplexity_len_2048": 3205.6500777398833, "val/loss_avg_len_1024": 8.071338223218918, "val/perplexity_len_1024": 3201.383129006829, "val/loss_avg_len_512": 8.071987850761413, "val/perplexity_len_512": 3203.463511325176}
|
| 2 |
+
{"step": 83886080, "val/train_token_count": 83886080, "val/train_batch_count": 40, "val/train_flop_count": 0, "val/train_total_time": 317.24918574804906, "val/train_update_time": 216.73109497816768, "val/loss": 7.519811360669136, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 99.60113897896372, "val/val_tokens_per_second": 411240.27716842643, "val/loss_avg_len_2048": 7.519811360669136, "val/perplexity_len_2048": 1844.219368930416, "val/loss_avg_len_1024": 7.520807962584495, "val/perplexity_len_1024": 1846.058237643676, "val/loss_avg_len_512": 7.525793135547638, "val/perplexity_len_512": 1855.2841344973567}
|
| 3 |
+
{"step": 125829120, "val/train_token_count": 125829120, "val/train_batch_count": 60, "val/train_flop_count": 0, "val/train_total_time": 523.6280801940011, "val/train_update_time": 323.28231063415296, "val/loss": 7.164581921425462, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 99.44210483605275, "val/val_tokens_per_second": 411897.9587925008, "val/loss_avg_len_2048": 7.164581921425462, "val/perplexity_len_2048": 1292.8209870442263, "val/loss_avg_len_1024": 7.166963593649864, "val/perplexity_len_1024": 1295.9037324675646, "val/loss_avg_len_512": 7.175143612968922, "val/perplexity_len_512": 1306.5477247144242}
|
| 4 |
+
{"step": 167772160, "val/train_token_count": 167772160, "val/train_batch_count": 80, "val/train_flop_count": 0, "val/train_total_time": 729.8428020050051, "val/train_update_time": 429.83724463917315, "val/loss": 6.865911299175769, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 100.848096723028, "val/val_tokens_per_second": 406155.409283466, "val/loss_avg_len_2048": 6.865911299175769, "val/perplexity_len_2048": 959.0193952924727, "val/loss_avg_len_1024": 6.87047398582399, "val/perplexity_len_1024": 963.4050979677779, "val/loss_avg_len_512": 6.882724252340198, "val/perplexity_len_512": 975.2796519063501}
|
| 5 |
+
{"step": 209715200, "val/train_token_count": 209715200, "val/train_batch_count": 100, "val/train_flop_count": 0, "val/train_total_time": 937.4650811910396, "val/train_update_time": 536.3882422860479, "val/loss": 6.6218816578798005, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 100.0134965860052, "val/val_tokens_per_second": 409544.7254439007, "val/loss_avg_len_2048": 6.6218816578798005, "val/perplexity_len_2048": 751.3575656878406, "val/loss_avg_len_1024": 6.628069533909858, "val/perplexity_len_1024": 756.0212875438285, "val/loss_avg_len_512": 6.643743345025182, "val/perplexity_len_512": 767.9643747482285}
|
| 6 |
+
{"step": 251658240, "val/train_token_count": 251658240, "val/train_batch_count": 120, "val/train_flop_count": 0, "val/train_total_time": 1144.6911778100184, "val/train_update_time": 642.9422557381331, "val/loss": 6.436701403411478, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 100.36018703901209, "val/val_tokens_per_second": 408129.96875023755, "val/loss_avg_len_2048": 6.436701403411478, "val/perplexity_len_2048": 624.3439406192032, "val/loss_avg_len_1024": 6.444007082933933, "val/perplexity_len_1024": 628.9218995499804, "val/loss_avg_len_512": 6.4616794860377915, "val/perplexity_len_512": 640.1352524873843}
|
| 7 |
+
{"step": 293601280, "val/train_token_count": 293601280, "val/train_batch_count": 140, "val/train_flop_count": 0, "val/train_total_time": 1351.811079526029, "val/train_update_time": 749.4968144011218, "val/loss": 6.28673181735389, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 99.59608248295262, "val/val_tokens_per_second": 411261.15584928676, "val/loss_avg_len_2048": 6.28673181735389, "val/perplexity_len_2048": 537.3941537499013, "val/loss_avg_len_1024": 6.294847987662257, "val/perplexity_len_1024": 541.7734838793829, "val/loss_avg_len_512": 6.313898871052266, "val/perplexity_len_512": 552.1936892494012}
|
| 8 |
+
{"step": 335544320, "val/train_token_count": 335544320, "val/train_batch_count": 160, "val/train_flop_count": 0, "val/train_total_time": 1558.167400816048, "val/train_update_time": 856.0381595880608, "val/loss": 6.1621321680786085, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 99.55671566096134, "val/val_tokens_per_second": 411423.7771712816, "val/loss_avg_len_2048": 6.1621321680786085, "val/perplexity_len_2048": 474.43857996407974, "val/loss_avg_len_1024": 6.1707096502751115, "val/perplexity_len_1024": 478.52557142759065, "val/loss_avg_len_512": 6.190553567818553, "val/perplexity_len_512": 488.11623688017033}
|
| 9 |
+
{"step": 377487360, "val/train_token_count": 377487360, "val/train_batch_count": 180, "val/train_flop_count": 0, "val/train_total_time": 1764.4956908360473, "val/train_update_time": 962.5948072728934, "val/loss": 6.043532439414506, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 99.50245448201895, "val/val_tokens_per_second": 411648.13685477345, "val/loss_avg_len_2048": 6.043532439414506, "val/perplexity_len_2048": 421.37890441996467, "val/loss_avg_len_1024": 6.052711034156941, "val/perplexity_len_1024": 425.2643749180878, "val/loss_avg_len_512": 6.073728867790103, "val/perplexity_len_512": 434.2971024683484}
|
| 10 |
+
{"step": 419430400, "val/train_token_count": 419430400, "val/train_batch_count": 200, "val/train_flop_count": 0, "val/train_total_time": 1970.7506705410196, "val/train_update_time": 1069.132912081608, "val/loss": 5.946526206344739, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 99.66884558799211, "val/val_tokens_per_second": 410960.9152023205, "val/loss_avg_len_2048": 5.946526206344739, "val/perplexity_len_2048": 382.4225718901975, "val/loss_avg_len_1024": 5.955733354584128, "val/perplexity_len_1024": 385.95985234406163, "val/loss_avg_len_512": 5.977101679090039, "val/perplexity_len_512": 394.2959143659733}
|
| 11 |
+
{"step": 461373440, "val/train_token_count": 461373440, "val/train_batch_count": 220, "val/train_flop_count": 0, "val/train_total_time": 2177.6055200890405, "val/train_update_time": 1175.6836349036312, "val/loss": 5.859628180498444, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 99.70184189802967, "val/val_tokens_per_second": 410824.9077473609, "val/loss_avg_len_2048": 5.859628180498444, "val/perplexity_len_2048": 350.59376218425683, "val/loss_avg_len_1024": 5.869486711973604, "val/perplexity_len_1024": 354.06719516588015, "val/loss_avg_len_512": 5.89188057346195, "val/perplexity_len_512": 362.0855730249886}
|
| 12 |
+
{"step": 503316480, "val/train_token_count": 503316480, "val/train_batch_count": 240, "val/train_flop_count": 0, "val/train_total_time": 2384.0519924180117, "val/train_update_time": 1282.2238651026273, "val/loss": 5.783013271738776, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 99.6605639460031, "val/val_tokens_per_second": 410995.0654322251, "val/loss_avg_len_2048": 5.783013271738776, "val/perplexity_len_2048": 324.7362361487591, "val/loss_avg_len_1024": 5.79357122720005, "val/perplexity_len_1024": 328.1829499750509, "val/loss_avg_len_512": 5.816968626810331, "val/perplexity_len_512": 335.95211227350995}
|
| 13 |
+
{"step": 545259520, "val/train_token_count": 545259520, "val/train_batch_count": 260, "val/train_flop_count": 0, "val/train_total_time": 2590.4739345350536, "val/train_update_time": 1388.759745098534, "val/loss": 5.716440834625299, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 99.56407831504475, "val/val_tokens_per_second": 411393.3528354743, "val/loss_avg_len_2048": 5.716440834625299, "val/perplexity_len_2048": 303.8216448390781, "val/loss_avg_len_1024": 5.72740313000246, "val/perplexity_len_1024": 307.17054975726995, "val/loss_avg_len_512": 5.751391485624389, "val/perplexity_len_512": 314.62815638697555}
|
| 14 |
+
{"step": 587202560, "val/train_token_count": 587202560, "val/train_batch_count": 280, "val/train_flop_count": 0, "val/train_total_time": 2796.791148387012, "val/train_update_time": 1495.3029013883206, "val/loss": 5.649823216465559, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 99.50496849097544, "val/val_tokens_per_second": 411637.73649870406, "val/loss_avg_len_2048": 5.649823216465559, "val/perplexity_len_2048": 284.24121221591713, "val/loss_avg_len_1024": 5.6613804133704875, "val/perplexity_len_1024": 287.5453000942534, "val/loss_avg_len_512": 5.686038017921988, "val/perplexity_len_512": 294.7236147202462}
|
| 15 |
+
{"step": 629145600, "val/train_token_count": 629145600, "val/train_batch_count": 300, "val/train_flop_count": 0, "val/train_total_time": 3003.058338998002, "val/train_update_time": 1601.8528411513544, "val/loss": 5.595519945517543, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 99.44042880897177, "val/val_tokens_per_second": 411904.9011613321, "val/loss_avg_len_2048": 5.595519945517543, "val/perplexity_len_2048": 269.21759218828305, "val/loss_avg_len_1024": 5.607140162670961, "val/perplexity_len_1024": 272.36420583089864, "val/loss_avg_len_512": 5.632021551703382, "val/perplexity_len_512": 279.2260172957314}
|
| 16 |
+
{"step": 671088640, "val/train_token_count": 671088640, "val/train_batch_count": 320, "val/train_flop_count": 0, "val/train_total_time": 3209.7035325120087, "val/train_update_time": 1708.4027871834696, "val/loss": 5.542625576345867, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 99.4260749219684, "val/val_tokens_per_second": 411964.3668136979, "val/loss_avg_len_2048": 5.542625576345867, "val/perplexity_len_2048": 255.34755458916246, "val/loss_avg_len_1024": 5.554492764933896, "val/perplexity_len_1024": 258.39586383088005, "val/loss_avg_len_512": 5.579607635878865, "val/perplexity_len_512": 264.9676215997508}
|
| 17 |
+
{"step": 713031680, "val/train_token_count": 713031680, "val/train_batch_count": 340, "val/train_flop_count": 0, "val/train_total_time": 3415.8845459170407, "val/train_update_time": 1814.9587236176012, "val/loss": 5.495897741303175, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 101.34644473099615, "val/val_tokens_per_second": 404158.23276998143, "val/loss_avg_len_2048": 5.495897741303175, "val/perplexity_len_2048": 243.69019874687106, "val/loss_avg_len_1024": 5.50820064414118, "val/perplexity_len_1024": 246.7068140975722, "val/loss_avg_len_512": 5.5340119562351155, "val/perplexity_len_512": 253.1575333040166}
|
| 18 |
+
{"step": 754974720, "val/train_token_count": 754974720, "val/train_batch_count": 360, "val/train_flop_count": 0, "val/train_total_time": 3623.980893074011, "val/train_update_time": 1921.4990626386134, "val/loss": 5.454193331815151, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 101.68043452501297, "val/val_tokens_per_second": 402830.6939416551, "val/loss_avg_len_2048": 5.454193331815151, "val/perplexity_len_2048": 233.73624740691568, "val/loss_avg_len_1024": 5.466780889385893, "val/perplexity_len_1024": 236.69701117624166, "val/loss_avg_len_512": 5.492807074442693, "val/perplexity_len_512": 242.9381962186342}
|
| 19 |
+
{"step": 796917760, "val/train_token_count": 796917760, "val/train_batch_count": 380, "val/train_flop_count": 0, "val/train_total_time": 3832.4348147350247, "val/train_update_time": 2028.0436954226461, "val/loss": 5.414499440166541, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 100.82433698000386, "val/val_tokens_per_second": 406251.12177155656, "val/loss_avg_len_2048": 5.414499440166541, "val/perplexity_len_2048": 224.64007173409922, "val/loss_avg_len_1024": 5.427113480050698, "val/perplexity_len_1024": 227.4916376312745, "val/loss_avg_len_512": 5.453149790130276, "val/perplexity_len_512": 233.4924611121837}
|
| 20 |
+
{"step": 838860800, "val/train_token_count": 838860800, "val/train_batch_count": 400, "val/train_flop_count": 0, "val/train_total_time": 4040.0474286440294, "val/train_update_time": 2134.611642122676, "val/loss": 5.380052010235644, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 99.62901382899145, "val/val_tokens_per_second": 411125.2177032077, "val/loss_avg_len_2048": 5.380052010235644, "val/perplexity_len_2048": 217.03356309816712, "val/loss_avg_len_1024": 5.393027071399452, "val/perplexity_len_1024": 219.8679351651408, "val/loss_avg_len_512": 5.4193882748374715, "val/perplexity_len_512": 225.74098881334308}
|
| 21 |
+
{"step": 880803840, "val/train_token_count": 880803840, "val/train_batch_count": 420, "val/train_flop_count": 0, "val/train_total_time": 4246.9178847110015, "val/train_update_time": 2241.15703301772, "val/loss": 5.349708102982907, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 100.11544794699876, "val/val_tokens_per_second": 409127.6705037995, "val/loss_avg_len_2048": 5.349708102982907, "val/perplexity_len_2048": 210.5468309042001, "val/loss_avg_len_1024": 5.362598578461824, "val/perplexity_len_1024": 213.27844776051109, "val/loss_avg_len_512": 5.388854680254846, "val/perplexity_len_512": 218.95247130422538}
|
| 22 |
+
{"step": 922746880, "val/train_token_count": 922746880, "val/train_batch_count": 440, "val/train_flop_count": 0, "val/train_total_time": 4453.811728182016, "val/train_update_time": 2347.708159423666, "val/loss": 5.3179078935331665, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 101.09129549999489, "val/val_tokens_per_second": 405178.3073647728, "val/loss_avg_len_2048": 5.3179078935331665, "val/perplexity_len_2048": 203.9567361248487, "val/loss_avg_len_1024": 5.331117789819115, "val/perplexity_len_1024": 206.6688574375496, "val/loss_avg_len_512": 5.357798411159706, "val/perplexity_len_512": 212.2571287414731}
|
| 23 |
+
{"step": 964689920, "val/train_token_count": 964689920, "val/train_batch_count": 460, "val/train_flop_count": 0, "val/train_total_time": 4661.668510478048, "val/train_update_time": 2454.2510100168292, "val/loss": 5.291644628655117, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 101.32744767802069, "val/val_tokens_per_second": 404234.0050857196, "val/loss_avg_len_2048": 5.291644628655117, "val/perplexity_len_2048": 198.66989507746274, "val/loss_avg_len_1024": 5.304906864394906, "val/perplexity_len_1024": 201.32225127081918, "val/loss_avg_len_512": 5.331710340877599, "val/perplexity_len_512": 206.79135557739693}
|
| 24 |
+
{"step": 1006632960, "val/train_token_count": 1006632960, "val/train_batch_count": 480, "val/train_flop_count": 0, "val/train_total_time": 4869.772359199007, "val/train_update_time": 2560.802637038869, "val/loss": 5.267185813411148, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 101.22516736399848, "val/val_tokens_per_second": 404642.45272829, "val/loss_avg_len_2048": 5.267185813411148, "val/perplexity_len_2048": 193.86960878237306, "val/loss_avg_len_1024": 5.280538284004224, "val/perplexity_len_1024": 196.47560656877008, "val/loss_avg_len_512": 5.307623009739281, "val/perplexity_len_512": 201.8698150611123}
|
| 25 |
+
{"step": 1048576000, "val/train_token_count": 1048576000, "val/train_batch_count": 500, "val/train_flop_count": 0, "val/train_total_time": 5077.749329403043, "val/train_update_time": 2667.341171991662, "val/loss": 5.244305751117928, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 101.12790367001435, "val/val_tokens_per_second": 405031.6333428074, "val/loss_avg_len_2048": 5.244305751117928, "val/perplexity_len_2048": 189.48422034685896, "val/loss_avg_len_1024": 5.257994546739722, "val/perplexity_len_1024": 192.095865469511, "val/loss_avg_len_512": 5.285545470022318, "val/perplexity_len_512": 197.46186360448596}
|
| 26 |
+
{"step": 1090519040, "val/train_token_count": 1090519040, "val/train_batch_count": 520, "val/train_flop_count": 0, "val/train_total_time": 5286.093386778026, "val/train_update_time": 2773.886181908485, "val/loss": 5.224113088942878, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 101.72626927896636, "val/val_tokens_per_second": 402649.1907186179, "val/loss_avg_len_2048": 5.224113088942878, "val/perplexity_len_2048": 185.69640127735994, "val/loss_avg_len_1024": 5.2377807637095275, "val/perplexity_len_1024": 188.25186312502768, "val/loss_avg_len_512": 5.265193086130079, "val/perplexity_len_512": 193.48366419285284}
|
| 27 |
+
{"step": 1132462080, "val/train_token_count": 1132462080, "val/train_batch_count": 540, "val/train_flop_count": 0, "val/train_total_time": 5494.618768353015, "val/train_update_time": 2880.4332658784697, "val/loss": 5.204351608313579, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 101.6204348889878, "val/val_tokens_per_second": 403068.53680310975, "val/loss_avg_len_2048": 5.204351608313579, "val/perplexity_len_2048": 182.0627864926695, "val/loss_avg_len_1024": 5.218173131045303, "val/perplexity_len_1024": 184.59664196761443, "val/loss_avg_len_512": 5.245799912956764, "val/perplexity_len_512": 189.76755205696549}
|
| 28 |
+
{"step": 1174405120, "val/train_token_count": 1174405120, "val/train_batch_count": 560, "val/train_flop_count": 0, "val/train_total_time": 5703.016606429999, "val/train_update_time": 2986.9915024373913, "val/loss": 5.187859910127178, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 100.53107214701595, "val/val_tokens_per_second": 407436.2197202113, "val/loss_avg_len_2048": 5.187859910127178, "val/perplexity_len_2048": 179.08488478780052, "val/loss_avg_len_1024": 5.201843161700579, "val/perplexity_len_1024": 181.6066640311323, "val/loss_avg_len_512": 5.229689651350794, "val/perplexity_len_512": 186.7348416205781}
|
| 29 |
+
{"step": 1216348160, "val/train_token_count": 1216348160, "val/train_batch_count": 580, "val/train_flop_count": 0, "val/train_total_time": 5910.318671693036, "val/train_update_time": 3093.5336661074543, "val/loss": 5.1715570674947635, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 100.04047837899998, "val/val_tokens_per_second": 409434.2676453867, "val/loss_avg_len_2048": 5.1715570674947635, "val/perplexity_len_2048": 176.18896211925417, "val/loss_avg_len_1024": 5.185615149655531, "val/perplexity_len_1024": 178.68333297978904, "val/loss_avg_len_512": 5.213619250194659, "val/perplexity_len_512": 183.75792201945603}
|
| 30 |
+
{"step": 1258291200, "val/train_token_count": 1258291200, "val/train_batch_count": 600, "val/train_flop_count": 0, "val/train_total_time": 6117.130997166038, "val/train_update_time": 3200.0810677845147, "val/loss": 5.157462475304946, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 99.89286627602996, "val/val_tokens_per_second": 410039.2903615046, "val/loss_avg_len_2048": 5.157462475304946, "val/perplexity_len_2048": 173.72306924924297, "val/loss_avg_len_1024": 5.171683135021537, "val/perplexity_len_1024": 176.2111752260996, "val/loss_avg_len_512": 5.199810537778202, "val/perplexity_len_512": 181.23790088671788}
|
| 31 |
+
{"step": 1300234240, "val/train_token_count": 1300234240, "val/train_batch_count": 620, "val/train_flop_count": 0, "val/train_total_time": 6324.271360302053, "val/train_update_time": 3306.647373120475, "val/loss": 5.143889664166529, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 100.14473170100246, "val/val_tokens_per_second": 409008.0357126763, "val/loss_avg_len_2048": 5.143889664166529, "val/perplexity_len_2048": 171.38108842507802, "val/loss_avg_len_1024": 5.15810240709968, "val/perplexity_len_1024": 173.83427574322482, "val/loss_avg_len_512": 5.18635319549219, "val/perplexity_len_512": 178.81525814719697}
|
| 32 |
+
{"step": 1342177280, "val/train_token_count": 1342177280, "val/train_batch_count": 640, "val/train_flop_count": 0, "val/train_total_time": 6531.185974933032, "val/train_update_time": 3413.19108713849, "val/loss": 5.131008138038125, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 99.94807949004462, "val/val_tokens_per_second": 409812.7768836203, "val/loss_avg_len_2048": 5.131008138038125, "val/perplexity_len_2048": 169.1875965491763, "val/loss_avg_len_1024": 5.145431026424136, "val/perplexity_len_1024": 171.6454524544731, "val/loss_avg_len_512": 5.173947728090605, "val/perplexity_len_512": 176.61067401256858}
|
| 33 |
+
{"step": 1384120320, "val/train_token_count": 1384120320, "val/train_batch_count": 660, "val/train_flop_count": 0, "val/train_total_time": 6737.915407613036, "val/train_update_time": 3519.7321525084553, "val/loss": 5.120392005993269, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 99.73520365200238, "val/val_tokens_per_second": 410687.4854631898, "val/loss_avg_len_2048": 5.120392005993269, "val/perplexity_len_2048": 167.40097894766583, "val/loss_avg_len_1024": 5.13483073173333, "val/perplexity_len_1024": 169.83556966900616, "val/loss_avg_len_512": 5.163387195022637, "val/perplexity_len_512": 174.75538481351157}
|
| 34 |
+
{"step": 1426063360, "val/train_token_count": 1426063360, "val/train_batch_count": 680, "val/train_flop_count": 0, "val/train_total_time": 6944.4543808570015, "val/train_update_time": 3626.2884129853337, "val/loss": 5.110702060040148, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 99.9236697970191, "val/val_tokens_per_second": 409912.8873389507, "val/loss_avg_len_2048": 5.110702060040148, "val/perplexity_len_2048": 165.78670624776979, "val/loss_avg_len_1024": 5.125157545317412, "val/perplexity_len_1024": 168.20063878756756, "val/loss_avg_len_512": 5.153955277375667, "val/perplexity_len_512": 173.11485524738933}
|
| 35 |
+
{"step": 1468006400, "val/train_token_count": 1468006400, "val/train_batch_count": 700, "val/train_flop_count": 0, "val/train_total_time": 7151.144483595039, "val/train_update_time": 3732.833151328552, "val/loss": 5.1014151430890955, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 99.96349205297884, "val/val_tokens_per_second": 409749.59116366145, "val/loss_avg_len_2048": 5.1014151430890955, "val/perplexity_len_2048": 164.25418608357114, "val/loss_avg_len_1024": 5.116010741177417, "val/perplexity_len_1024": 166.6691552558016, "val/loss_avg_len_512": 5.1449303319406585, "val/perplexity_len_512": 171.55953203505013}
|
| 36 |
+
{"step": 1509949440, "val/train_token_count": 1509949440, "val/train_batch_count": 720, "val/train_flop_count": 0, "val/train_total_time": 7358.337214609026, "val/train_update_time": 3839.3859579174896, "val/loss": 5.093367775315, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 99.89132478396641, "val/val_tokens_per_second": 410045.61796115554, "val/loss_avg_len_2048": 5.093367775315, "val/perplexity_len_2048": 162.93767656263796, "val/loss_avg_len_1024": 5.108063309450646, "val/perplexity_len_1024": 165.34981315838286, "val/loss_avg_len_512": 5.137162590507989, "val/perplexity_len_512": 170.23206433751022}
|
| 37 |
+
{"step": 1551892480, "val/train_token_count": 1551892480, "val/train_batch_count": 740, "val/train_flop_count": 0, "val/train_total_time": 7565.03086849401, "val/train_update_time": 3945.9412919793394, "val/loss": 5.086384535997531, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 99.77674050303176, "val/val_tokens_per_second": 410516.5171110737, "val/loss_avg_len_2048": 5.086384535997531, "val/perplexity_len_2048": 161.80380742097068, "val/loss_avg_len_1024": 5.101098106556922, "val/perplexity_len_1024": 164.20211975991967, "val/loss_avg_len_512": 5.1302048265572875, "val/perplexity_len_512": 169.05174078500627}
|
| 38 |
+
{"step": 1593835520, "val/train_token_count": 1593835520, "val/train_batch_count": 760, "val/train_flop_count": 0, "val/train_total_time": 7771.636799781001, "val/train_update_time": 4052.503600837372, "val/loss": 5.080432867527884, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 99.85730355099076, "val/val_tokens_per_second": 410185.31988583424, "val/loss_avg_len_2048": 5.080432867527884, "val/perplexity_len_2048": 160.84366486138146, "val/loss_avg_len_1024": 5.09520770714046, "val/perplexity_len_1024": 163.2377467490014, "val/loss_avg_len_512": 5.124402106901945, "val/perplexity_len_512": 168.07362154641356}
|
| 39 |
+
{"step": 1635778560, "val/train_token_count": 1635778560, "val/train_batch_count": 780, "val/train_flop_count": 0, "val/train_total_time": 7978.290488699044, "val/train_update_time": 4159.058140857378, "val/loss": 5.0751120198699065, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 101.50360889302101, "val/val_tokens_per_second": 403532.4501926773, "val/loss_avg_len_2048": 5.0751120198699065, "val/perplexity_len_2048": 159.9901130472598, "val/loss_avg_len_1024": 5.08987260026416, "val/perplexity_len_1024": 162.36917494575445, "val/loss_avg_len_512": 5.11916280782083, "val/perplexity_len_512": 167.19533638390328}
|
| 40 |
+
{"step": 1677721600, "val/train_token_count": 1677721600, "val/train_batch_count": 800, "val/train_flop_count": 0, "val/train_total_time": 8186.5969784220215, "val/train_update_time": 4265.60545003548, "val/loss": 5.070432775841683, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 101.58954598399578, "val/val_tokens_per_second": 403191.0921863236, "val/loss_avg_len_2048": 5.070432775841683, "val/perplexity_len_2048": 159.24322905521015, "val/loss_avg_len_1024": 5.085260728531238, "val/perplexity_len_1024": 161.6220732303632, "val/loss_avg_len_512": 5.1146308313941, "val/perplexity_len_512": 166.43932546654347}
|
| 41 |
+
{"step": 1719664640, "val/train_token_count": 1719664640, "val/train_batch_count": 820, "val/train_flop_count": 0, "val/train_total_time": 8395.40680388402, "val/train_update_time": 4372.159924876352, "val/loss": 5.0664675774028645, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 101.43134209199343, "val/val_tokens_per_second": 403819.95500810014, "val/loss_avg_len_2048": 5.0664675774028645, "val/perplexity_len_2048": 158.6130482735774, "val/loss_avg_len_1024": 5.0813700341027355, "val/perplexity_len_1024": 160.99447282291885, "val/loss_avg_len_512": 5.110828997220903, "val/perplexity_len_512": 165.8077520805886}
|
| 42 |
+
{"step": 1761607680, "val/train_token_count": 1761607680, "val/train_batch_count": 840, "val/train_flop_count": 0, "val/train_total_time": 8603.637448858004, "val/train_update_time": 4478.715910169412, "val/loss": 5.063192765613369, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 99.8011323119863, "val/val_tokens_per_second": 410416.1851786989, "val/loss_avg_len_2048": 5.063192765613369, "val/perplexity_len_2048": 158.09446997973149, "val/loss_avg_len_1024": 5.078104476989369, "val/perplexity_len_1024": 160.46959365486623, "val/loss_avg_len_512": 5.107591516341537, "val/perplexity_len_512": 165.27182065555164}
|
| 43 |
+
{"step": 1803550720, "val/train_token_count": 1803550720, "val/train_batch_count": 860, "val/train_flop_count": 0, "val/train_total_time": 8810.230298971, "val/train_update_time": 4585.266904477379, "val/loss": 5.060545605122825, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 99.59589945495827, "val/val_tokens_per_second": 411261.91162643145, "val/loss_avg_len_2048": 5.060545605122825, "val/perplexity_len_2048": 157.67652197681062, "val/loss_avg_len_1024": 5.075487850701, "val/perplexity_len_1024": 160.0502535650356, "val/loss_avg_len_512": 5.104997573237831, "val/perplexity_len_512": 164.84367049477257}
|
| 44 |
+
{"step": 1845493760, "val/train_token_count": 1845493760, "val/train_batch_count": 880, "val/train_flop_count": 0, "val/train_total_time": 9016.607044072007, "val/train_update_time": 4691.8174923404, "val/loss": 5.058485259354685, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 99.55429247097345, "val/val_tokens_per_second": 411433.79138516303, "val/loss_avg_len_2048": 5.058485259354685, "val/perplexity_len_2048": 157.35198826265747, "val/loss_avg_len_1024": 5.073454285918526, "val/perplexity_len_1024": 159.72511171651485, "val/loss_avg_len_512": 5.103029847818229, "val/perplexity_len_512": 164.51962233753557}
|
| 45 |
+
{"step": 1887436800, "val/train_token_count": 1887436800, "val/train_batch_count": 900, "val/train_flop_count": 0, "val/train_total_time": 9222.955616571999, "val/train_update_time": 4798.3678903255495, "val/loss": 5.056988405886034, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 100.03087017301004, "val/val_tokens_per_second": 409473.594792857, "val/loss_avg_len_2048": 5.056988405886034, "val/perplexity_len_2048": 157.1166315844018, "val/loss_avg_len_1024": 5.071977700420563, "val/perplexity_len_1024": 159.48943797193047, "val/loss_avg_len_512": 5.1015830969281035, "val/perplexity_len_512": 164.2817755215082}
|
| 46 |
+
{"step": 1929379840, "val/train_token_count": 1929379840, "val/train_batch_count": 920, "val/train_flop_count": 0, "val/train_total_time": 9430.23481100105, "val/train_update_time": 4904.91280556639, "val/loss": 5.055763588758994, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 100.07011089101434, "val/val_tokens_per_second": 409313.02698973974, "val/loss_avg_len_2048": 5.055763588758994, "val/perplexity_len_2048": 156.92431024637185, "val/loss_avg_len_1024": 5.0707623975899185, "val/perplexity_len_1024": 159.29572773864462, "val/loss_avg_len_512": 5.100373116077785, "val/perplexity_len_512": 164.08311792924914}
|
| 47 |
+
{"step": 1971322880, "val/train_token_count": 1971322880, "val/train_batch_count": 940, "val/train_flop_count": 0, "val/train_total_time": 9637.072636537021, "val/train_update_time": 5011.441395019239, "val/loss": 5.055077503644489, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 100.58326662296895, "val/val_tokens_per_second": 407224.79369790596, "val/loss_avg_len_2048": 5.055077503644489, "val/perplexity_len_2048": 156.81668373770228, "val/loss_avg_len_1024": 5.070088747809251, "val/perplexity_len_1024": 159.1884543429987, "val/loss_avg_len_512": 5.099732340347279, "val/perplexity_len_512": 163.9780111280343}
|
| 48 |
+
{"step": 2013265920, "val/train_token_count": 2013265920, "val/train_batch_count": 960, "val/train_flop_count": 0, "val/train_total_time": 9844.444105764036, "val/train_update_time": 5117.988080174255, "val/loss": 5.054681106794774, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 99.61741263099248, "val/val_tokens_per_second": 411173.09633132076, "val/loss_avg_len_2048": 5.054681106794774, "val/perplexity_len_2048": 156.75453441699727, "val/loss_avg_len_1024": 5.069690365717549, "val/perplexity_len_1024": 159.12504914416917, "val/loss_avg_len_512": 5.0993228254112655, "val/perplexity_len_512": 163.9108734311827}
|
| 49 |
+
{"step": 2055208960, "val/train_token_count": 2055208960, "val/train_batch_count": 980, "val/train_flop_count": 0, "val/train_total_time": 10050.842236216005, "val/train_update_time": 5224.530718925176, "val/loss": 5.054502099541319, "val/val_token_count": 40960000, "val/val_seq_count": 20000, "val/val_time": 99.76580651698168, "val/val_tokens_per_second": 410561.5082962114, "val/loss_avg_len_2048": 5.054502099541319, "val/perplexity_len_2048": 156.72647672966426, "val/loss_avg_len_1024": 5.069511245464872, "val/perplexity_len_1024": 159.09654917769598, "val/loss_avg_len_512": 5.099155388283602, "val/perplexity_len_512": 163.88343096285075}
|
metrics/npz/train_eval/step-000000104857600.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 20540
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5ca2b6fd9a44859a14d30a8b8b835115a070dce0ccab094a1711a916bcdee76b
|
| 3 |
size 20540
|
metrics/npz/train_eval/step-000000209715200.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 20540
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:efc94aaa1ae3287e6e0ecc7f4ae4f5d4c9f8a5fbf7ba1be95807a3ec73607dd6
|
| 3 |
size 20540
|
metrics/npz/train_eval/step-000000314572800.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 20540
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:bf8679ea18aa02fece3c2aee47599c098b3e7ec97a05ca9b1ca46dac8b712223
|
| 3 |
size 20540
|
metrics/npz/train_eval/step-000000419430400.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 20540
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d895609bd6b3c4f3a17d8069f4131872034adb440ed9810322ce88838c1c3990
|
| 3 |
size 20540
|
metrics/npz/train_eval/step-000000524288000.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 20540
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9f8c1ca16cfdc3493df8af18fdfaaae4386ecf358be656b38d39920e6bd972e0
|
| 3 |
size 20540
|
metrics/npz/train_eval/step-000000629145600.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 20540
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:92cdfac9bbbe35185425c7e209695a6e67e906a583c716b17cd311222fd408d2
|
| 3 |
size 20540
|
metrics/npz/train_eval/step-000000734003200.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 20540
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:eb65bb2177a5d1c53c6726ab856a944b566fd208c9b92e1b80dd0e03564f2eab
|
| 3 |
size 20540
|
metrics/npz/train_eval/step-000000838860800.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 20540
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9aa86ff00d18908412d7f09cd7617e7a41dbd33dddb13d3e9c91630b332d1496
|
| 3 |
size 20540
|
metrics/npz/train_eval/step-000000943718400.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 20540
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c087c1312c95a2beb753576a44d0c77fcf698d9d9bb1643ae3bb43ca6d99c908
|
| 3 |
size 20540
|
metrics/npz/train_eval/step-000001048576000.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 20540
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:94cee7bf23eb0b037070c3e131b17f61ff9256e5070bae1a58af07dccc4c1eb7
|
| 3 |
size 20540
|
metrics/npz/train_eval/step-000001153433600.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 20540
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f0592dcf61d27ffb9891c62209eddcec508944613cde77346812c2cee129814e
|
| 3 |
size 20540
|
metrics/npz/train_eval/step-000001258291200.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 20540
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0d7fd6cff024296d3d4e37a5568be8739860f647d958ef0cbee26959e9c6744c
|
| 3 |
size 20540
|
metrics/npz/train_eval/step-000001363148800.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 20540
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7718a521eb87cfcd6dbb500a3c0b4ae5e36526ee28ab0ad519ce36d74065d1dc
|
| 3 |
size 20540
|
metrics/npz/train_eval/step-000001468006400.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 20540
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0a377760799d38ceef201c02ea112f48d12c48bfd5c2ef9194a67281b3c4b598
|
| 3 |
size 20540
|
metrics/npz/train_eval/step-000001572864000.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 20540
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:80a518d1389fea79c6b204dd78b21fe4643e3e61b65d2104807b653b5abca991
|
| 3 |
size 20540
|
metrics/npz/train_eval/step-000001677721600.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 20540
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:42b19cc212c4ae90e7d5a9f69978c1bda5a50ad2c4f18d814663d8ea45b0f899
|
| 3 |
size 20540
|
metrics/npz/train_eval/step-000001782579200.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 20540
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:edf561f29428519e6f1e81f52f81e0cad663038ec83149bc48754f8b9defa9ea
|
| 3 |
size 20540
|
metrics/npz/train_eval/step-000001887436800.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 20540
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3939f70caf466c0a57c7e8ebbee8b34400feae0428d57911a40385f41221b52f
|
| 3 |
size 20540
|
metrics/npz/train_eval/step-000001992294400.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 20540
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0bd0a5bf73c23210012dde63862c968e6f053177e638cb68e029b32a1162cacc
|
| 3 |
size 20540
|
metrics/npz/val/step-000000041943040.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 21142
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1f9ba5fd2709899a0902dc4d605b1b4b400b141e5c98b4d02e5a00bc815f29ec
|
| 3 |
size 21142
|
metrics/npz/val/step-000000083886080.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 21142
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8e60868b7de6855b5a73d630b2f82011a16b50d3c9b1e6e4a78a7170a1e7405e
|
| 3 |
size 21142
|
metrics/npz/val/step-000000125829120.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 21142
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c2b10b365ab893feeadacb3cefb6946b4e7714b116522d65e756a83c378920f3
|
| 3 |
size 21142
|
metrics/npz/val/step-000000167772160.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 21142
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8cdfdd1716cb177612d3a62862afba6f66eca558bd607f1a67affa956a315f00
|
| 3 |
size 21142
|
metrics/npz/val/step-000000209715200.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 21142
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:21905eca67d4de40c9080cb2c178d46bc017660a757e9f25c657a69d443ba3ec
|
| 3 |
size 21142
|
metrics/npz/val/step-000000251658240.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 21142
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:352b137c1b691c16a923255fa0b63b761bd51ac169370d8da45706bbb7a04c4a
|
| 3 |
size 21142
|
metrics/npz/val/step-000000293601280.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 21142
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:cf3df61f67c3b510a24a76da42144e41631b703d6f3cf777d21b58e3d925f29b
|
| 3 |
size 21142
|
metrics/npz/val/step-000000335544320.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 21142
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6ae4ee9a3e3af0b555a0c72a2183a19a98ad35f2da4d7e1e5efc7c24cc10e50e
|
| 3 |
size 21142
|
metrics/npz/val/step-000000377487360.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 21142
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:17d945dd8424f0cd27d4b6d0d068c74c6c5b58dd33fc856fc2bdd92e38e5c7cf
|
| 3 |
size 21142
|
metrics/npz/val/step-000000419430400.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 21142
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:da308e3e186476c46574c1dd01a3562d86f10052ab2ca4e917939c116c891953
|
| 3 |
size 21142
|
metrics/npz/val/step-000000461373440.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 21142
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0c7512604a0d29b4dd95fdd2c4df34b03a293d3f1bdb0ce93fcff6a68887799a
|
| 3 |
size 21142
|
metrics/npz/val/step-000000503316480.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 21142
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e2b21b16830d02347033e6ef4c3d740739cca8b48c7f787db45dd0e882dc6431
|
| 3 |
size 21142
|