CarlOwOs commited on
Commit
1c813eb
·
verified ·
1 Parent(s): 21e1509

Add files using upload-large-folder tool

Browse files
Files changed (34) hide show
  1. .gitattributes +1 -0
  2. checkpoint/step-0/.metadata +0 -0
  3. checkpoint/step-144/__5_0.distcp +3 -0
  4. logs/none_4cvjdbqa/attempt_0/0/stderr.log +621 -0
  5. logs/none_4cvjdbqa/attempt_0/0/stdout.log +0 -0
  6. logs/none_4cvjdbqa/attempt_0/1/stderr.log +620 -0
  7. logs/none_4cvjdbqa/attempt_0/1/stdout.log +0 -0
  8. logs/none_4cvjdbqa/attempt_0/2/stderr.log +620 -0
  9. logs/none_4cvjdbqa/attempt_0/2/stdout.log +0 -0
  10. logs/none_4cvjdbqa/attempt_0/3/stderr.log +620 -0
  11. logs/none_4cvjdbqa/attempt_0/3/stdout.log +0 -0
  12. logs/none_4cvjdbqa/attempt_0/4/stderr.log +620 -0
  13. logs/none_4cvjdbqa/attempt_0/4/stdout.log +0 -0
  14. logs/none_4cvjdbqa/attempt_0/5/stderr.log +620 -0
  15. logs/none_4cvjdbqa/attempt_0/5/stdout.log +0 -0
  16. logs/none_4cvjdbqa/attempt_0/6/stderr.log +620 -0
  17. logs/none_4cvjdbqa/attempt_0/6/stdout.log +0 -0
  18. logs/none_4cvjdbqa/attempt_0/7/stderr.log +620 -0
  19. logs/none_4cvjdbqa/attempt_0/7/stdout.log +0 -0
  20. logs/none_rci5peh0/attempt_0/0/stderr.log +333 -0
  21. logs/none_rci5peh0/attempt_0/1/stderr.log +332 -0
  22. logs/none_rci5peh0/attempt_0/1/stdout.log +0 -0
  23. logs/none_rci5peh0/attempt_0/2/stderr.log +332 -0
  24. logs/none_rci5peh0/attempt_0/2/stdout.log +0 -0
  25. logs/none_rci5peh0/attempt_0/3/stderr.log +332 -0
  26. logs/none_rci5peh0/attempt_0/3/stdout.log +0 -0
  27. logs/none_rci5peh0/attempt_0/4/stderr.log +332 -0
  28. logs/none_rci5peh0/attempt_0/4/stdout.log +0 -0
  29. logs/none_rci5peh0/attempt_0/5/stderr.log +332 -0
  30. logs/none_rci5peh0/attempt_0/5/stdout.log +0 -0
  31. logs/none_rci5peh0/attempt_0/6/stderr.log +332 -0
  32. logs/none_rci5peh0/attempt_0/6/stdout.log +0 -0
  33. logs/none_rci5peh0/attempt_0/7/stderr.log +332 -0
  34. logs/none_rci5peh0/attempt_0/7/stdout.log +0 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ checkpoint/step-144/__5_0.distcp filter=lfs diff=lfs merge=lfs -text
checkpoint/step-0/.metadata ADDED
Binary file (92.5 kB). View file
 
checkpoint/step-144/__5_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:827419ab5b73e2042a48d1d0919abe6b8333a781c34797c15ad4cdff61a7e322
3
+ size 11004728080
logs/none_4cvjdbqa/attempt_0/0/stderr.log ADDED
@@ -0,0 +1,621 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [titan] 2026-01-06 20:23:28,613 - root - INFO - Starting job: default job
2
+ [titan] 2026-01-06 20:23:28,614 - root - INFO - {
3
+ "activation_checkpoint": {
4
+ "mode": "none",
5
+ "selective_ac_option": "2"
6
+ },
7
+ "activation_offload": {
8
+ "mode": "none"
9
+ },
10
+ "checkpoint": {
11
+ "async_mode": "disabled",
12
+ "create_seed_checkpoint": false,
13
+ "enable_checkpoint": true,
14
+ "exclude_from_loading": [],
15
+ "export_dtype": "float32",
16
+ "folder": "checkpoint",
17
+ "initial_load_model_weights_only": true,
18
+ "initial_load_path": null,
19
+ "interval": 16,
20
+ "interval_type": "steps",
21
+ "keep_latest_k": 0,
22
+ "last_save_model_weights_only": false,
23
+ "load_step": -1,
24
+ "model_weights_only": false
25
+ },
26
+ "comm": {
27
+ "init_timeout_seconds": 300,
28
+ "trace_buf_size": 20000,
29
+ "train_timeout_seconds": 100
30
+ },
31
+ "experimental": {
32
+ "context_parallel_degree": 1,
33
+ "context_parallel_rotate_method": "allgather",
34
+ "custom_model_path": "",
35
+ "enable_async_tensor_parallel": false,
36
+ "enable_compiled_autograd": false,
37
+ "pipeline_parallel_degree": 1,
38
+ "pipeline_parallel_microbatches": null,
39
+ "pipeline_parallel_schedule": "1F1B",
40
+ "pipeline_parallel_schedule_csv": "",
41
+ "pipeline_parallel_split_points": []
42
+ },
43
+ "fault_tolerance": {
44
+ "enable": false,
45
+ "group_size": 0,
46
+ "min_replica_size": 1,
47
+ "replica_id": 0
48
+ },
49
+ "float8": {
50
+ "enable_fsdp_float8_all_gather": false,
51
+ "force_recompute_fp8_weight_in_bwd": false,
52
+ "precompute_float8_dynamic_scale_for_fsdp": false,
53
+ "recipe_name": null
54
+ },
55
+ "job": {
56
+ "config_file": "flame/models/fla.toml",
57
+ "description": "default job",
58
+ "dump_folder": "/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B",
59
+ "print_args": true,
60
+ "use_for_integration_test": false
61
+ },
62
+ "lr_scheduler": {
63
+ "decay_ratio": null,
64
+ "decay_type": "cosine",
65
+ "lr_min": 0.1,
66
+ "warmup_steps": 1024
67
+ },
68
+ "memory_estimation": {
69
+ "disable_fake_mode": false,
70
+ "enabled": false
71
+ },
72
+ "metrics": {
73
+ "disable_color_printing": false,
74
+ "enable_tensorboard": false,
75
+ "enable_wandb": true,
76
+ "log_freq": 1,
77
+ "save_for_all_ranks": false,
78
+ "save_tb_folder": "tb"
79
+ },
80
+ "model": {
81
+ "config": "/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/config.json",
82
+ "converters": [],
83
+ "name": "fla",
84
+ "print_after_conversion": false,
85
+ "tokenizer_path": "/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B"
86
+ },
87
+ "optimizer": {
88
+ "beta1": 0.9,
89
+ "beta2": 0.95,
90
+ "early_step_in_backward": false,
91
+ "eps": 1e-15,
92
+ "implementation": "fused",
93
+ "lr": 0.0004,
94
+ "name": "AdamW",
95
+ "weight_decay": 0.1
96
+ },
97
+ "profiling": {
98
+ "enable_memory_snapshot": false,
99
+ "enable_profiling": true,
100
+ "profile_freq": 512,
101
+ "save_memory_snapshot_folder": "memory_snapshot",
102
+ "save_traces_folder": "profile_trace"
103
+ },
104
+ "training": {
105
+ "batch_size": 2,
106
+ "compile": true,
107
+ "context_len": 2048,
108
+ "data_dir": null,
109
+ "data_files": null,
110
+ "data_parallel_replicate_degree": 1,
111
+ "data_parallel_shard_degree": 8,
112
+ "data_probs": null,
113
+ "dataset": "/mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu",
114
+ "dataset_name": "default",
115
+ "dataset_split": "train",
116
+ "deterministic": false,
117
+ "disable_loss_parallel": true,
118
+ "enable_cpu_offload": false,
119
+ "fsdp_reshard_after_forward": "default",
120
+ "gc_freq": 50,
121
+ "gradient_accumulation_steps": 16,
122
+ "max_norm": 1.0,
123
+ "mixed_precision_param": "bfloat16",
124
+ "mixed_precision_reduce": "float32",
125
+ "num_workers": 8,
126
+ "persistent_workers": false,
127
+ "pin_memory": false,
128
+ "prefetch_factor": 2,
129
+ "seed": 42,
130
+ "seq_len": 2048,
131
+ "skip_nan_inf": true,
132
+ "steps": 3072,
133
+ "streaming": true,
134
+ "tensor_parallel_degree": 1,
135
+ "varlen": false
136
+ }
137
+ }
138
+ [titan] 2026-01-06 20:23:28,615 - root - INFO - [GC] Initial GC collection. 0.00 seconds.
139
+ [titan] 2026-01-06 20:23:30,017 - root - WARNING - ENV[TORCH_NCCL_ASYNC_ERROR_HANDLING] = 1 will be overridden to 3 based on job config
140
+ [titan] 2026-01-06 20:23:30,025 - root - INFO - CUDA capacity: NVIDIA A100-SXM4-80GB with 79.25GiB memory
141
+ [titan] 2026-01-06 20:23:30,028 - root - WARNING - Error running lspci: [Errno 2] No such file or directory: 'lspci', fallback to use device_name
142
+ [titan] 2026-01-06 20:23:30,029 - root - INFO - Peak FLOPS used for computing MFU: 3.120e+14
143
+ [titan] 2026-01-06 20:23:30,029 - root - INFO - Building 1-D device mesh with ['dp_shard'], [8]
144
+ [titan] 2026-01-06 20:23:30,053 - root - INFO - Loading tokenizer...
145
+ The tokenizer you are loading from '/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.
146
+ [titan] 2026-01-06 20:23:30,412 - root - INFO - Qwen2TokenizerFast(name_or_path='/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B', vocab_size=151643, model_max_length=10000000000, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'eos_token': '<|im_end|>', 'pad_token': '<|endoftext|>', 'additional_special_tokens': ['<|im_start|>', '<|im_end|>', '<|object_ref_start|>', '<|object_ref_end|>', '<|box_start|>', '<|box_end|>', '<|quad_start|>', '<|quad_end|>', '<|vision_start|>', '<|vision_end|>', '<|vision_pad|>', '<|image_pad|>', '<|video_pad|>']}, clean_up_tokenization_spaces=False, added_tokens_decoder={
147
+ 151643: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
148
+ 151644: AddedToken("<|im_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
149
+ 151645: AddedToken("<|im_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
150
+ 151646: AddedToken("<|object_ref_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
151
+ 151647: AddedToken("<|object_ref_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
152
+ 151648: AddedToken("<|box_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
153
+ 151649: AddedToken("<|box_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
154
+ 151650: AddedToken("<|quad_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
155
+ 151651: AddedToken("<|quad_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
156
+ 151652: AddedToken("<|vision_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
157
+ 151653: AddedToken("<|vision_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
158
+ 151654: AddedToken("<|vision_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
159
+ 151655: AddedToken("<|image_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
160
+ 151656: AddedToken("<|video_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
161
+ 151657: AddedToken("<tool_call>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
162
+ 151658: AddedToken("</tool_call>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
163
+ 151659: AddedToken("<|fim_prefix|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
164
+ 151660: AddedToken("<|fim_middle|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
165
+ 151661: AddedToken("<|fim_suffix|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
166
+ 151662: AddedToken("<|fim_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
167
+ 151663: AddedToken("<|repo_name|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
168
+ 151664: AddedToken("<|file_sep|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
169
+ 151665: AddedToken("<tool_response>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
170
+ 151666: AddedToken("</tool_response>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
171
+ 151667: AddedToken("<think>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
172
+ 151668: AddedToken("</think>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
173
+ }
174
+ )
175
+ [titan] 2026-01-06 20:23:30,412 - root - INFO - Loading dataset /mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu:default
176
+ `trust_remote_code` is not supported anymore.
177
+ Please check that the Hugging Face dataset '/mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu' isn't based on a loading script and remove `trust_remote_code`.
178
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
179
+ [titan] 2026-01-06 20:23:30,412 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
180
+ Please check that the Hugging Face dataset '/mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu' isn't based on a loading script and remove `trust_remote_code`.
181
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
182
+ [titan] 2026-01-06 20:23:31,008 - root - INFO - Shuffling the dataset with seed 42
183
+ [titan] 2026-01-06 20:23:31,010 - root - INFO - IterableDataset({
184
+ features: ['text', 'id', 'dump', 'url', 'file_path', 'language', 'language_score', 'token_count', 'score', 'int_score'],
185
+ num_shards: 360
186
+ })
187
+ [titan] 2026-01-06 20:23:31,010 - root - INFO - Building dataloader...
188
+ [titan] 2026-01-06 20:23:31,012 - root - INFO - Loading model config from /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/config.json
189
+ [titan] 2026-01-06 20:23:31,014 - root - INFO - Building model from the config
190
+ GSAConfig {
191
+ "architectures": [
192
+ "GSAForCausalLM"
193
+ ],
194
+ "attn": null,
195
+ "bos_token_id": 151643,
196
+ "clamp_max": null,
197
+ "clamp_min": null,
198
+ "conv_size": 4,
199
+ "dtype": "bfloat16",
200
+ "elementwise_affine": false,
201
+ "eos_token_id": 151645,
202
+ "expand_k": 1,
203
+ "expand_v": 1,
204
+ "feature_map": "swish",
205
+ "fuse_cross_entropy": true,
206
+ "fuse_linear_cross_entropy": false,
207
+ "fuse_norm": true,
208
+ "fuse_swiglu": true,
209
+ "gate_logit_normalizer": 8,
210
+ "hidden_act": "swish",
211
+ "hidden_ratio": 4,
212
+ "hidden_size": 5120,
213
+ "initializer_range": 0.02,
214
+ "intermediate_size": 17408,
215
+ "max_position_embeddings": 40960,
216
+ "model_type": "gsa",
217
+ "norm_eps": 1e-06,
218
+ "num_heads": 40,
219
+ "num_hidden_layers": 40,
220
+ "num_kv_heads": 8,
221
+ "num_slots": 256,
222
+ "rope_theta": 1000000,
223
+ "share_conv_kernel": true,
224
+ "tie_word_embeddings": true,
225
+ "transformers_version": "4.57.3",
226
+ "use_cache": true,
227
+ "use_l2warp": false,
228
+ "use_norm": true,
229
+ "use_output_gate": true,
230
+ "use_rope": false,
231
+ "use_short_conv": false,
232
+ "vocab_size": 151936
233
+ }
234
+ 
235
+ [titan] 2026-01-06 20:23:31,167 - root - INFO - 
236
+ GSAForCausalLM(
237
+ (model): GSAModel(
238
+ (embeddings): Embedding(151936, 5120)
239
+ (layers): ModuleList(
240
+ (0-39): 40 x GSABlock(
241
+ (attn_norm): RMSNorm(5120, eps=1e-06)
242
+ (attn): GatedSlotAttention(
243
+ (feature_map): SwishFeatureMap()
244
+ (q_proj): Linear(in_features=5120, out_features=5120, bias=False)
245
+ (k_proj): Linear(in_features=5120, out_features=1024, bias=False)
246
+ (v_proj): Linear(in_features=5120, out_features=1024, bias=False)
247
+ (f_proj): Linear(in_features=5120, out_features=2048, bias=False)
248
+ (g_norm): RMSNorm(5120, elementwise_affine=False, eps=1e-06)
249
+ (o_proj): Linear(in_features=5120, out_features=5120, bias=False)
250
+ )
251
+ (mlp_norm): RMSNorm(5120, eps=1e-06)
252
+ (mlp): GatedMLP(
253
+ (gate_proj): Linear(in_features=5120, out_features=17408, bias=False)
254
+ (up_proj): Linear(in_features=5120, out_features=17408, bias=False)
255
+ (down_proj): Linear(in_features=17408, out_features=5120, bias=False)
256
+ (swiglu_linear): SwiGLULinear()
257
+ )
258
+ )
259
+ )
260
+ (norm): RMSNorm(5120, eps=1e-06)
261
+ )
262
+ (lm_head): Linear(in_features=5120, out_features=151936, bias=False)
263
+ )
264
+
265
+ [titan] 2026-01-06 20:23:31,229 - root - INFO - Compiling each block with torch.compile
266
+ [titan] 2026-01-06 20:23:31,229 - root - INFO - Compiling the embedding, norm, and lm_head layers with torch.compile
267
+ [titan] 2026-01-06 20:23:31,230 - root - INFO - Compiling the entire model with torch.compile
268
+ [titan] 2026-01-06 20:23:31,359 - root - INFO - Applied FSDP to the model
269
+ [titan] 2026-01-06 20:23:31,736 - root - INFO - CUDA memory usage for model: 3.56GiB(4.49%)
270
+ [titan] 2026-01-06 20:23:31,786 - root - INFO - Checkpointing active. Checkpoints will be loaded from and saved to /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/checkpoint
271
+ [titan] 2026-01-06 20:23:31,786 - root - INFO - Loading the checkpoint from /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/checkpoint/step-1.
272
+ [titan] 2026-01-06 20:24:20,481 - root - INFO - [GC] GC collection for checkpoint loading. 0.03 seconds.
273
+ [titan] 2026-01-06 20:24:20,481 - root - INFO - Finished loading the checkpoint in 48.69 seconds.
274
+ [titan] 2026-01-06 20:24:25,312 - root - ERROR - Failed to create WandB logger: No API key configured. Use `wandb login` to log in.
275
+ [titan] 2026-01-06 20:24:25,344 - root - INFO - CUDA capacity: NVIDIA A100-SXM4-80GB with 79.25GiB memory
276
+ [titan] 2026-01-06 20:24:25,346 - root - WARNING - Error running lspci: [Errno 2] No such file or directory: 'lspci', fallback to use device_name
277
+ [titan] 2026-01-06 20:24:27,453 - root - INFO - Mixed precision training is handled by fully_shard
278
+ [titan] 2026-01-06 20:24:27,454 - root - INFO - ***** Running training *****
279
+ [titan] 2026-01-06 20:24:27,454 - root - INFO -  Training starts at step 2
280
+ [titan] 2026-01-06 20:24:27,454 - root - INFO -  Number of tokens per sequence = 2,048
281
+ [titan] 2026-01-06 20:24:27,454 - root - INFO -  Gradient Accumulation steps = 16
282
+ [titan] 2026-01-06 20:24:27,454 - root - INFO -  Instantaneous batch size (per device) = 2
283
+ [titan] 2026-01-06 20:24:27,454 - root - INFO -  Global batch size (w. parallel, distributed & accumulation) = 256 (524,288 tokens)
284
+ [titan] 2026-01-06 20:24:27,454 - root - INFO -  Total optimization steps = 3,072 (1,610,612,736 tokens)
285
+ [titan] 2026-01-06 20:24:27,454 - root - INFO -  Warmup steps = 1,024 (536,870,912 tokens)
286
+ [titan] 2026-01-06 20:24:27,454 - root - INFO -  Number of parameters = 14,409,815,040 
287
+ [titan] 2026-01-06 20:24:27,454 - root - INFO - Profiling active. Traces will be saved at /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/profile_trace
288
+ /mnt/scratch/home/carlos/leman-labs/flame/.venv/lib/python3.13/site-packages/torch/_dynamo/variables/functions.py:1692: UserWarning: Dynamo detected a call to a `functools.lru_cache`-wrapped function. Dynamo ignores the cache wrapper and directly traces the wrapped function. Silent incorrectness is only a *potential* risk, not something we have observed. Enable TORCH_LOGS="+dynamo" for a DEBUG stack trace.
289
+ torch._dynamo.utils.warn_once(msg)
290
+ /mnt/scratch/home/carlos/leman-labs/flame/.venv/lib/python3.13/site-packages/torch/_dynamo/variables/functions.py:1598: UserWarning: Dynamo does not know how to trace the builtin `cuda_utils.get_device_properties.` This function is either a Python builtin (e.g. _warnings.warn) or a third-party C/C++ Python extension (perhaps created with pybind).
291
+ If it is a Python builtin, please file an issue on GitHub so the PyTorch team can add support for it and see the next case for a workaround.
292
+ If it is a third-party C/C++ Python extension, please either wrap it into a PyTorch-understood custom operator (see https://pytorch.org/tutorials/advanced/custom_ops_landing_page.html for more details) or, if it is traceable, use `torch.compiler.allow_in_graph`.
293
+ torch._dynamo.utils.warn_once(explanation + "\n" + "\n".join(hints))
294
+ [titan] 2026-01-06 20:31:17,558 - root - INFO - step: 2 loss: 14.3989 memory: 71.94GiB(90.77%) tps: 159 tflops: 14.55 mfu: 4.66%
295
+ [titan] 2026-01-06 20:31:17,558 - root - INFO - lr: 1.1719e-06 gnorm: 127.00 [ 0:13:56<14 days, 20:49:44]
296
+ [titan] 2026-01-06 20:31:58,854 - root - INFO - step: 3 loss: 14.3925 memory: 71.94GiB(90.77%) tps: 1,587 tflops: 145.20 mfu: 46.54%
297
+ [titan] 2026-01-06 20:31:58,854 - root - INFO - lr: 1.5625e-06 gnorm: 126.00 [ 0:14:38<10 days, 9:32:35]
298
+ [titan] 2026-01-06 20:32:40,204 - root - INFO - step: 4 loss: 14.2932 memory: 71.94GiB(90.77%) tps: 1,585 tflops: 145.01 mfu: 46.48%
299
+ [titan] 2026-01-06 20:32:40,204 - root - INFO - lr: 1.9531e-06 gnorm: 125.50 [ 0:15:19<8 days, 3:54:22]
300
+ [titan] 2026-01-06 20:33:21,589 - root - INFO - step: 5 loss: 14.2679 memory: 71.94GiB(90.77%) tps: 1,584 tflops: 144.88 mfu: 46.44%
301
+ [titan] 2026-01-06 20:33:21,589 - root - INFO - lr: 2.3438e-06 gnorm: 123.50 [ 0:16:00<6 days, 19:43:31]
302
+ [titan] 2026-01-06 20:34:03,035 - root - INFO - step: 6 loss: 13.9921 memory: 71.94GiB(90.77%) tps: 1,581 tflops: 144.67 mfu: 46.37%
303
+ [titan] 2026-01-06 20:34:03,035 - root - INFO - lr: 2.7344e-06 gnorm: 117.50 [ 0:16:42<5 days, 22:16:34]
304
+ [titan] 2026-01-06 20:34:44,524 - root - INFO - step: 7 loss: 13.8102 memory: 71.94GiB(90.77%) tps: 1,580 tflops: 144.52 mfu: 46.32%
305
+ [titan] 2026-01-06 20:34:44,524 - root - INFO - lr: 3.1250e-06 gnorm: 112.50 [ 0:17:23<5 days, 6:57:26]
306
+ [titan] 2026-01-06 20:35:25,989 - root - INFO - step: 8 loss: 13.5609 memory: 71.94GiB(90.77%) tps: 1,581 tflops: 144.60 mfu: 46.35%
307
+ [titan] 2026-01-06 20:35:25,990 - root - INFO - lr: 3.5156e-06 gnorm: 106.50 [ 0:18:05<4 days, 19:27:46]
308
+ [titan] 2026-01-06 20:36:07,480 - root - INFO - step: 9 loss: 13.3683 memory: 71.94GiB(90.77%) tps: 1,580 tflops: 144.52 mfu: 46.32%
309
+ [titan] 2026-01-06 20:36:07,480 - root - INFO - lr: 3.9063e-06 gnorm: 101.00 [ 0:18:46<4 days, 10:31:21]
310
+ [titan] 2026-01-06 20:36:48,975 - root - INFO - step: 10 loss: 13.1018 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.50 mfu: 46.32%
311
+ [titan] 2026-01-06 20:36:48,975 - root - INFO - lr: 4.2969e-06 gnorm: 94.00 [ 0:19:28<4 days, 3:22:06]
312
+ [titan] 2026-01-06 20:37:30,471 - root - INFO - step: 11 loss: 12.5407 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.50 mfu: 46.31%
313
+ [titan] 2026-01-06 20:37:30,471 - root - INFO - lr: 4.6875e-06 gnorm: 82.00 [ 0:20:09<3 days, 21:30:46]
314
+ [titan] 2026-01-06 20:38:11,960 - root - INFO - step: 12 loss: 12.0106 memory: 71.94GiB(90.77%) tps: 1,580 tflops: 144.52 mfu: 46.32%
315
+ [titan] 2026-01-06 20:38:11,961 - root - INFO - lr: 5.0781e-06 gnorm: 71.50 [ 0:20:51<3 days, 16:37:51]
316
+ [titan] 2026-01-06 20:38:53,462 - root - INFO - step: 13 loss: 11.5957 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.48 mfu: 46.31%
317
+ [titan] 2026-01-06 20:38:53,463 - root - INFO - lr: 5.4687e-06 gnorm: 68.00 [ 0:21:32<3 days, 12:29:56]
318
+ [titan] 2026-01-06 20:39:34,955 - root - INFO - step: 14 loss: 11.2380 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.51 mfu: 46.32%
319
+ [titan] 2026-01-06 20:39:34,955 - root - INFO - lr: 5.8594e-06 gnorm: 63.25 [ 0:22:14<3 days, 8:57:19]
320
+ [titan] 2026-01-06 20:40:16,457 - root - INFO - step: 15 loss: 10.9153 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.48 mfu: 46.31%
321
+ [titan] 2026-01-06 20:40:16,457 - root - INFO - lr: 6.2500e-06 gnorm: 55.50 [ 0:22:55<3 days, 5:52:59]
322
+ [titan] 2026-01-06 20:40:57,974 - root - INFO - step: 16 loss: 10.6864 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.42 mfu: 46.29%
323
+ [titan] 2026-01-06 20:40:57,974 - root - INFO - lr: 6.6406e-06 gnorm: 57.00 [ 0:23:37<3 days, 3:11:38]
324
+ [titan] 2026-01-06 20:40:57,974 - root - INFO - Saving the checkpoint (or staging if async is enabled).
325
+ [titan] 2026-01-06 20:41:27,039 - root - INFO - [GC] GC collection invoked by checkpointer. 0.59 seconds.
326
+ [titan] 2026-01-06 20:41:27,039 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 29.07 seconds.
327
+ [titan] 2026-01-06 20:42:08,985 - root - INFO - step: 17 loss: 10.3828 memory: 71.95GiB(90.78%) tps: 923 tflops: 84.44 mfu: 27.06%
328
+ [titan] 2026-01-06 20:42:08,986 - root - INFO - lr: 7.0313e-06 gnorm: 42.50 [ 0:24:48<3 days, 2:17:33]
329
+ [titan] 2026-01-06 20:42:50,422 - root - INFO - step: 18 loss: 10.1659 memory: 71.95GiB(90.78%) tps: 1,582 tflops: 144.71 mfu: 46.38%
330
+ [titan] 2026-01-06 20:42:50,422 - root - INFO - lr: 7.4219e-06 gnorm: 32.50 [ 0:25:29<3 days, 0:05:42]
331
+ [titan] 2026-01-06 20:43:31,924 - root - INFO - step: 19 loss: 9.9749 memory: 71.95GiB(90.78%) tps: 1,579 tflops: 144.48 mfu: 46.31%
332
+ [titan] 2026-01-06 20:43:31,925 - root - INFO - lr: 7.8125e-06 gnorm: 26.88 [ 0:26:11<2 days, 22:07:50]
333
+ [titan] 2026-01-06 20:44:13,451 - root - INFO - step: 20 loss: 9.8084 memory: 71.95GiB(90.78%) tps: 1,578 tflops: 144.39 mfu: 46.28%
334
+ [titan] 2026-01-06 20:44:13,451 - root - INFO - lr: 8.2031e-06 gnorm: 25.62 [ 0:26:52<2 days, 20:21:45]
335
+ [titan] 2026-01-06 20:44:54,968 - root - INFO - step: 21 loss: 9.6201 memory: 71.95GiB(90.78%) tps: 1,579 tflops: 144.43 mfu: 46.29%
336
+ [titan] 2026-01-06 20:44:54,968 - root - INFO - lr: 8.5938e-06 gnorm: 26.88 [ 0:27:34<2 days, 18:45:40]
337
+ [titan] 2026-01-06 20:45:36,491 - root - INFO - step: 22 loss: 9.4905 memory: 71.95GiB(90.78%) tps: 1,578 tflops: 144.40 mfu: 46.28%
338
+ [titan] 2026-01-06 20:45:36,491 - root - INFO - lr: 8.9844e-06 gnorm: 25.50 [ 0:28:15<2 days, 17:18:17]
339
+ [titan] 2026-01-06 20:46:18,035 - root - INFO - step: 23 loss: 9.2526 memory: 71.95GiB(90.78%) tps: 1,578 tflops: 144.33 mfu: 46.26%
340
+ [titan] 2026-01-06 20:46:18,035 - root - INFO - lr: 9.3750e-06 gnorm: 19.12 [ 0:28:57<2 days, 15:58:29]
341
+ [titan] 2026-01-06 20:46:59,563 - root - INFO - step: 24 loss: 9.0528 memory: 71.95GiB(90.78%) tps: 1,578 tflops: 144.39 mfu: 46.28%
342
+ [titan] 2026-01-06 20:46:59,563 - root - INFO - lr: 9.7656e-06 gnorm: 17.00 [ 0:29:38<2 days, 14:45:14]
343
+ [titan] 2026-01-06 20:47:41,099 - root - INFO - step: 25 loss: 8.8601 memory: 71.95GiB(90.78%) tps: 1,578 tflops: 144.36 mfu: 46.27%
344
+ [titan] 2026-01-06 20:47:41,099 - root - INFO - lr: 1.0156e-05 gnorm: 14.06 [ 0:30:20<2 days, 13:37:49]
345
+ [titan] 2026-01-06 20:48:22,630 - root - INFO - step: 26 loss: 8.7360 memory: 71.95GiB(90.78%) tps: 1,578 tflops: 144.38 mfu: 46.27%
346
+ [titan] 2026-01-06 20:48:22,630 - root - INFO - lr: 1.0547e-05 gnorm: 15.44 [ 0:31:01<2 days, 12:35:31]
347
+ [titan] 2026-01-06 20:49:04,178 - root - INFO - step: 27 loss: 8.6182 memory: 71.95GiB(90.78%) tps: 1,577 tflops: 144.31 mfu: 46.25%
348
+ [titan] 2026-01-06 20:49:04,179 - root - INFO - lr: 1.0937e-05 gnorm: 10.25 [ 0:31:43<2 days, 11:37:49]
349
+ [titan] 2026-01-06 20:49:45,725 - root - INFO - step: 28 loss: 8.5142 memory: 71.95GiB(90.78%) tps: 1,577 tflops: 144.32 mfu: 46.26%
350
+ [titan] 2026-01-06 20:49:45,725 - root - INFO - lr: 1.1328e-05 gnorm: 9.00 [ 0:32:25<2 days, 10:44:11]
351
+ [titan] 2026-01-06 20:50:27,274 - root - INFO - step: 29 loss: 8.4770 memory: 71.95GiB(90.78%) tps: 1,577 tflops: 144.31 mfu: 46.25%
352
+ [titan] 2026-01-06 20:50:27,275 - root - INFO - lr: 1.1719e-05 gnorm: 9.44 [ 0:33:06<2 days, 9:54:12]
353
+ [titan] 2026-01-06 20:51:08,813 - root - INFO - step: 30 loss: 8.3888 memory: 71.95GiB(90.78%) tps: 1,578 tflops: 144.35 mfu: 46.27%
354
+ [titan] 2026-01-06 20:51:08,813 - root - INFO - lr: 1.2109e-05 gnorm: 7.06 [ 0:33:48<2 days, 9:07:29]
355
+ [titan] 2026-01-06 20:51:50,370 - root - INFO - step: 31 loss: 8.3098 memory: 71.95GiB(90.78%) tps: 1,577 tflops: 144.29 mfu: 46.25%
356
+ [titan] 2026-01-06 20:51:50,370 - root - INFO - lr: 1.2500e-05 gnorm: 5.38 [ 0:34:29<2 days, 8:23:47]
357
+ [titan] 2026-01-06 20:52:31,910 - root - INFO - step: 32 loss: 8.2507 memory: 71.95GiB(90.78%) tps: 1,578 tflops: 144.35 mfu: 46.26%
358
+ [titan] 2026-01-06 20:52:31,910 - root - INFO - lr: 1.2891e-05 gnorm: 6.97 [ 0:35:11<2 days, 7:42:43]
359
+ [titan] 2026-01-06 20:52:31,910 - root - INFO - Saving the checkpoint (or staging if async is enabled).
360
+ [titan] 2026-01-06 20:52:52,181 - root - INFO - [GC] GC collection invoked by checkpointer. 0.19 seconds.
361
+ [titan] 2026-01-06 20:52:52,181 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 20.27 seconds.
362
+ [titan] 2026-01-06 20:53:33,590 - root - INFO - step: 33 loss: 8.1782 memory: 71.95GiB(90.78%) tps: 1,063 tflops: 97.21 mfu: 31.16%
363
+ [titan] 2026-01-06 20:53:33,591 - root - INFO - lr: 1.3281e-05 gnorm: 4.94 [ 0:36:12<2 days, 7:35:02]
364
+ [titan] 2026-01-06 20:54:15,059 - root - INFO - step: 34 loss: 8.1399 memory: 71.95GiB(90.78%) tps: 1,580 tflops: 144.59 mfu: 46.34%
365
+ [titan] 2026-01-06 20:54:15,059 - root - INFO - lr: 1.3672e-05 gnorm: 4.62 [ 0:36:54<2 days, 6:57:38]
366
+ [titan] 2026-01-06 20:54:56,546 - root - INFO - step: 35 loss: 8.1046 memory: 71.95GiB(90.78%) tps: 1,580 tflops: 144.53 mfu: 46.32%
367
+ [titan] 2026-01-06 20:54:56,546 - root - INFO - lr: 1.4063e-05 gnorm: 4.69 [ 0:37:35<2 days, 6:22:22]
368
+ [titan] 2026-01-06 20:55:38,070 - root - INFO - step: 36 loss: 8.0122 memory: 71.95GiB(90.78%) tps: 1,578 tflops: 144.40 mfu: 46.28%
369
+ [titan] 2026-01-06 20:55:38,070 - root - INFO - lr: 1.4453e-05 gnorm: 2.75 [ 0:38:17<2 days, 5:49:03]
370
+ [titan] 2026-01-06 20:56:19,603 - root - INFO - step: 37 loss: 8.0874 memory: 71.95GiB(90.78%) tps: 1,578 tflops: 144.37 mfu: 46.27%
371
+ [titan] 2026-01-06 20:56:19,603 - root - INFO - lr: 1.4844e-05 gnorm: 4.84 [ 0:38:58<2 days, 5:17:32]
372
+ [titan] 2026-01-06 20:57:01,137 - root - INFO - step: 38 loss: 8.0173 memory: 71.95GiB(90.78%) tps: 1,578 tflops: 144.37 mfu: 46.27%
373
+ [titan] 2026-01-06 20:57:01,137 - root - INFO - lr: 1.5234e-05 gnorm: 3.98 [ 0:39:40<2 days, 4:47:38]
374
+ [titan] 2026-01-06 20:57:42,670 - root - INFO - step: 39 loss: 8.0002 memory: 71.95GiB(90.78%) tps: 1,578 tflops: 144.37 mfu: 46.27%
375
+ [titan] 2026-01-06 20:57:42,671 - root - INFO - lr: 1.5625e-05 gnorm: 3.81 [ 0:40:21<2 days, 4:19:13]
376
+ [titan] 2026-01-06 20:58:24,204 - root - INFO - step: 40 loss: 7.9606 memory: 71.95GiB(90.78%) tps: 1,578 tflops: 144.37 mfu: 46.27%
377
+ [titan] 2026-01-06 20:58:24,204 - root - INFO - lr: 1.6016e-05 gnorm: 2.86 [ 0:41:03<2 days, 3:52:12]
378
+ [titan] 2026-01-06 20:59:05,739 - root - INFO - step: 41 loss: 7.9773 memory: 71.95GiB(90.78%) tps: 1,578 tflops: 144.36 mfu: 46.27%
379
+ [titan] 2026-01-06 20:59:05,739 - root - INFO - lr: 1.6406e-05 gnorm: 3.56 [ 0:41:45<2 days, 3:26:28]
380
+ [titan] 2026-01-06 20:59:47,255 - root - INFO - step: 42 loss: 7.9890 memory: 71.95GiB(90.78%) tps: 1,579 tflops: 144.43 mfu: 46.29%
381
+ [titan] 2026-01-06 20:59:47,256 - root - INFO - lr: 1.6797e-05 gnorm: 4.75 [ 0:42:26<2 days, 3:01:54]
382
+ [titan] 2026-01-06 21:00:28,788 - root - INFO - step: 43 loss: 7.9018 memory: 71.95GiB(90.78%) tps: 1,578 tflops: 144.37 mfu: 46.27%
383
+ [titan] 2026-01-06 21:00:28,789 - root - INFO - lr: 1.7188e-05 gnorm: 3.48 [ 0:43:08<2 days, 2:38:28]
384
+ [titan] 2026-01-06 21:01:10,328 - root - INFO - step: 44 loss: 7.8441 memory: 71.95GiB(90.78%) tps: 1,578 tflops: 144.35 mfu: 46.26%
385
+ [titan] 2026-01-06 21:01:10,328 - root - INFO - lr: 1.7578e-05 gnorm: 3.89 [ 0:43:49<2 days, 2:16:05]
386
+ [titan] 2026-01-06 21:01:51,869 - root - INFO - step: 45 loss: 7.8679 memory: 71.95GiB(90.78%) tps: 1,578 tflops: 144.34 mfu: 46.26%
387
+ [titan] 2026-01-06 21:01:51,869 - root - INFO - lr: 1.7969e-05 gnorm: 6.41 [ 0:44:31<2 days, 1:54:39]
388
+ [titan] 2026-01-06 21:02:33,408 - root - INFO - step: 46 loss: 7.7830 memory: 71.95GiB(90.78%) tps: 1,578 tflops: 144.35 mfu: 46.26%
389
+ [titan] 2026-01-06 21:02:33,409 - root - INFO - lr: 1.8359e-05 gnorm: 3.52 [ 0:45:12<2 days, 1:34:07]
390
+ [titan] 2026-01-06 21:03:14,961 - root - INFO - step: 47 loss: 7.8372 memory: 71.95GiB(90.78%) tps: 1,577 tflops: 144.30 mfu: 46.25%
391
+ [titan] 2026-01-06 21:03:14,962 - root - INFO - lr: 1.8750e-05 gnorm: 2.22 [ 0:45:54<2 days, 1:14:27]
392
+ [titan] 2026-01-06 21:03:56,497 - root - INFO - step: 48 loss: 7.8147 memory: 71.95GiB(90.78%) tps: 1,578 tflops: 144.36 mfu: 46.27%
393
+ [titan] 2026-01-06 21:03:56,497 - root - INFO - lr: 1.9141e-05 gnorm: 3.70 [ 0:46:35<2 days, 0:55:34]
394
+ [titan] 2026-01-06 21:03:56,497 - root - INFO - Saving the checkpoint (or staging if async is enabled).
395
+ [titan] 2026-01-06 21:04:16,545 - root - INFO - [GC] GC collection invoked by checkpointer. 0.16 seconds.
396
+ [titan] 2026-01-06 21:04:16,545 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 20.05 seconds.
397
+ [titan] 2026-01-06 21:04:57,970 - root - INFO - step: 49 loss: 7.6970 memory: 71.95GiB(90.78%) tps: 1,066 tflops: 97.54 mfu: 31.26%
398
+ [titan] 2026-01-06 21:04:57,970 - root - INFO - lr: 1.9531e-05 gnorm: 5.28 [ 0:47:37<2 days, 0:57:54]
399
+ [titan] 2026-01-06 21:04:57,982 - root - INFO - [GC] Peforming periodical GC collection. 0.01 seconds.
400
+ [titan] 2026-01-06 21:05:39,421 - root - INFO - step: 50 loss: 7.7536 memory: 71.95GiB(90.78%) tps: 1,581 tflops: 144.65 mfu: 46.36%
401
+ [titan] 2026-01-06 21:05:39,422 - root - INFO - lr: 1.9922e-05 gnorm: 4.06 [ 0:48:18<2 days, 0:39:57]
402
+ [titan] 2026-01-06 21:06:20,891 - root - INFO - step: 51 loss: 7.7578 memory: 71.95GiB(90.78%) tps: 1,580 tflops: 144.59 mfu: 46.34%
403
+ [titan] 2026-01-06 21:06:20,891 - root - INFO - lr: 2.0313e-05 gnorm: 5.03 [ 0:49:00<2 days, 0:22:41]
404
+ [titan] 2026-01-06 21:07:02,402 - root - INFO - step: 52 loss: 7.7586 memory: 71.95GiB(90.78%) tps: 1,579 tflops: 144.45 mfu: 46.30%
405
+ [titan] 2026-01-06 21:07:02,402 - root - INFO - lr: 2.0703e-05 gnorm: 2.52 [ 0:49:41<2 days, 0:06:06]
406
+ [titan] 2026-01-06 21:07:43,930 - root - INFO - step: 53 loss: 7.7823 memory: 71.95GiB(90.78%) tps: 1,578 tflops: 144.38 mfu: 46.28%
407
+ [titan] 2026-01-06 21:07:43,931 - root - INFO - lr: 2.1094e-05 gnorm: 11.69 [ 0:50:23<1 day, 23:50:08]
408
+ [titan] 2026-01-06 21:08:25,460 - root - INFO - step: 54 loss: 7.7454 memory: 71.95GiB(90.78%) tps: 1,578 tflops: 144.38 mfu: 46.28%
409
+ [titan] 2026-01-06 21:08:25,460 - root - INFO - lr: 2.1484e-05 gnorm: 10.25 [ 0:51:04<1 day, 23:34:44]
410
+ [titan] 2026-01-06 21:09:07,002 - root - INFO - step: 55 loss: 7.6959 memory: 71.95GiB(90.78%) tps: 1,578 tflops: 144.34 mfu: 46.26%
411
+ [titan] 2026-01-06 21:09:07,002 - root - INFO - lr: 2.1875e-05 gnorm: 3.77 [ 0:51:46<1 day, 23:19:53]
412
+ [titan] 2026-01-06 21:09:48,535 - root - INFO - step: 56 loss: 7.7100 memory: 71.95GiB(90.78%) tps: 1,578 tflops: 144.37 mfu: 46.27%
413
+ [titan] 2026-01-06 21:09:48,536 - root - INFO - lr: 2.2266e-05 gnorm: 5.50 [ 0:52:27<1 day, 23:05:32]
414
+ [titan] 2026-01-06 21:10:30,084 - root - INFO - step: 57 loss: 7.6427 memory: 71.95GiB(90.78%) tps: 1,577 tflops: 144.32 mfu: 46.26%
415
+ [titan] 2026-01-06 21:10:30,084 - root - INFO - lr: 2.2656e-05 gnorm: 3.45 [ 0:53:09<1 day, 22:51:40]
416
+ [titan] 2026-01-06 21:11:11,627 - root - INFO - step: 58 loss: 7.7081 memory: 71.95GiB(90.78%) tps: 1,578 tflops: 144.33 mfu: 46.26%
417
+ [titan] 2026-01-06 21:11:11,628 - root - INFO - lr: 2.3047e-05 gnorm: 7.88 [ 0:53:50<1 day, 22:38:15]
418
+ [titan] 2026-01-06 21:11:53,169 - root - INFO - step: 59 loss: 7.6955 memory: 71.95GiB(90.78%) tps: 1,578 tflops: 144.34 mfu: 46.26%
419
+ [titan] 2026-01-06 21:11:53,170 - root - INFO - lr: 2.3438e-05 gnorm: 7.16 [ 0:54:32<1 day, 22:25:16]
420
+ [titan] 2026-01-06 21:12:34,708 - root - INFO - step: 60 loss: 7.6458 memory: 71.95GiB(90.78%) tps: 1,578 tflops: 144.35 mfu: 46.27%
421
+ [titan] 2026-01-06 21:12:34,709 - root - INFO - lr: 2.3828e-05 gnorm: 3.22 [ 0:55:13<1 day, 22:12:41]
422
+ [titan] 2026-01-06 21:13:16,244 - root - INFO - step: 61 loss: 7.6709 memory: 71.95GiB(90.78%) tps: 1,578 tflops: 144.36 mfu: 46.27%
423
+ [titan] 2026-01-06 21:13:16,244 - root - INFO - lr: 2.4219e-05 gnorm: 7.56 [ 0:55:55<1 day, 22:00:30]
424
+ [titan] 2026-01-06 21:13:57,793 - root - INFO - step: 62 loss: 7.6777 memory: 71.95GiB(90.78%) tps: 1,577 tflops: 144.31 mfu: 46.25%
425
+ [titan] 2026-01-06 21:13:57,793 - root - INFO - lr: 2.4609e-05 gnorm: 5.00 [ 0:56:37<1 day, 21:48:42]
426
+ [titan] 2026-01-06 21:14:39,339 - root - INFO - step: 63 loss: 7.6421 memory: 71.95GiB(90.78%) tps: 1,577 tflops: 144.32 mfu: 46.26%
427
+ [titan] 2026-01-06 21:14:39,340 - root - INFO - lr: 2.5000e-05 gnorm: 6.81 [ 0:57:18<1 day, 21:37:14]
428
+ [titan] 2026-01-06 21:15:20,872 - root - INFO - step: 64 loss: 7.6401 memory: 71.95GiB(90.78%) tps: 1,578 tflops: 144.37 mfu: 46.27%
429
+ [titan] 2026-01-06 21:15:20,872 - root - INFO - lr: 2.5391e-05 gnorm: 6.72 [ 0:58:00<1 day, 21:26:06]
430
+ [titan] 2026-01-06 21:15:20,872 - root - INFO - Saving the checkpoint (or staging if async is enabled).
431
+ [titan] 2026-01-06 21:15:41,914 - root - INFO - [GC] GC collection invoked by checkpointer. 0.15 seconds.
432
+ [titan] 2026-01-06 21:15:41,915 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 21.04 seconds.
433
+ [titan] 2026-01-06 21:16:23,249 - root - INFO - step: 65 loss: 7.6475 memory: 71.95GiB(90.78%) tps: 1,051 tflops: 96.13 mfu: 30.81%
434
+ [titan] 2026-01-06 21:16:23,249 - root - INFO - lr: 2.5781e-05 gnorm: 5.00 [ 0:59:02<1 day, 21:31:22]
435
+ [titan] 2026-01-06 21:17:04,689 - root - INFO - step: 66 loss: 7.7008 memory: 71.95GiB(90.78%) tps: 1,581 tflops: 144.69 mfu: 46.38%
436
+ [titan] 2026-01-06 21:17:04,689 - root - INFO - lr: 2.6172e-05 gnorm: 9.69 [ 0:59:43<1 day, 21:20:33]
437
+ [titan] 2026-01-06 21:17:46,152 - root - INFO - step: 67 loss: 7.6772 memory: 71.95GiB(90.78%) tps: 1,581 tflops: 144.61 mfu: 46.35%
438
+ [titan] 2026-01-06 21:17:46,153 - root - INFO - lr: 2.6563e-05 gnorm: 8.06 [ 1:00:25<1 day, 21:10:02]
439
+ [titan] 2026-01-06 21:18:27,650 - root - INFO - step: 68 loss: 7.6251 memory: 71.95GiB(90.78%) tps: 1,579 tflops: 144.49 mfu: 46.31%
440
+ [titan] 2026-01-06 21:18:27,651 - root - INFO - lr: 2.6953e-05 gnorm: 7.88 [ 1:01:06<1 day, 20:59:51]
441
+ [titan] 2026-01-06 21:19:09,166 - root - INFO - step: 69 loss: 7.6183 memory: 71.95GiB(90.78%) tps: 1,579 tflops: 144.43 mfu: 46.29%
442
+ [titan] 2026-01-06 21:19:09,166 - root - INFO - lr: 2.7344e-05 gnorm: 4.00 [ 1:01:48<1 day, 20:49:57]
443
+ [titan] 2026-01-06 21:19:50,686 - root - INFO - step: 70 loss: 7.6535 memory: 71.95GiB(90.78%) tps: 1,578 tflops: 144.41 mfu: 46.29%
444
+ [titan] 2026-01-06 21:19:50,686 - root - INFO - lr: 2.7734e-05 gnorm: 17.75 [ 1:02:29<1 day, 20:40:19]
445
+ [titan] 2026-01-06 21:20:32,220 - root - INFO - step: 71 loss: 7.6713 memory: 71.95GiB(90.78%) tps: 1,578 tflops: 144.37 mfu: 46.27%
446
+ [titan] 2026-01-06 21:20:32,221 - root - INFO - lr: 2.8125e-05 gnorm: 15.69 [ 1:03:11<1 day, 20:30:57]
447
+ [titan] 2026-01-06 21:21:13,759 - root - INFO - step: 72 loss: 7.5969 memory: 71.95GiB(90.78%) tps: 1,578 tflops: 144.35 mfu: 46.27%
448
+ [titan] 2026-01-06 21:21:13,759 - root - INFO - lr: 2.8516e-05 gnorm: 5.00 [ 1:03:53<1 day, 20:21:49]
449
+ [titan] 2026-01-06 21:21:55,296 - root - INFO - step: 73 loss: 7.6514 memory: 71.95GiB(90.78%) tps: 1,578 tflops: 144.35 mfu: 46.27%
450
+ [titan] 2026-01-06 21:21:55,296 - root - INFO - lr: 2.8906e-05 gnorm: 7.84 [ 1:04:34<1 day, 20:12:55]
451
+ [titan] 2026-01-06 21:22:36,834 - root - INFO - step: 74 loss: 7.6118 memory: 71.95GiB(90.78%) tps: 1,578 tflops: 144.35 mfu: 46.27%
452
+ [titan] 2026-01-06 21:22:36,834 - root - INFO - lr: 2.9297e-05 gnorm: 5.53 [ 1:05:16<1 day, 20:04:15]
453
+ [titan] 2026-01-06 21:23:18,373 - root - INFO - step: 75 loss: 7.6545 memory: 71.95GiB(90.78%) tps: 1,578 tflops: 144.35 mfu: 46.26%
454
+ [titan] 2026-01-06 21:23:18,374 - root - INFO - lr: 2.9687e-05 gnorm: 14.88 [ 1:05:57<1 day, 19:55:47]
455
+ [titan] 2026-01-06 21:23:59,909 - root - INFO - step: 76 loss: 7.6091 memory: 71.95GiB(90.78%) tps: 1,578 tflops: 144.36 mfu: 46.27%
456
+ [titan] 2026-01-06 21:23:59,909 - root - INFO - lr: 3.0078e-05 gnorm: 15.25 [ 1:06:39<1 day, 19:47:31]
457
+ [titan] 2026-01-06 21:24:41,441 - root - INFO - step: 77 loss: 7.5815 memory: 71.95GiB(90.78%) tps: 1,578 tflops: 144.37 mfu: 46.27%
458
+ [titan] 2026-01-06 21:24:41,442 - root - INFO - lr: 3.0469e-05 gnorm: 4.84 [ 1:07:20<1 day, 19:39:27]
459
+ [titan] 2026-01-06 21:25:22,982 - root - INFO - step: 78 loss: 7.6119 memory: 71.95GiB(90.78%) tps: 1,578 tflops: 144.34 mfu: 46.26%
460
+ [titan] 2026-01-06 21:25:22,983 - root - INFO - lr: 3.0859e-05 gnorm: 9.06 [ 1:08:02<1 day, 19:31:35]
461
+ [titan] 2026-01-06 21:26:04,516 - root - INFO - step: 79 loss: 7.6418 memory: 71.95GiB(90.78%) tps: 1,578 tflops: 144.37 mfu: 46.27%
462
+ [titan] 2026-01-06 21:26:04,516 - root - INFO - lr: 3.1250e-05 gnorm: 8.25 [ 1:08:43<1 day, 19:23:53]
463
+ [titan] 2026-01-06 21:26:46,049 - root - INFO - step: 80 loss: 7.5575 memory: 71.95GiB(90.78%) tps: 1,578 tflops: 144.37 mfu: 46.27%
464
+ [titan] 2026-01-06 21:26:46,050 - root - INFO - lr: 3.1641e-05 gnorm: 6.97 [ 1:09:25<1 day, 19:16:22]
465
+ [titan] 2026-01-06 21:26:46,050 - root - INFO - Saving the checkpoint (or staging if async is enabled).
466
+ [titan] 2026-01-06 21:27:08,297 - root - INFO - [GC] GC collection invoked by checkpointer. 0.17 seconds.
467
+ [titan] 2026-01-06 21:27:08,298 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 22.25 seconds.
468
+ [titan] 2026-01-06 21:27:49,686 - root - INFO - step: 81 loss: 7.6005 memory: 71.95GiB(90.78%) tps: 1,030 tflops: 94.22 mfu: 30.20%
469
+ [titan] 2026-01-06 21:27:49,686 - root - INFO - lr: 3.2031e-05 gnorm: 7.19 [ 1:10:28<1 day, 19:22:37]
470
+ [titan] 2026-01-06 21:28:31,108 - root - INFO - step: 82 loss: 7.5774 memory: 71.95GiB(90.78%) tps: 1,582 tflops: 144.76 mfu: 46.40%
471
+ [titan] 2026-01-06 21:28:31,108 - root - INFO - lr: 3.2422e-05 gnorm: 5.62 [ 1:11:10<1 day, 19:15:12]
472
+ [titan] 2026-01-06 21:29:12,555 - root - INFO - step: 83 loss: 7.6207 memory: 71.95GiB(90.78%) tps: 1,581 tflops: 144.67 mfu: 46.37%
473
+ [titan] 2026-01-06 21:29:12,555 - root - INFO - lr: 3.2813e-05 gnorm: 4.69 [ 1:11:51<1 day, 19:07:57]
474
+ [titan] 2026-01-06 21:29:54,023 - root - INFO - step: 84 loss: 7.5734 memory: 71.95GiB(90.78%) tps: 1,580 tflops: 144.59 mfu: 46.34%
475
+ [titan] 2026-01-06 21:29:54,024 - root - INFO - lr: 3.3203e-05 gnorm: 10.75 [ 1:12:33<1 day, 19:00:52]
476
+ [titan] 2026-01-06 21:30:35,519 - root - INFO - step: 85 loss: 7.5241 memory: 71.95GiB(90.78%) tps: 1,579 tflops: 144.50 mfu: 46.31%
477
+ [titan] 2026-01-06 21:30:35,520 - root - INFO - lr: 3.3594e-05 gnorm: 8.69 [ 1:13:14<1 day, 18:53:57]
478
+ [titan] 2026-01-06 21:31:17,030 - root - INFO - step: 86 loss: 7.5827 memory: 71.95GiB(90.78%) tps: 1,579 tflops: 144.45 mfu: 46.30%
479
+ [titan] 2026-01-06 21:31:17,030 - root - INFO - lr: 3.3984e-05 gnorm: 7.22 [ 1:13:56<1 day, 18:47:12]
480
+ [titan] 2026-01-06 21:31:58,543 - root - INFO - step: 87 loss: 7.5505 memory: 71.95GiB(90.78%) tps: 1,579 tflops: 144.44 mfu: 46.29%
481
+ [titan] 2026-01-06 21:31:58,543 - root - INFO - lr: 3.4375e-05 gnorm: 7.91 [ 1:14:37<1 day, 18:40:35]
482
+ [titan] 2026-01-06 21:32:40,071 - root - INFO - step: 88 loss: 7.5143 memory: 71.95GiB(90.78%) tps: 1,578 tflops: 144.39 mfu: 46.28%
483
+ [titan] 2026-01-06 21:32:40,071 - root - INFO - lr: 3.4766e-05 gnorm: 8.00 [ 1:15:19<1 day, 18:34:06]
484
+ [titan] 2026-01-06 21:33:21,599 - root - INFO - step: 89 loss: 7.5199 memory: 71.95GiB(90.78%) tps: 1,578 tflops: 144.39 mfu: 46.28%
485
+ [titan] 2026-01-06 21:33:21,599 - root - INFO - lr: 3.5156e-05 gnorm: 8.62 [ 1:16:00<1 day, 18:27:45]
486
+ [titan] 2026-01-06 21:34:03,122 - root - INFO - step: 90 loss: 7.4785 memory: 71.95GiB(90.78%) tps: 1,578 tflops: 144.40 mfu: 46.28%
487
+ [titan] 2026-01-06 21:34:03,123 - root - INFO - lr: 3.5547e-05 gnorm: 8.12 [ 1:16:42<1 day, 18:21:32]
488
+ [titan] 2026-01-06 21:34:44,655 - root - INFO - step: 91 loss: 7.5003 memory: 71.95GiB(90.78%) tps: 1,578 tflops: 144.37 mfu: 46.27%
489
+ [titan] 2026-01-06 21:34:44,655 - root - INFO - lr: 3.5937e-05 gnorm: 6.97 [ 1:17:23<1 day, 18:15:26]
490
+ [titan] 2026-01-06 21:35:26,183 - root - INFO - step: 92 loss: 7.5113 memory: 71.95GiB(90.78%) tps: 1,578 tflops: 144.39 mfu: 46.28%
491
+ [titan] 2026-01-06 21:35:26,183 - root - INFO - lr: 3.6328e-05 gnorm: 10.19 [ 1:18:05<1 day, 18:09:27]
492
+ [titan] 2026-01-06 21:36:07,712 - root - INFO - step: 93 loss: 7.4875 memory: 71.95GiB(90.78%) tps: 1,578 tflops: 144.38 mfu: 46.28%
493
+ [titan] 2026-01-06 21:36:07,712 - root - INFO - lr: 3.6719e-05 gnorm: 4.59 [ 1:18:46<1 day, 18:03:35]
494
+ [titan] 2026-01-06 21:36:49,202 - root - INFO - step: 94 loss: 7.8691 memory: 71.95GiB(90.78%) tps: 1,580 tflops: 144.52 mfu: 46.32%
495
+ [titan] 2026-01-06 21:36:49,202 - root - INFO - lr: 3.7109e-05 gnorm: 86.50 [ 1:19:28<1 day, 17:57:48]
496
+ [titan] 2026-01-06 21:37:30,710 - root - INFO - step: 95 loss: 7.7993 memory: 71.95GiB(90.78%) tps: 1,579 tflops: 144.45 mfu: 46.30%
497
+ [titan] 2026-01-06 21:37:30,710 - root - INFO - lr: 3.7500e-05 gnorm: 62.50 [ 1:20:09<1 day, 17:52:09]
498
+ [titan] 2026-01-06 21:38:12,247 - root - INFO - step: 96 loss: 7.6230 memory: 71.95GiB(90.78%) tps: 1,578 tflops: 144.35 mfu: 46.27%
499
+ [titan] 2026-01-06 21:38:12,248 - root - INFO - lr: 3.7891e-05 gnorm: 17.38 [ 1:20:51<1 day, 17:46:36]
500
+ [titan] 2026-01-06 21:38:12,248 - root - INFO - Saving the checkpoint (or staging if async is enabled).
501
+ [titan] 2026-01-06 21:38:32,910 - root - INFO - [GC] GC collection invoked by checkpointer. 0.16 seconds.
502
+ [titan] 2026-01-06 21:38:32,910 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 20.66 seconds.
503
+ [titan] 2026-01-06 21:39:14,269 - root - INFO - step: 97 loss: 7.5778 memory: 71.95GiB(90.78%) tps: 1,057 tflops: 96.68 mfu: 30.99%
504
+ [titan] 2026-01-06 21:39:14,269 - root - INFO - lr: 3.8281e-05 gnorm: 17.75 [ 1:21:53<1 day, 17:51:38]
505
+ [titan] 2026-01-06 21:39:55,690 - root - INFO - step: 98 loss: 7.5438 memory: 71.95GiB(90.78%) tps: 1,582 tflops: 144.76 mfu: 46.40%
506
+ [titan] 2026-01-06 21:39:55,690 - root - INFO - lr: 3.8672e-05 gnorm: 11.75 [ 1:22:34<1 day, 17:46:07]
507
+ [titan] 2026-01-06 21:40:37,179 - root - INFO - step: 99 loss: 7.5091 memory: 71.95GiB(90.78%) tps: 1,580 tflops: 144.52 mfu: 46.32%
508
+ [titan] 2026-01-06 21:40:37,180 - root - INFO - lr: 3.9063e-05 gnorm: 7.81 [ 1:23:16<1 day, 17:40:44]
509
+ [titan] 2026-01-06 21:40:37,205 - root - INFO - [GC] Peforming periodical GC collection. 0.03 seconds.
510
+ [titan] 2026-01-06 21:41:18,706 - root - INFO - step: 100 loss: 7.4961 memory: 71.95GiB(90.78%) tps: 1,578 tflops: 144.39 mfu: 46.28%
511
+ [titan] 2026-01-06 21:41:18,706 - root - INFO - lr: 3.9453e-05 gnorm: 7.59 [ 1:23:57<1 day, 17:35:28]
512
+ [titan] 2026-01-06 21:42:00,228 - root - INFO - step: 101 loss: 7.4848 memory: 71.95GiB(90.78%) tps: 1,578 tflops: 144.41 mfu: 46.28%
513
+ [titan] 2026-01-06 21:42:00,228 - root - INFO - lr: 3.9844e-05 gnorm: 5.97 [ 1:24:39<1 day, 17:30:17]
514
+ [titan] 2026-01-06 21:42:41,739 - root - INFO - step: 102 loss: 7.5118 memory: 71.95GiB(90.78%) tps: 1,579 tflops: 144.45 mfu: 46.30%
515
+ [titan] 2026-01-06 21:42:41,739 - root - INFO - lr: 4.0234e-05 gnorm: 8.06 [ 1:25:20<1 day, 17:25:11]
516
+ [titan] 2026-01-06 21:43:23,264 - root - INFO - step: 103 loss: 7.4788 memory: 71.95GiB(90.78%) tps: 1,578 tflops: 144.40 mfu: 46.28%
517
+ [titan] 2026-01-06 21:43:23,265 - root - INFO - lr: 4.0625e-05 gnorm: 10.06 [ 1:26:02<1 day, 17:20:10]
518
+ [titan] 2026-01-06 21:44:04,785 - root - INFO - step: 104 loss: 7.4560 memory: 71.95GiB(90.78%) tps: 1,578 tflops: 144.41 mfu: 46.29%
519
+ [titan] 2026-01-06 21:44:04,786 - root - INFO - lr: 4.1016e-05 gnorm: 9.50 [ 1:26:44<1 day, 17:15:15]
520
+ [titan] 2026-01-06 21:44:46,319 - root - INFO - step: 105 loss: 7.4534 memory: 71.95GiB(90.78%) tps: 1,578 tflops: 144.37 mfu: 46.27%
521
+ [titan] 2026-01-06 21:44:46,319 - root - INFO - lr: 4.1406e-05 gnorm: 8.44 [ 1:27:25<1 day, 17:10:24]
522
+ [titan] 2026-01-06 21:45:27,837 - root - INFO - step: 106 loss: 7.4770 memory: 71.95GiB(90.78%) tps: 1,578 tflops: 144.42 mfu: 46.29%
523
+ [titan] 2026-01-06 21:45:27,838 - root - INFO - lr: 4.1797e-05 gnorm: 10.56 [ 1:28:07<1 day, 17:05:38]
524
+ [titan] 2026-01-06 21:46:09,374 - root - INFO - step: 107 loss: 7.4382 memory: 71.95GiB(90.78%) tps: 1,578 tflops: 144.36 mfu: 46.27%
525
+ [titan] 2026-01-06 21:46:09,374 - root - INFO - lr: 4.2188e-05 gnorm: 13.69 [ 1:28:48<1 day, 17:00:57]
526
+ [titan] 2026-01-06 21:46:50,902 - root - INFO - step: 108 loss: 7.4561 memory: 71.95GiB(90.78%) tps: 1,578 tflops: 144.39 mfu: 46.28%
527
+ [titan] 2026-01-06 21:46:50,902 - root - INFO - lr: 4.2578e-05 gnorm: 8.69 [ 1:29:30<1 day, 16:56:20]
528
+ [titan] 2026-01-06 21:47:32,443 - root - INFO - step: 109 loss: 7.3967 memory: 71.95GiB(90.78%) tps: 1,578 tflops: 144.34 mfu: 46.26%
529
+ [titan] 2026-01-06 21:47:32,443 - root - INFO - lr: 4.2969e-05 gnorm: 7.31 [ 1:30:11<1 day, 16:51:48]
530
+ [titan] 2026-01-06 21:48:13,976 - root - INFO - step: 110 loss: 7.4334 memory: 71.95GiB(90.78%) tps: 1,578 tflops: 144.37 mfu: 46.27%
531
+ [titan] 2026-01-06 21:48:13,976 - root - INFO - lr: 4.3359e-05 gnorm: 25.38 [ 1:30:53<1 day, 16:47:20]
532
+ [titan] 2026-01-06 21:48:55,511 - root - INFO - step: 111 loss: 7.4360 memory: 71.95GiB(90.78%) tps: 1,578 tflops: 144.36 mfu: 46.27%
533
+ [titan] 2026-01-06 21:48:55,511 - root - INFO - lr: 4.3750e-05 gnorm: 10.44 [ 1:31:34<1 day, 16:42:56]
534
+ [titan] 2026-01-06 21:49:37,059 - root - INFO - step: 112 loss: 7.5123 memory: 71.95GiB(90.78%) tps: 1,577 tflops: 144.32 mfu: 46.26%
535
+ [titan] 2026-01-06 21:49:37,059 - root - INFO - lr: 4.4141e-05 gnorm: 16.88 [ 1:32:16<1 day, 16:38:36]
536
+ [titan] 2026-01-06 21:49:37,059 - root - INFO - Saving the checkpoint (or staging if async is enabled).
537
+ [titan] 2026-01-06 21:49:59,574 - root - INFO - [GC] GC collection invoked by checkpointer. 0.14 seconds.
538
+ [titan] 2026-01-06 21:49:59,574 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 22.51 seconds.
539
+ [titan] 2026-01-06 21:50:40,891 - root - INFO - step: 113 loss: 7.4803 memory: 71.95GiB(90.78%) tps: 1,027 tflops: 93.93 mfu: 30.11%
540
+ [titan] 2026-01-06 21:50:40,892 - root - INFO - lr: 4.4531e-05 gnorm: 13.06 [ 1:33:20<1 day, 16:44:04]
541
+ [titan] 2026-01-06 21:51:22,305 - root - INFO - step: 114 loss: 7.4859 memory: 71.95GiB(90.78%) tps: 1,582 tflops: 144.79 mfu: 46.41%
542
+ [titan] 2026-01-06 21:51:22,305 - root - INFO - lr: 4.4922e-05 gnorm: 16.50 [ 1:34:01<1 day, 16:39:43]
543
+ [titan] 2026-01-06 21:52:03,747 - root - INFO - step: 115 loss: 7.4151 memory: 71.95GiB(90.78%) tps: 1,581 tflops: 144.68 mfu: 46.37%
544
+ [titan] 2026-01-06 21:52:03,748 - root - INFO - lr: 4.5313e-05 gnorm: 13.94 [ 1:34:42<1 day, 16:35:27]
545
+ [titan] 2026-01-06 21:52:45,252 - root - INFO - step: 116 loss: 7.3814 memory: 71.95GiB(90.78%) tps: 1,579 tflops: 144.47 mfu: 46.30%
546
+ [titan] 2026-01-06 21:52:45,252 - root - INFO - lr: 4.5703e-05 gnorm: 11.69 [ 1:35:24<1 day, 16:31:16]
547
+ [titan] 2026-01-06 21:53:26,759 - root - INFO - step: 117 loss: 7.4033 memory: 71.95GiB(90.78%) tps: 1,579 tflops: 144.46 mfu: 46.30%
548
+ [titan] 2026-01-06 21:53:26,760 - root - INFO - lr: 4.6094e-05 gnorm: 9.31 [ 1:36:06<1 day, 16:27:08]
549
+ [titan] 2026-01-06 21:54:08,279 - root - INFO - step: 118 loss: 7.4721 memory: 71.95GiB(90.78%) tps: 1,578 tflops: 144.42 mfu: 46.29%
550
+ [titan] 2026-01-06 21:54:08,279 - root - INFO - lr: 4.6484e-05 gnorm: 20.88 [ 1:36:47<1 day, 16:23:05]
551
+ [titan] 2026-01-06 21:54:49,813 - root - INFO - step: 119 loss: 7.4258 memory: 71.95GiB(90.78%) tps: 1,578 tflops: 144.36 mfu: 46.27%
552
+ [titan] 2026-01-06 21:54:49,813 - root - INFO - lr: 4.6875e-05 gnorm: 16.62 [ 1:37:29<1 day, 16:19:05]
553
+ [titan] 2026-01-06 21:55:31,360 - root - INFO - step: 120 loss: 7.3951 memory: 71.95GiB(90.78%) tps: 1,577 tflops: 144.32 mfu: 46.26%
554
+ [titan] 2026-01-06 21:55:31,360 - root - INFO - lr: 4.7266e-05 gnorm: 11.38 [ 1:38:10<1 day, 16:15:08]
555
+ [titan] 2026-01-06 21:56:12,903 - root - INFO - step: 121 loss: 7.3984 memory: 71.95GiB(90.78%) tps: 1,578 tflops: 144.33 mfu: 46.26%
556
+ [titan] 2026-01-06 21:56:12,904 - root - INFO - lr: 4.7656e-05 gnorm: 10.19 [ 1:38:52<1 day, 16:11:15]
557
+ [titan] 2026-01-06 21:56:54,444 - root - INFO - step: 122 loss: 7.5098 memory: 71.95GiB(90.78%) tps: 1,578 tflops: 144.34 mfu: 46.26%
558
+ [titan] 2026-01-06 21:56:54,444 - root - INFO - lr: 4.8047e-05 gnorm: 19.38 [ 1:39:33<1 day, 16:07:25]
559
+ [titan] 2026-01-06 21:57:35,983 - root - INFO - step: 123 loss: 7.4071 memory: 71.95GiB(90.78%) tps: 1,578 tflops: 144.35 mfu: 46.27%
560
+ [titan] 2026-01-06 21:57:35,983 - root - INFO - lr: 4.8438e-05 gnorm: 13.25 [ 1:40:15<1 day, 16:03:38]
561
+ [titan] 2026-01-06 21:58:17,525 - root - INFO - step: 124 loss: 7.4271 memory: 71.95GiB(90.78%) tps: 1,578 tflops: 144.34 mfu: 46.26%
562
+ [titan] 2026-01-06 21:58:17,525 - root - INFO - lr: 4.8828e-05 gnorm: 11.88 [ 1:40:56<1 day, 15:59:54]
563
+ [titan] 2026-01-06 21:58:59,075 - root - INFO - step: 125 loss: 7.3603 memory: 71.95GiB(90.78%) tps: 1,577 tflops: 144.31 mfu: 46.25%
564
+ [titan] 2026-01-06 21:58:59,075 - root - INFO - lr: 4.9219e-05 gnorm: 11.50 [ 1:41:38<1 day, 15:56:13]
565
+ [titan] 2026-01-06 21:59:40,618 - root - INFO - step: 126 loss: 7.3625 memory: 71.95GiB(90.78%) tps: 1,578 tflops: 144.33 mfu: 46.26%
566
+ [titan] 2026-01-06 21:59:40,619 - root - INFO - lr: 4.9609e-05 gnorm: 9.88 [ 1:42:19<1 day, 15:52:35]
567
+ [titan] 2026-01-06 22:00:22,155 - root - INFO - step: 127 loss: 7.3691 memory: 71.95GiB(90.78%) tps: 1,578 tflops: 144.36 mfu: 46.27%
568
+ [titan] 2026-01-06 22:00:22,156 - root - INFO - lr: 5.0000e-05 gnorm: 11.88 [ 1:43:01<1 day, 15:49:00]
569
+ [titan] 2026-01-06 22:01:03,694 - root - INFO - step: 128 loss: 7.3331 memory: 71.95GiB(90.78%) tps: 1,578 tflops: 144.35 mfu: 46.27%
570
+ [titan] 2026-01-06 22:01:03,694 - root - INFO - lr: 5.0391e-05 gnorm: 11.56 [ 1:43:42<1 day, 15:45:27]
571
+ [titan] 2026-01-06 22:01:03,694 - root - INFO - Saving the checkpoint (or staging if async is enabled).
572
+ [titan] 2026-01-06 22:01:24,072 - root - INFO - [GC] GC collection invoked by checkpointer. 0.20 seconds.
573
+ [titan] 2026-01-06 22:01:24,072 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 20.38 seconds.
574
+ [titan] 2026-01-06 22:02:05,454 - root - INFO - step: 129 loss: 7.2878 memory: 71.95GiB(90.78%) tps: 1,061 tflops: 97.09 mfu: 31.12%
575
+ [titan] 2026-01-06 22:02:05,454 - root - INFO - lr: 5.0781e-05 gnorm: 6.16 [ 1:44:44<1 day, 15:49:38]
576
+ [titan] 2026-01-06 22:02:46,875 - root - INFO - step: 130 loss: 7.7017 memory: 71.95GiB(90.78%) tps: 1,582 tflops: 144.76 mfu: 46.40%
577
+ [titan] 2026-01-06 22:02:46,876 - root - INFO - lr: 5.1172e-05 gnorm: 70.00 [ 1:45:26<1 day, 15:46:04]
578
+ [titan] 2026-01-06 22:03:28,339 - root - INFO - step: 131 loss: 7.5220 memory: 71.95GiB(90.78%) tps: 1,581 tflops: 144.61 mfu: 46.35%
579
+ [titan] 2026-01-06 22:03:28,339 - root - INFO - lr: 5.1562e-05 gnorm: 44.75 [ 1:46:07<1 day, 15:42:34]
580
+ [titan] 2026-01-06 22:04:09,858 - root - INFO - step: 132 loss: 7.4566 memory: 71.95GiB(90.78%) tps: 1,578 tflops: 144.42 mfu: 46.29%
581
+ [titan] 2026-01-06 22:04:09,859 - root - INFO - lr: 5.1953e-05 gnorm: 13.50 [ 1:46:49<1 day, 15:39:08]
582
+ [titan] 2026-01-06 22:04:51,387 - root - INFO - step: 133 loss: 7.4026 memory: 71.95GiB(90.78%) tps: 1,578 tflops: 144.39 mfu: 46.28%
583
+ [titan] 2026-01-06 22:04:51,387 - root - INFO - lr: 5.2344e-05 gnorm: 10.12 [ 1:47:30<1 day, 15:35:44]
584
+ [titan] 2026-01-06 22:05:32,919 - root - INFO - step: 134 loss: 7.4092 memory: 71.95GiB(90.78%) tps: 1,578 tflops: 144.37 mfu: 46.27%
585
+ [titan] 2026-01-06 22:05:32,920 - root - INFO - lr: 5.2734e-05 gnorm: 14.88 [ 1:48:12<1 day, 15:32:23]
586
+ [titan] 2026-01-06 22:06:14,471 - root - INFO - step: 135 loss: 7.3827 memory: 71.95GiB(90.78%) tps: 1,577 tflops: 144.30 mfu: 46.25%
587
+ [titan] 2026-01-06 22:06:14,471 - root - INFO - lr: 5.3125e-05 gnorm: 18.88 [ 1:48:53<1 day, 15:29:04]
588
+ [titan] 2026-01-06 22:06:56,027 - root - INFO - step: 136 loss: 7.4021 memory: 71.95GiB(90.78%) tps: 1,577 tflops: 144.29 mfu: 46.25%
589
+ [titan] 2026-01-06 22:06:56,028 - root - INFO - lr: 5.3516e-05 gnorm: 12.81 [ 1:49:35<1 day, 15:25:48]
590
+ [titan] 2026-01-06 22:07:37,581 - root - INFO - step: 137 loss: 7.4064 memory: 71.95GiB(90.78%) tps: 1,577 tflops: 144.30 mfu: 46.25%
591
+ [titan] 2026-01-06 22:07:37,581 - root - INFO - lr: 5.3906e-05 gnorm: 7.19 [ 1:50:16<1 day, 15:22:34]
592
+ [titan] 2026-01-06 22:08:19,129 - root - INFO - step: 138 loss: 7.4774 memory: 71.95GiB(90.78%) tps: 1,577 tflops: 144.32 mfu: 46.26%
593
+ [titan] 2026-01-06 22:08:19,129 - root - INFO - lr: 5.4297e-05 gnorm: 22.62 [ 1:50:58<1 day, 15:19:22]
594
+ [titan] 2026-01-06 22:09:00,687 - root - INFO - step: 139 loss: 7.4281 memory: 71.95GiB(90.78%) tps: 1,577 tflops: 144.28 mfu: 46.24%
595
+ [titan] 2026-01-06 22:09:00,688 - root - INFO - lr: 5.4688e-05 gnorm: 11.00 [ 1:51:39<1 day, 15:16:13]
596
+ [titan] 2026-01-06 22:09:42,228 - root - INFO - step: 140 loss: 7.5633 memory: 71.95GiB(90.78%) tps: 1,578 tflops: 144.34 mfu: 46.26%
597
+ [titan] 2026-01-06 22:09:42,228 - root - INFO - lr: 5.5078e-05 gnorm: 19.75 [ 1:52:21<1 day, 15:13:05]
598
+ [titan] 2026-01-06 22:10:23,790 - root - INFO - step: 141 loss: 7.5423 memory: 71.95GiB(90.78%) tps: 1,577 tflops: 144.27 mfu: 46.24%
599
+ [titan] 2026-01-06 22:10:23,790 - root - INFO - lr: 5.5469e-05 gnorm: 17.25 [ 1:53:03<1 day, 15:10:00]
600
+ [titan] 2026-01-06 22:11:05,350 - root - INFO - step: 142 loss: 7.4047 memory: 71.95GiB(90.78%) tps: 1,577 tflops: 144.28 mfu: 46.24%
601
+ [titan] 2026-01-06 22:11:05,351 - root - INFO - lr: 5.5859e-05 gnorm: 9.94 [ 1:53:44<1 day, 15:06:57]
602
+ [titan] 2026-01-06 22:11:46,904 - root - INFO - step: 143 loss: 7.5261 memory: 71.95GiB(90.78%) tps: 1,577 tflops: 144.30 mfu: 46.25%
603
+ [titan] 2026-01-06 22:11:46,905 - root - INFO - lr: 5.6250e-05 gnorm: 25.75 [ 1:54:26<1 day, 15:03:55]
604
+ [titan] 2026-01-06 22:12:28,460 - root - INFO - step: 144 loss: 7.4217 memory: 71.95GiB(90.78%) tps: 1,577 tflops: 144.29 mfu: 46.25%
605
+ [titan] 2026-01-06 22:12:28,461 - root - INFO - lr: 5.6641e-05 gnorm: 18.00 [ 1:55:07<1 day, 15:00:56]
606
+ [titan] 2026-01-06 22:12:28,461 - root - INFO - Saving the checkpoint (or staging if async is enabled).
607
+ [titan] 2026-01-06 22:12:50,136 - root - INFO - [GC] GC collection invoked by checkpointer. 0.16 seconds.
608
+ [titan] 2026-01-06 22:12:50,136 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 21.68 seconds.
609
+ [titan] 2026-01-06 22:13:31,510 - root - INFO - step: 145 loss: 7.3958 memory: 71.95GiB(90.78%) tps: 1,039 tflops: 95.10 mfu: 30.48%
610
+ [titan] 2026-01-06 22:13:31,510 - root - INFO - lr: 5.7031e-05 gnorm: 11.69 [ 1:56:10<1 day, 15:05:12]
611
+ [titan] 2026-01-06 22:14:12,944 - root - INFO - step: 146 loss: 7.4073 memory: 71.95GiB(90.78%) tps: 1,582 tflops: 144.71 mfu: 46.38%
612
+ [titan] 2026-01-06 22:14:12,944 - root - INFO - lr: 5.7422e-05 gnorm: 11.25 [ 1:56:52<1 day, 15:02:11]
613
+ [titan] 2026-01-06 22:14:54,370 - root - INFO - step: 147 loss: 7.3301 memory: 71.95GiB(90.78%) tps: 1,582 tflops: 144.74 mfu: 46.39%
614
+ [titan] 2026-01-06 22:14:54,371 - root - INFO - lr: 5.7813e-05 gnorm: 7.34 [ 1:57:33<1 day, 14:59:12]
615
+ [titan] 2026-01-06 22:15:35,825 - root - INFO - step: 148 loss: 7.3624 memory: 71.95GiB(90.78%) tps: 1,581 tflops: 144.64 mfu: 46.36%
616
+ [titan] 2026-01-06 22:15:35,825 - root - INFO - lr: 5.8203e-05 gnorm: 17.38 [ 1:58:15<1 day, 14:56:15]
617
+ [titan] 2026-01-06 22:16:17,356 - root - INFO - step: 149 loss: 7.2913 memory: 71.95GiB(90.78%) tps: 1,578 tflops: 144.37 mfu: 46.27%
618
+ [titan] 2026-01-06 22:16:17,357 - root - INFO - lr: 5.8594e-05 gnorm: 3.80 [ 1:58:56<1 day, 14:53:21]
619
+ [titan] 2026-01-06 22:16:17,386 - root - INFO - [GC] Peforming periodical GC collection. 0.03 seconds.
620
+ [titan] 2026-01-06 22:16:58,923 - root - INFO - step: 150 loss: 7.3146 memory: 71.95GiB(90.78%) tps: 1,577 tflops: 144.25 mfu: 46.23%
621
+ [titan] 2026-01-06 22:16:58,923 - root - INFO - lr: 5.8984e-05 gnorm: 7.06 [ 1:59:38<1 day, 14:50:30]
logs/none_4cvjdbqa/attempt_0/0/stdout.log ADDED
File without changes
logs/none_4cvjdbqa/attempt_0/1/stderr.log ADDED
@@ -0,0 +1,620 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [titan] 2026-01-06 20:23:28,613 - root - INFO - Starting job: default job
2
+ [titan] 2026-01-06 20:23:28,613 - root - INFO - {
3
+ "activation_checkpoint": {
4
+ "mode": "none",
5
+ "selective_ac_option": "2"
6
+ },
7
+ "activation_offload": {
8
+ "mode": "none"
9
+ },
10
+ "checkpoint": {
11
+ "async_mode": "disabled",
12
+ "create_seed_checkpoint": false,
13
+ "enable_checkpoint": true,
14
+ "exclude_from_loading": [],
15
+ "export_dtype": "float32",
16
+ "folder": "checkpoint",
17
+ "initial_load_model_weights_only": true,
18
+ "initial_load_path": null,
19
+ "interval": 16,
20
+ "interval_type": "steps",
21
+ "keep_latest_k": 0,
22
+ "last_save_model_weights_only": false,
23
+ "load_step": -1,
24
+ "model_weights_only": false
25
+ },
26
+ "comm": {
27
+ "init_timeout_seconds": 300,
28
+ "trace_buf_size": 20000,
29
+ "train_timeout_seconds": 100
30
+ },
31
+ "experimental": {
32
+ "context_parallel_degree": 1,
33
+ "context_parallel_rotate_method": "allgather",
34
+ "custom_model_path": "",
35
+ "enable_async_tensor_parallel": false,
36
+ "enable_compiled_autograd": false,
37
+ "pipeline_parallel_degree": 1,
38
+ "pipeline_parallel_microbatches": null,
39
+ "pipeline_parallel_schedule": "1F1B",
40
+ "pipeline_parallel_schedule_csv": "",
41
+ "pipeline_parallel_split_points": []
42
+ },
43
+ "fault_tolerance": {
44
+ "enable": false,
45
+ "group_size": 0,
46
+ "min_replica_size": 1,
47
+ "replica_id": 0
48
+ },
49
+ "float8": {
50
+ "enable_fsdp_float8_all_gather": false,
51
+ "force_recompute_fp8_weight_in_bwd": false,
52
+ "precompute_float8_dynamic_scale_for_fsdp": false,
53
+ "recipe_name": null
54
+ },
55
+ "job": {
56
+ "config_file": "flame/models/fla.toml",
57
+ "description": "default job",
58
+ "dump_folder": "/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B",
59
+ "print_args": true,
60
+ "use_for_integration_test": false
61
+ },
62
+ "lr_scheduler": {
63
+ "decay_ratio": null,
64
+ "decay_type": "cosine",
65
+ "lr_min": 0.1,
66
+ "warmup_steps": 1024
67
+ },
68
+ "memory_estimation": {
69
+ "disable_fake_mode": false,
70
+ "enabled": false
71
+ },
72
+ "metrics": {
73
+ "disable_color_printing": false,
74
+ "enable_tensorboard": false,
75
+ "enable_wandb": true,
76
+ "log_freq": 1,
77
+ "save_for_all_ranks": false,
78
+ "save_tb_folder": "tb"
79
+ },
80
+ "model": {
81
+ "config": "/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/config.json",
82
+ "converters": [],
83
+ "name": "fla",
84
+ "print_after_conversion": false,
85
+ "tokenizer_path": "/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B"
86
+ },
87
+ "optimizer": {
88
+ "beta1": 0.9,
89
+ "beta2": 0.95,
90
+ "early_step_in_backward": false,
91
+ "eps": 1e-15,
92
+ "implementation": "fused",
93
+ "lr": 0.0004,
94
+ "name": "AdamW",
95
+ "weight_decay": 0.1
96
+ },
97
+ "profiling": {
98
+ "enable_memory_snapshot": false,
99
+ "enable_profiling": true,
100
+ "profile_freq": 512,
101
+ "save_memory_snapshot_folder": "memory_snapshot",
102
+ "save_traces_folder": "profile_trace"
103
+ },
104
+ "training": {
105
+ "batch_size": 2,
106
+ "compile": true,
107
+ "context_len": 2048,
108
+ "data_dir": null,
109
+ "data_files": null,
110
+ "data_parallel_replicate_degree": 1,
111
+ "data_parallel_shard_degree": 8,
112
+ "data_probs": null,
113
+ "dataset": "/mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu",
114
+ "dataset_name": "default",
115
+ "dataset_split": "train",
116
+ "deterministic": false,
117
+ "disable_loss_parallel": true,
118
+ "enable_cpu_offload": false,
119
+ "fsdp_reshard_after_forward": "default",
120
+ "gc_freq": 50,
121
+ "gradient_accumulation_steps": 16,
122
+ "max_norm": 1.0,
123
+ "mixed_precision_param": "bfloat16",
124
+ "mixed_precision_reduce": "float32",
125
+ "num_workers": 8,
126
+ "persistent_workers": false,
127
+ "pin_memory": false,
128
+ "prefetch_factor": 2,
129
+ "seed": 42,
130
+ "seq_len": 2048,
131
+ "skip_nan_inf": true,
132
+ "steps": 3072,
133
+ "streaming": true,
134
+ "tensor_parallel_degree": 1,
135
+ "varlen": false
136
+ }
137
+ }
138
+ [titan] 2026-01-06 20:23:28,614 - root - INFO - [GC] Initial GC collection. 0.00 seconds.
139
+ [titan] 2026-01-06 20:23:29,972 - root - WARNING - ENV[TORCH_NCCL_ASYNC_ERROR_HANDLING] = 1 will be overridden to 3 based on job config
140
+ [titan] 2026-01-06 20:23:29,975 - root - INFO - CUDA capacity: NVIDIA A100-SXM4-80GB with 79.25GiB memory
141
+ [titan] 2026-01-06 20:23:29,977 - root - WARNING - Error running lspci: [Errno 2] No such file or directory: 'lspci', fallback to use device_name
142
+ [titan] 2026-01-06 20:23:29,977 - root - INFO - Peak FLOPS used for computing MFU: 3.120e+14
143
+ [titan] 2026-01-06 20:23:29,977 - root - INFO - Building 1-D device mesh with ['dp_shard'], [8]
144
+ [titan] 2026-01-06 20:23:30,051 - root - INFO - Loading tokenizer...
145
+ The tokenizer you are loading from '/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.
146
+ [titan] 2026-01-06 20:23:30,412 - root - INFO - Qwen2TokenizerFast(name_or_path='/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B', vocab_size=151643, model_max_length=10000000000, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'eos_token': '<|im_end|>', 'pad_token': '<|endoftext|>', 'additional_special_tokens': ['<|im_start|>', '<|im_end|>', '<|object_ref_start|>', '<|object_ref_end|>', '<|box_start|>', '<|box_end|>', '<|quad_start|>', '<|quad_end|>', '<|vision_start|>', '<|vision_end|>', '<|vision_pad|>', '<|image_pad|>', '<|video_pad|>']}, clean_up_tokenization_spaces=False, added_tokens_decoder={
147
+ 151643: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
148
+ 151644: AddedToken("<|im_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
149
+ 151645: AddedToken("<|im_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
150
+ 151646: AddedToken("<|object_ref_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
151
+ 151647: AddedToken("<|object_ref_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
152
+ 151648: AddedToken("<|box_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
153
+ 151649: AddedToken("<|box_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
154
+ 151650: AddedToken("<|quad_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
155
+ 151651: AddedToken("<|quad_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
156
+ 151652: AddedToken("<|vision_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
157
+ 151653: AddedToken("<|vision_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
158
+ 151654: AddedToken("<|vision_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
159
+ 151655: AddedToken("<|image_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
160
+ 151656: AddedToken("<|video_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
161
+ 151657: AddedToken("<tool_call>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
162
+ 151658: AddedToken("</tool_call>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
163
+ 151659: AddedToken("<|fim_prefix|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
164
+ 151660: AddedToken("<|fim_middle|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
165
+ 151661: AddedToken("<|fim_suffix|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
166
+ 151662: AddedToken("<|fim_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
167
+ 151663: AddedToken("<|repo_name|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
168
+ 151664: AddedToken("<|file_sep|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
169
+ 151665: AddedToken("<tool_response>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
170
+ 151666: AddedToken("</tool_response>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
171
+ 151667: AddedToken("<think>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
172
+ 151668: AddedToken("</think>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
173
+ }
174
+ )
175
+ [titan] 2026-01-06 20:23:30,412 - root - INFO - Loading dataset /mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu:default
176
+ `trust_remote_code` is not supported anymore.
177
+ Please check that the Hugging Face dataset '/mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu' isn't based on a loading script and remove `trust_remote_code`.
178
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
179
+ [titan] 2026-01-06 20:23:30,412 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
180
+ Please check that the Hugging Face dataset '/mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu' isn't based on a loading script and remove `trust_remote_code`.
181
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
182
+ [titan] 2026-01-06 20:23:30,977 - root - INFO - Shuffling the dataset with seed 42
183
+ [titan] 2026-01-06 20:23:30,978 - root - INFO - IterableDataset({
184
+ features: ['text', 'id', 'dump', 'url', 'file_path', 'language', 'language_score', 'token_count', 'score', 'int_score'],
185
+ num_shards: 360
186
+ })
187
+ [titan] 2026-01-06 20:23:30,978 - root - INFO - Building dataloader...
188
+ [titan] 2026-01-06 20:23:30,980 - root - INFO - Loading model config from /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/config.json
189
+ [titan] 2026-01-06 20:23:30,981 - root - INFO - Building model from the config
190
+ GSAConfig {
191
+ "architectures": [
192
+ "GSAForCausalLM"
193
+ ],
194
+ "attn": null,
195
+ "bos_token_id": 151643,
196
+ "clamp_max": null,
197
+ "clamp_min": null,
198
+ "conv_size": 4,
199
+ "dtype": "bfloat16",
200
+ "elementwise_affine": false,
201
+ "eos_token_id": 151645,
202
+ "expand_k": 1,
203
+ "expand_v": 1,
204
+ "feature_map": "swish",
205
+ "fuse_cross_entropy": true,
206
+ "fuse_linear_cross_entropy": false,
207
+ "fuse_norm": true,
208
+ "fuse_swiglu": true,
209
+ "gate_logit_normalizer": 8,
210
+ "hidden_act": "swish",
211
+ "hidden_ratio": 4,
212
+ "hidden_size": 5120,
213
+ "initializer_range": 0.02,
214
+ "intermediate_size": 17408,
215
+ "max_position_embeddings": 40960,
216
+ "model_type": "gsa",
217
+ "norm_eps": 1e-06,
218
+ "num_heads": 40,
219
+ "num_hidden_layers": 40,
220
+ "num_kv_heads": 8,
221
+ "num_slots": 256,
222
+ "rope_theta": 1000000,
223
+ "share_conv_kernel": true,
224
+ "tie_word_embeddings": true,
225
+ "transformers_version": "4.57.3",
226
+ "use_cache": true,
227
+ "use_l2warp": false,
228
+ "use_norm": true,
229
+ "use_output_gate": true,
230
+ "use_rope": false,
231
+ "use_short_conv": false,
232
+ "vocab_size": 151936
233
+ }
234
+ 
235
+ [titan] 2026-01-06 20:23:31,129 - root - INFO - 
236
+ GSAForCausalLM(
237
+ (model): GSAModel(
238
+ (embeddings): Embedding(151936, 5120)
239
+ (layers): ModuleList(
240
+ (0-39): 40 x GSABlock(
241
+ (attn_norm): RMSNorm(5120, eps=1e-06)
242
+ (attn): GatedSlotAttention(
243
+ (feature_map): SwishFeatureMap()
244
+ (q_proj): Linear(in_features=5120, out_features=5120, bias=False)
245
+ (k_proj): Linear(in_features=5120, out_features=1024, bias=False)
246
+ (v_proj): Linear(in_features=5120, out_features=1024, bias=False)
247
+ (f_proj): Linear(in_features=5120, out_features=2048, bias=False)
248
+ (g_norm): RMSNorm(5120, elementwise_affine=False, eps=1e-06)
249
+ (o_proj): Linear(in_features=5120, out_features=5120, bias=False)
250
+ )
251
+ (mlp_norm): RMSNorm(5120, eps=1e-06)
252
+ (mlp): GatedMLP(
253
+ (gate_proj): Linear(in_features=5120, out_features=17408, bias=False)
254
+ (up_proj): Linear(in_features=5120, out_features=17408, bias=False)
255
+ (down_proj): Linear(in_features=17408, out_features=5120, bias=False)
256
+ (swiglu_linear): SwiGLULinear()
257
+ )
258
+ )
259
+ )
260
+ (norm): RMSNorm(5120, eps=1e-06)
261
+ )
262
+ (lm_head): Linear(in_features=5120, out_features=151936, bias=False)
263
+ )
264
+
265
+ [titan] 2026-01-06 20:23:31,186 - root - INFO - Compiling each block with torch.compile
266
+ [titan] 2026-01-06 20:23:31,186 - root - INFO - Compiling the embedding, norm, and lm_head layers with torch.compile
267
+ [titan] 2026-01-06 20:23:31,187 - root - INFO - Compiling the entire model with torch.compile
268
+ [titan] 2026-01-06 20:23:31,335 - root - INFO - Applied FSDP to the model
269
+ [titan] 2026-01-06 20:23:31,717 - root - INFO - CUDA memory usage for model: 3.56GiB(4.49%)
270
+ [titan] 2026-01-06 20:23:31,764 - root - INFO - Checkpointing active. Checkpoints will be loaded from and saved to /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/checkpoint
271
+ [titan] 2026-01-06 20:23:31,764 - root - INFO - Loading the checkpoint from /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/checkpoint/step-1.
272
+ [titan] 2026-01-06 20:24:20,447 - root - INFO - [GC] GC collection for checkpoint loading. 0.02 seconds.
273
+ [titan] 2026-01-06 20:24:20,447 - root - INFO - Finished loading the checkpoint in 48.68 seconds.
274
+ [titan] 2026-01-06 20:24:20,634 - root - INFO - CUDA capacity: NVIDIA A100-SXM4-80GB with 79.25GiB memory
275
+ [titan] 2026-01-06 20:24:20,637 - root - WARNING - Error running lspci: [Errno 2] No such file or directory: 'lspci', fallback to use device_name
276
+ [titan] 2026-01-06 20:24:23,307 - root - INFO - Mixed precision training is handled by fully_shard
277
+ [titan] 2026-01-06 20:24:23,307 - root - INFO - ***** Running training *****
278
+ [titan] 2026-01-06 20:24:23,307 - root - INFO -  Training starts at step 2
279
+ [titan] 2026-01-06 20:24:23,307 - root - INFO -  Number of tokens per sequence = 2,048
280
+ [titan] 2026-01-06 20:24:23,307 - root - INFO -  Gradient Accumulation steps = 16
281
+ [titan] 2026-01-06 20:24:23,307 - root - INFO -  Instantaneous batch size (per device) = 2
282
+ [titan] 2026-01-06 20:24:23,307 - root - INFO -  Global batch size (w. parallel, distributed & accumulation) = 256 (524,288 tokens)
283
+ [titan] 2026-01-06 20:24:23,307 - root - INFO -  Total optimization steps = 3,072 (1,610,612,736 tokens)
284
+ [titan] 2026-01-06 20:24:23,308 - root - INFO -  Warmup steps = 1,024 (536,870,912 tokens)
285
+ [titan] 2026-01-06 20:24:23,308 - root - INFO -  Number of parameters = 14,409,815,040 
286
+ [titan] 2026-01-06 20:24:23,308 - root - INFO - Profiling active. Traces will be saved at /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/profile_trace
287
+ /mnt/scratch/home/carlos/leman-labs/flame/.venv/lib/python3.13/site-packages/torch/_dynamo/variables/functions.py:1692: UserWarning: Dynamo detected a call to a `functools.lru_cache`-wrapped function. Dynamo ignores the cache wrapper and directly traces the wrapped function. Silent incorrectness is only a *potential* risk, not something we have observed. Enable TORCH_LOGS="+dynamo" for a DEBUG stack trace.
288
+ torch._dynamo.utils.warn_once(msg)
289
+ /mnt/scratch/home/carlos/leman-labs/flame/.venv/lib/python3.13/site-packages/torch/_dynamo/variables/functions.py:1598: UserWarning: Dynamo does not know how to trace the builtin `cuda_utils.get_device_properties.` This function is either a Python builtin (e.g. _warnings.warn) or a third-party C/C++ Python extension (perhaps created with pybind).
290
+ If it is a Python builtin, please file an issue on GitHub so the PyTorch team can add support for it and see the next case for a workaround.
291
+ If it is a third-party C/C++ Python extension, please either wrap it into a PyTorch-understood custom operator (see https://pytorch.org/tutorials/advanced/custom_ops_landing_page.html for more details) or, if it is traceable, use `torch.compiler.allow_in_graph`.
292
+ torch._dynamo.utils.warn_once(explanation + "\n" + "\n".join(hints))
293
+ [titan] 2026-01-06 20:31:17,558 - root - INFO - step: 2 loss: 14.3989 memory: 71.94GiB(90.77%) tps: 157 tflops: 14.38 mfu: 4.61%
294
+ [titan] 2026-01-06 20:31:17,558 - root - INFO - lr: 1.1719e-06 gnorm: 127.00 [ 0:14:01<14 days, 22:50:11]
295
+ [titan] 2026-01-06 20:31:58,854 - root - INFO - step: 3 loss: 14.3925 memory: 71.94GiB(90.77%) tps: 1,587 tflops: 145.20 mfu: 46.54%
296
+ [titan] 2026-01-06 20:31:58,854 - root - INFO - lr: 1.5625e-06 gnorm: 126.00 [ 0:14:42<10 days, 10:52:52]
297
+ [titan] 2026-01-06 20:32:40,204 - root - INFO - step: 4 loss: 14.2932 memory: 71.94GiB(90.77%) tps: 1,585 tflops: 145.01 mfu: 46.48%
298
+ [titan] 2026-01-06 20:32:40,205 - root - INFO - lr: 1.9531e-06 gnorm: 125.50 [ 0:15:24<8 days, 4:54:34]
299
+ [titan] 2026-01-06 20:33:21,589 - root - INFO - step: 5 loss: 14.2679 memory: 71.94GiB(90.77%) tps: 1,584 tflops: 144.89 mfu: 46.44%
300
+ [titan] 2026-01-06 20:33:21,589 - root - INFO - lr: 2.3438e-06 gnorm: 123.50 [ 0:16:05<6 days, 20:31:39]
301
+ [titan] 2026-01-06 20:34:03,035 - root - INFO - step: 6 loss: 13.9921 memory: 71.94GiB(90.77%) tps: 1,581 tflops: 144.67 mfu: 46.37%
302
+ [titan] 2026-01-06 20:34:03,035 - root - INFO - lr: 2.7344e-06 gnorm: 117.50 [ 0:16:47<5 days, 22:56:40]
303
+ [titan] 2026-01-06 20:34:44,524 - root - INFO - step: 7 loss: 13.8102 memory: 71.94GiB(90.77%) tps: 1,580 tflops: 144.52 mfu: 46.32%
304
+ [titan] 2026-01-06 20:34:44,524 - root - INFO - lr: 3.1250e-06 gnorm: 112.50 [ 0:17:28<5 days, 7:31:48]
305
+ [titan] 2026-01-06 20:35:25,989 - root - INFO - step: 8 loss: 13.5609 memory: 71.94GiB(90.77%) tps: 1,581 tflops: 144.60 mfu: 46.35%
306
+ [titan] 2026-01-06 20:35:25,990 - root - INFO - lr: 3.5156e-06 gnorm: 106.50 [ 0:18:10<4 days, 19:57:50]
307
+ [titan] 2026-01-06 20:36:07,480 - root - INFO - step: 9 loss: 13.3683 memory: 71.94GiB(90.77%) tps: 1,580 tflops: 144.52 mfu: 46.32%
308
+ [titan] 2026-01-06 20:36:07,480 - root - INFO - lr: 3.9063e-06 gnorm: 101.00 [ 0:18:51<4 days, 10:58:04]
309
+ [titan] 2026-01-06 20:36:48,975 - root - INFO - step: 10 loss: 13.1018 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.50 mfu: 46.32%
310
+ [titan] 2026-01-06 20:36:48,975 - root - INFO - lr: 4.2969e-06 gnorm: 94.00 [ 0:19:32<4 days, 3:46:07]
311
+ [titan] 2026-01-06 20:37:30,471 - root - INFO - step: 11 loss: 12.5407 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.50 mfu: 46.31%
312
+ [titan] 2026-01-06 20:37:30,471 - root - INFO - lr: 4.6875e-06 gnorm: 82.00 [ 0:20:14<3 days, 21:52:36]
313
+ [titan] 2026-01-06 20:38:11,960 - root - INFO - step: 12 loss: 12.0106 memory: 71.94GiB(90.77%) tps: 1,580 tflops: 144.52 mfu: 46.32%
314
+ [titan] 2026-01-06 20:38:11,960 - root - INFO - lr: 5.0781e-06 gnorm: 71.50 [ 0:20:55<3 days, 16:57:52]
315
+ [titan] 2026-01-06 20:38:53,462 - root - INFO - step: 13 loss: 11.5957 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.48 mfu: 46.31%
316
+ [titan] 2026-01-06 20:38:53,463 - root - INFO - lr: 5.4687e-06 gnorm: 68.00 [ 0:21:37<3 days, 12:48:25]
317
+ [titan] 2026-01-06 20:39:34,955 - root - INFO - step: 14 loss: 11.2380 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.51 mfu: 46.32%
318
+ [titan] 2026-01-06 20:39:34,955 - root - INFO - lr: 5.8594e-06 gnorm: 63.25 [ 0:22:18<3 days, 9:14:27]
319
+ [titan] 2026-01-06 20:40:16,456 - root - INFO - step: 15 loss: 10.9153 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.48 mfu: 46.31%
320
+ [titan] 2026-01-06 20:40:16,457 - root - INFO - lr: 6.2500e-06 gnorm: 55.50 [ 0:23:00<3 days, 6:08:58]
321
+ [titan] 2026-01-06 20:40:57,973 - root - INFO - step: 16 loss: 10.6864 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.42 mfu: 46.29%
322
+ [titan] 2026-01-06 20:40:57,974 - root - INFO - lr: 6.6406e-06 gnorm: 57.00 [ 0:23:41<3 days, 3:26:38]
323
+ [titan] 2026-01-06 20:40:57,974 - root - INFO - Saving the checkpoint (or staging if async is enabled).
324
+ [titan] 2026-01-06 20:41:27,025 - root - INFO - [GC] GC collection invoked by checkpointer. 0.58 seconds.
325
+ [titan] 2026-01-06 20:41:27,025 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 29.05 seconds.
326
+ [titan] 2026-01-06 20:42:08,985 - root - INFO - step: 17 loss: 10.3828 memory: 71.94GiB(90.77%) tps: 923 tflops: 84.44 mfu: 27.06%
327
+ [titan] 2026-01-06 20:42:08,986 - root - INFO - lr: 7.0313e-06 gnorm: 42.50 [ 0:24:52<3 days, 2:31:39]
328
+ [titan] 2026-01-06 20:42:50,422 - root - INFO - step: 18 loss: 10.1659 memory: 71.94GiB(90.77%) tps: 1,582 tflops: 144.71 mfu: 46.38%
329
+ [titan] 2026-01-06 20:42:50,422 - root - INFO - lr: 7.4219e-06 gnorm: 32.50 [ 0:25:34<3 days, 0:19:01]
330
+ [titan] 2026-01-06 20:43:31,924 - root - INFO - step: 19 loss: 9.9749 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.47 mfu: 46.31%
331
+ [titan] 2026-01-06 20:43:31,925 - root - INFO - lr: 7.8125e-06 gnorm: 26.88 [ 0:26:15<2 days, 22:20:27]
332
+ [titan] 2026-01-06 20:44:13,451 - root - INFO - step: 20 loss: 9.8084 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.39 mfu: 46.28%
333
+ [titan] 2026-01-06 20:44:13,451 - root - INFO - lr: 8.2031e-06 gnorm: 25.62 [ 0:26:57<2 days, 20:33:44]
334
+ [titan] 2026-01-06 20:44:54,968 - root - INFO - step: 21 loss: 9.6201 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.43 mfu: 46.29%
335
+ [titan] 2026-01-06 20:44:54,968 - root - INFO - lr: 8.5938e-06 gnorm: 26.88 [ 0:27:38<2 days, 18:57:05]
336
+ [titan] 2026-01-06 20:45:36,491 - root - INFO - step: 22 loss: 9.4905 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.40 mfu: 46.28%
337
+ [titan] 2026-01-06 20:45:36,491 - root - INFO - lr: 8.9844e-06 gnorm: 25.50 [ 0:28:20<2 days, 17:29:10]
338
+ [titan] 2026-01-06 20:46:18,035 - root - INFO - step: 23 loss: 9.2526 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.33 mfu: 46.26%
339
+ [titan] 2026-01-06 20:46:18,035 - root - INFO - lr: 9.3750e-06 gnorm: 19.12 [ 0:29:02<2 days, 16:08:53]
340
+ [titan] 2026-01-06 20:46:59,563 - root - INFO - step: 24 loss: 9.0528 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.39 mfu: 46.28%
341
+ [titan] 2026-01-06 20:46:59,563 - root - INFO - lr: 9.7656e-06 gnorm: 17.00 [ 0:29:43<2 days, 14:55:13]
342
+ [titan] 2026-01-06 20:47:41,099 - root - INFO - step: 25 loss: 8.8601 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.36 mfu: 46.27%
343
+ [titan] 2026-01-06 20:47:41,099 - root - INFO - lr: 1.0156e-05 gnorm: 14.06 [ 0:30:25<2 days, 13:47:23]
344
+ [titan] 2026-01-06 20:48:22,630 - root - INFO - step: 26 loss: 8.7360 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.38 mfu: 46.27%
345
+ [titan] 2026-01-06 20:48:22,630 - root - INFO - lr: 1.0547e-05 gnorm: 15.44 [ 0:31:06<2 days, 12:44:43]
346
+ [titan] 2026-01-06 20:49:04,178 - root - INFO - step: 27 loss: 8.6182 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.31 mfu: 46.25%
347
+ [titan] 2026-01-06 20:49:04,179 - root - INFO - lr: 1.0937e-05 gnorm: 10.25 [ 0:31:48<2 days, 11:46:40]
348
+ [titan] 2026-01-06 20:49:45,725 - root - INFO - step: 28 loss: 8.5142 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.32 mfu: 46.26%
349
+ [titan] 2026-01-06 20:49:45,725 - root - INFO - lr: 1.1328e-05 gnorm: 9.00 [ 0:32:29<2 days, 10:52:43]
350
+ [titan] 2026-01-06 20:50:27,274 - root - INFO - step: 29 loss: 8.4770 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.31 mfu: 46.25%
351
+ [titan] 2026-01-06 20:50:27,274 - root - INFO - lr: 1.1719e-05 gnorm: 9.44 [ 0:33:11<2 days, 10:02:26]
352
+ [titan] 2026-01-06 20:51:08,813 - root - INFO - step: 30 loss: 8.3888 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.35 mfu: 46.27%
353
+ [titan] 2026-01-06 20:51:08,813 - root - INFO - lr: 1.2109e-05 gnorm: 7.06 [ 0:33:52<2 days, 9:15:27]
354
+ [titan] 2026-01-06 20:51:50,370 - root - INFO - step: 31 loss: 8.3098 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.29 mfu: 46.25%
355
+ [titan] 2026-01-06 20:51:50,370 - root - INFO - lr: 1.2500e-05 gnorm: 5.38 [ 0:34:34<2 days, 8:31:29]
356
+ [titan] 2026-01-06 20:52:31,910 - root - INFO - step: 32 loss: 8.2507 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.35 mfu: 46.26%
357
+ [titan] 2026-01-06 20:52:31,910 - root - INFO - lr: 1.2891e-05 gnorm: 6.97 [ 0:35:15<2 days, 7:50:11]
358
+ [titan] 2026-01-06 20:52:31,910 - root - INFO - Saving the checkpoint (or staging if async is enabled).
359
+ [titan] 2026-01-06 20:52:52,193 - root - INFO - [GC] GC collection invoked by checkpointer. 0.20 seconds.
360
+ [titan] 2026-01-06 20:52:52,193 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 20.28 seconds.
361
+ [titan] 2026-01-06 20:53:33,590 - root - INFO - step: 33 loss: 8.1782 memory: 71.94GiB(90.77%) tps: 1,063 tflops: 97.21 mfu: 31.16%
362
+ [titan] 2026-01-06 20:53:33,591 - root - INFO - lr: 1.3281e-05 gnorm: 4.94 [ 0:36:17<2 days, 7:42:16]
363
+ [titan] 2026-01-06 20:54:15,059 - root - INFO - step: 34 loss: 8.1399 memory: 71.94GiB(90.77%) tps: 1,580 tflops: 144.59 mfu: 46.34%
364
+ [titan] 2026-01-06 20:54:15,059 - root - INFO - lr: 1.3672e-05 gnorm: 4.62 [ 0:36:59<2 days, 7:04:39]
365
+ [titan] 2026-01-06 20:54:56,546 - root - INFO - step: 35 loss: 8.1046 memory: 71.94GiB(90.77%) tps: 1,580 tflops: 144.53 mfu: 46.32%
366
+ [titan] 2026-01-06 20:54:56,546 - root - INFO - lr: 1.4063e-05 gnorm: 4.69 [ 0:37:40<2 days, 6:29:10]
367
+ [titan] 2026-01-06 20:55:38,070 - root - INFO - step: 36 loss: 8.0122 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.40 mfu: 46.28%
368
+ [titan] 2026-01-06 20:55:38,070 - root - INFO - lr: 1.4453e-05 gnorm: 2.75 [ 0:38:22<2 days, 5:55:41]
369
+ [titan] 2026-01-06 20:56:19,603 - root - INFO - step: 37 loss: 8.0874 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.37 mfu: 46.27%
370
+ [titan] 2026-01-06 20:56:19,603 - root - INFO - lr: 1.4844e-05 gnorm: 4.84 [ 0:39:03<2 days, 5:23:58]
371
+ [titan] 2026-01-06 20:57:01,138 - root - INFO - step: 38 loss: 8.0173 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.37 mfu: 46.27%
372
+ [titan] 2026-01-06 20:57:01,138 - root - INFO - lr: 1.5234e-05 gnorm: 3.98 [ 0:39:45<2 days, 4:53:54]
373
+ [titan] 2026-01-06 20:57:42,670 - root - INFO - step: 39 loss: 8.0002 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.37 mfu: 46.27%
374
+ [titan] 2026-01-06 20:57:42,671 - root - INFO - lr: 1.5625e-05 gnorm: 3.81 [ 0:40:26<2 days, 4:25:20]
375
+ [titan] 2026-01-06 20:58:24,204 - root - INFO - step: 40 loss: 7.9606 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.37 mfu: 46.27%
376
+ [titan] 2026-01-06 20:58:24,204 - root - INFO - lr: 1.6016e-05 gnorm: 2.86 [ 0:41:08<2 days, 3:58:09]
377
+ [titan] 2026-01-06 20:59:05,739 - root - INFO - step: 41 loss: 7.9773 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.36 mfu: 46.27%
378
+ [titan] 2026-01-06 20:59:05,739 - root - INFO - lr: 1.6406e-05 gnorm: 3.56 [ 0:41:49<2 days, 3:32:16]
379
+ [titan] 2026-01-06 20:59:47,256 - root - INFO - step: 42 loss: 7.9890 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.43 mfu: 46.29%
380
+ [titan] 2026-01-06 20:59:47,256 - root - INFO - lr: 1.6797e-05 gnorm: 4.75 [ 0:42:31<2 days, 3:07:34]
381
+ [titan] 2026-01-06 21:00:28,788 - root - INFO - step: 43 loss: 7.9018 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.37 mfu: 46.27%
382
+ [titan] 2026-01-06 21:00:28,789 - root - INFO - lr: 1.7188e-05 gnorm: 3.48 [ 0:43:12<2 days, 2:44:00]
383
+ [titan] 2026-01-06 21:01:10,328 - root - INFO - step: 44 loss: 7.8441 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.35 mfu: 46.26%
384
+ [titan] 2026-01-06 21:01:10,328 - root - INFO - lr: 1.7578e-05 gnorm: 3.89 [ 0:43:54<2 days, 2:21:29]
385
+ [titan] 2026-01-06 21:01:51,869 - root - INFO - step: 45 loss: 7.8679 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.34 mfu: 46.26%
386
+ [titan] 2026-01-06 21:01:51,869 - root - INFO - lr: 1.7969e-05 gnorm: 6.41 [ 0:44:35<2 days, 1:59:56]
387
+ [titan] 2026-01-06 21:02:33,408 - root - INFO - step: 46 loss: 7.7830 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.35 mfu: 46.26%
388
+ [titan] 2026-01-06 21:02:33,408 - root - INFO - lr: 1.8359e-05 gnorm: 3.52 [ 0:45:17<2 days, 1:39:17]
389
+ [titan] 2026-01-06 21:03:14,961 - root - INFO - step: 47 loss: 7.8372 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.30 mfu: 46.25%
390
+ [titan] 2026-01-06 21:03:14,961 - root - INFO - lr: 1.8750e-05 gnorm: 2.22 [ 0:45:58<2 days, 1:19:30]
391
+ [titan] 2026-01-06 21:03:56,497 - root - INFO - step: 48 loss: 7.8147 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.36 mfu: 46.27%
392
+ [titan] 2026-01-06 21:03:56,497 - root - INFO - lr: 1.9141e-05 gnorm: 3.70 [ 0:46:40<2 days, 1:00:30]
393
+ [titan] 2026-01-06 21:03:56,497 - root - INFO - Saving the checkpoint (or staging if async is enabled).
394
+ [titan] 2026-01-06 21:04:16,564 - root - INFO - [GC] GC collection invoked by checkpointer. 0.18 seconds.
395
+ [titan] 2026-01-06 21:04:16,565 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 20.07 seconds.
396
+ [titan] 2026-01-06 21:04:57,970 - root - INFO - step: 49 loss: 7.6970 memory: 71.94GiB(90.77%) tps: 1,066 tflops: 97.54 mfu: 31.26%
397
+ [titan] 2026-01-06 21:04:57,970 - root - INFO - lr: 1.9531e-05 gnorm: 5.28 [ 0:47:41<2 days, 1:02:45]
398
+ [titan] 2026-01-06 21:04:57,982 - root - INFO - [GC] Peforming periodical GC collection. 0.01 seconds.
399
+ [titan] 2026-01-06 21:05:39,421 - root - INFO - step: 50 loss: 7.7536 memory: 71.94GiB(90.77%) tps: 1,581 tflops: 144.65 mfu: 46.36%
400
+ [titan] 2026-01-06 21:05:39,422 - root - INFO - lr: 1.9922e-05 gnorm: 4.06 [ 0:48:23<2 days, 0:44:42]
401
+ [titan] 2026-01-06 21:06:20,891 - root - INFO - step: 51 loss: 7.7578 memory: 71.94GiB(90.77%) tps: 1,580 tflops: 144.59 mfu: 46.34%
402
+ [titan] 2026-01-06 21:06:20,891 - root - INFO - lr: 2.0313e-05 gnorm: 5.03 [ 0:49:04<2 days, 0:27:20]
403
+ [titan] 2026-01-06 21:07:02,402 - root - INFO - step: 52 loss: 7.7586 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.45 mfu: 46.30%
404
+ [titan] 2026-01-06 21:07:02,402 - root - INFO - lr: 2.0703e-05 gnorm: 2.52 [ 0:49:46<2 days, 0:10:40]
405
+ [titan] 2026-01-06 21:07:43,930 - root - INFO - step: 53 loss: 7.7823 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.38 mfu: 46.28%
406
+ [titan] 2026-01-06 21:07:43,930 - root - INFO - lr: 2.1094e-05 gnorm: 11.69 [ 0:50:27<1 day, 23:54:37]
407
+ [titan] 2026-01-06 21:08:25,460 - root - INFO - step: 54 loss: 7.7454 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.38 mfu: 46.28%
408
+ [titan] 2026-01-06 21:08:25,460 - root - INFO - lr: 2.1484e-05 gnorm: 10.25 [ 0:51:09<1 day, 23:39:08]
409
+ [titan] 2026-01-06 21:09:07,002 - root - INFO - step: 55 loss: 7.6959 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.34 mfu: 46.26%
410
+ [titan] 2026-01-06 21:09:07,002 - root - INFO - lr: 2.1875e-05 gnorm: 3.77 [ 0:51:50<1 day, 23:24:12]
411
+ [titan] 2026-01-06 21:09:48,535 - root - INFO - step: 56 loss: 7.7100 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.37 mfu: 46.27%
412
+ [titan] 2026-01-06 21:09:48,536 - root - INFO - lr: 2.2266e-05 gnorm: 5.50 [ 0:52:32<1 day, 23:09:45]
413
+ [titan] 2026-01-06 21:10:30,084 - root - INFO - step: 57 loss: 7.6427 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.32 mfu: 46.26%
414
+ [titan] 2026-01-06 21:10:30,084 - root - INFO - lr: 2.2656e-05 gnorm: 3.45 [ 0:53:14<1 day, 22:55:49]
415
+ [titan] 2026-01-06 21:11:11,627 - root - INFO - step: 58 loss: 7.7081 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.33 mfu: 46.26%
416
+ [titan] 2026-01-06 21:11:11,628 - root - INFO - lr: 2.3047e-05 gnorm: 7.88 [ 0:53:55<1 day, 22:42:20]
417
+ [titan] 2026-01-06 21:11:53,169 - root - INFO - step: 59 loss: 7.6955 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.34 mfu: 46.26%
418
+ [titan] 2026-01-06 21:11:53,169 - root - INFO - lr: 2.3438e-05 gnorm: 7.16 [ 0:54:37<1 day, 22:29:17]
419
+ [titan] 2026-01-06 21:12:34,708 - root - INFO - step: 60 loss: 7.6458 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.35 mfu: 46.27%
420
+ [titan] 2026-01-06 21:12:34,708 - root - INFO - lr: 2.3828e-05 gnorm: 3.22 [ 0:55:18<1 day, 22:16:38]
421
+ [titan] 2026-01-06 21:13:16,244 - root - INFO - step: 61 loss: 7.6709 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.36 mfu: 46.27%
422
+ [titan] 2026-01-06 21:13:16,244 - root - INFO - lr: 2.4219e-05 gnorm: 7.56 [ 0:56:00<1 day, 22:04:23]
423
+ [titan] 2026-01-06 21:13:57,793 - root - INFO - step: 62 loss: 7.6777 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.31 mfu: 46.25%
424
+ [titan] 2026-01-06 21:13:57,793 - root - INFO - lr: 2.4609e-05 gnorm: 5.00 [ 0:56:41<1 day, 21:52:30]
425
+ [titan] 2026-01-06 21:14:39,339 - root - INFO - step: 63 loss: 7.6421 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.32 mfu: 46.26%
426
+ [titan] 2026-01-06 21:14:39,340 - root - INFO - lr: 2.5000e-05 gnorm: 6.81 [ 0:57:23<1 day, 21:40:59]
427
+ [titan] 2026-01-06 21:15:20,872 - root - INFO - step: 64 loss: 7.6401 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.37 mfu: 46.27%
428
+ [titan] 2026-01-06 21:15:20,872 - root - INFO - lr: 2.5391e-05 gnorm: 6.72 [ 0:58:04<1 day, 21:29:48]
429
+ [titan] 2026-01-06 21:15:20,872 - root - INFO - Saving the checkpoint (or staging if async is enabled).
430
+ [titan] 2026-01-06 21:15:41,933 - root - INFO - [GC] GC collection invoked by checkpointer. 0.17 seconds.
431
+ [titan] 2026-01-06 21:15:41,934 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 21.06 seconds.
432
+ [titan] 2026-01-06 21:16:23,249 - root - INFO - step: 65 loss: 7.6475 memory: 71.94GiB(90.77%) tps: 1,051 tflops: 96.13 mfu: 30.81%
433
+ [titan] 2026-01-06 21:16:23,249 - root - INFO - lr: 2.5781e-05 gnorm: 5.00 [ 0:59:07<1 day, 21:35:00]
434
+ [titan] 2026-01-06 21:17:04,689 - root - INFO - step: 66 loss: 7.7008 memory: 71.94GiB(90.77%) tps: 1,581 tflops: 144.69 mfu: 46.38%
435
+ [titan] 2026-01-06 21:17:04,689 - root - INFO - lr: 2.6172e-05 gnorm: 9.69 [ 0:59:48<1 day, 21:24:07]
436
+ [titan] 2026-01-06 21:17:46,152 - root - INFO - step: 67 loss: 7.6772 memory: 71.94GiB(90.77%) tps: 1,581 tflops: 144.61 mfu: 46.35%
437
+ [titan] 2026-01-06 21:17:46,153 - root - INFO - lr: 2.6563e-05 gnorm: 8.06 [ 1:00:30<1 day, 21:13:34]
438
+ [titan] 2026-01-06 21:18:27,650 - root - INFO - step: 68 loss: 7.6251 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.49 mfu: 46.31%
439
+ [titan] 2026-01-06 21:18:27,651 - root - INFO - lr: 2.6953e-05 gnorm: 7.88 [ 1:01:11<1 day, 21:03:19]
440
+ [titan] 2026-01-06 21:19:09,166 - root - INFO - step: 69 loss: 7.6183 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.43 mfu: 46.29%
441
+ [titan] 2026-01-06 21:19:09,166 - root - INFO - lr: 2.7344e-05 gnorm: 4.00 [ 1:01:53<1 day, 20:53:22]
442
+ [titan] 2026-01-06 21:19:50,686 - root - INFO - step: 70 loss: 7.6535 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.41 mfu: 46.29%
443
+ [titan] 2026-01-06 21:19:50,686 - root - INFO - lr: 2.7734e-05 gnorm: 17.75 [ 1:02:34<1 day, 20:43:41]
444
+ [titan] 2026-01-06 21:20:32,220 - root - INFO - step: 71 loss: 7.6713 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.37 mfu: 46.27%
445
+ [titan] 2026-01-06 21:20:32,221 - root - INFO - lr: 2.8125e-05 gnorm: 15.69 [ 1:03:16<1 day, 20:34:16]
446
+ [titan] 2026-01-06 21:21:13,759 - root - INFO - step: 72 loss: 7.5969 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.35 mfu: 46.27%
447
+ [titan] 2026-01-06 21:21:13,759 - root - INFO - lr: 2.8516e-05 gnorm: 5.00 [ 1:03:57<1 day, 20:25:05]
448
+ [titan] 2026-01-06 21:21:55,296 - root - INFO - step: 73 loss: 7.6514 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.35 mfu: 46.27%
449
+ [titan] 2026-01-06 21:21:55,296 - root - INFO - lr: 2.8906e-05 gnorm: 7.84 [ 1:04:39<1 day, 20:16:09]
450
+ [titan] 2026-01-06 21:22:36,834 - root - INFO - step: 74 loss: 7.6118 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.35 mfu: 46.27%
451
+ [titan] 2026-01-06 21:22:36,834 - root - INFO - lr: 2.9297e-05 gnorm: 5.53 [ 1:05:20<1 day, 20:07:26]
452
+ [titan] 2026-01-06 21:23:18,373 - root - INFO - step: 75 loss: 7.6545 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.35 mfu: 46.26%
453
+ [titan] 2026-01-06 21:23:18,373 - root - INFO - lr: 2.9687e-05 gnorm: 14.88 [ 1:06:02<1 day, 19:58:55]
454
+ [titan] 2026-01-06 21:23:59,909 - root - INFO - step: 76 loss: 7.6091 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.36 mfu: 46.27%
455
+ [titan] 2026-01-06 21:23:59,909 - root - INFO - lr: 3.0078e-05 gnorm: 15.25 [ 1:06:43<1 day, 19:50:37]
456
+ [titan] 2026-01-06 21:24:41,441 - root - INFO - step: 77 loss: 7.5815 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.37 mfu: 46.27%
457
+ [titan] 2026-01-06 21:24:41,442 - root - INFO - lr: 3.0469e-05 gnorm: 4.84 [ 1:07:25<1 day, 19:42:31]
458
+ [titan] 2026-01-06 21:25:22,983 - root - INFO - step: 78 loss: 7.6119 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.34 mfu: 46.26%
459
+ [titan] 2026-01-06 21:25:22,983 - root - INFO - lr: 3.0859e-05 gnorm: 9.06 [ 1:08:06<1 day, 19:34:36]
460
+ [titan] 2026-01-06 21:26:04,516 - root - INFO - step: 79 loss: 7.6418 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.37 mfu: 46.27%
461
+ [titan] 2026-01-06 21:26:04,516 - root - INFO - lr: 3.1250e-05 gnorm: 8.25 [ 1:08:48<1 day, 19:26:52]
462
+ [titan] 2026-01-06 21:26:46,049 - root - INFO - step: 80 loss: 7.5575 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.37 mfu: 46.27%
463
+ [titan] 2026-01-06 21:26:46,049 - root - INFO - lr: 3.1641e-05 gnorm: 6.97 [ 1:09:30<1 day, 19:19:19]
464
+ [titan] 2026-01-06 21:26:46,049 - root - INFO - Saving the checkpoint (or staging if async is enabled).
465
+ [titan] 2026-01-06 21:27:08,364 - root - INFO - [GC] GC collection invoked by checkpointer. 0.23 seconds.
466
+ [titan] 2026-01-06 21:27:08,364 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 22.31 seconds.
467
+ [titan] 2026-01-06 21:27:49,686 - root - INFO - step: 81 loss: 7.6005 memory: 71.94GiB(90.77%) tps: 1,030 tflops: 94.22 mfu: 30.20%
468
+ [titan] 2026-01-06 21:27:49,686 - root - INFO - lr: 3.2031e-05 gnorm: 7.19 [ 1:10:33<1 day, 19:25:31]
469
+ [titan] 2026-01-06 21:28:31,108 - root - INFO - step: 82 loss: 7.5774 memory: 71.94GiB(90.77%) tps: 1,582 tflops: 144.76 mfu: 46.40%
470
+ [titan] 2026-01-06 21:28:31,108 - root - INFO - lr: 3.2422e-05 gnorm: 5.62 [ 1:11:15<1 day, 19:18:04]
471
+ [titan] 2026-01-06 21:29:12,555 - root - INFO - step: 83 loss: 7.6207 memory: 71.94GiB(90.77%) tps: 1,581 tflops: 144.67 mfu: 46.37%
472
+ [titan] 2026-01-06 21:29:12,555 - root - INFO - lr: 3.2813e-05 gnorm: 4.69 [ 1:11:56<1 day, 19:10:47]
473
+ [titan] 2026-01-06 21:29:54,023 - root - INFO - step: 84 loss: 7.5734 memory: 71.94GiB(90.77%) tps: 1,580 tflops: 144.59 mfu: 46.34%
474
+ [titan] 2026-01-06 21:29:54,024 - root - INFO - lr: 3.3203e-05 gnorm: 10.75 [ 1:12:38<1 day, 19:03:40]
475
+ [titan] 2026-01-06 21:30:35,519 - root - INFO - step: 85 loss: 7.5241 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.50 mfu: 46.31%
476
+ [titan] 2026-01-06 21:30:35,520 - root - INFO - lr: 3.3594e-05 gnorm: 8.69 [ 1:13:19<1 day, 18:56:43]
477
+ [titan] 2026-01-06 21:31:17,030 - root - INFO - step: 86 loss: 7.5827 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.45 mfu: 46.30%
478
+ [titan] 2026-01-06 21:31:17,030 - root - INFO - lr: 3.3984e-05 gnorm: 7.22 [ 1:14:01<1 day, 18:49:55]
479
+ [titan] 2026-01-06 21:31:58,543 - root - INFO - step: 87 loss: 7.5505 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.44 mfu: 46.29%
480
+ [titan] 2026-01-06 21:31:58,543 - root - INFO - lr: 3.4375e-05 gnorm: 7.91 [ 1:14:42<1 day, 18:43:16]
481
+ [titan] 2026-01-06 21:32:40,071 - root - INFO - step: 88 loss: 7.5143 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.39 mfu: 46.28%
482
+ [titan] 2026-01-06 21:32:40,071 - root - INFO - lr: 3.4766e-05 gnorm: 8.00 [ 1:15:24<1 day, 18:36:46]
483
+ [titan] 2026-01-06 21:33:21,599 - root - INFO - step: 89 loss: 7.5199 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.39 mfu: 46.28%
484
+ [titan] 2026-01-06 21:33:21,599 - root - INFO - lr: 3.5156e-05 gnorm: 8.62 [ 1:16:05<1 day, 18:30:23]
485
+ [titan] 2026-01-06 21:34:03,122 - root - INFO - step: 90 loss: 7.4785 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.40 mfu: 46.28%
486
+ [titan] 2026-01-06 21:34:03,122 - root - INFO - lr: 3.5547e-05 gnorm: 8.12 [ 1:16:47<1 day, 18:24:08]
487
+ [titan] 2026-01-06 21:34:44,655 - root - INFO - step: 91 loss: 7.5003 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.37 mfu: 46.27%
488
+ [titan] 2026-01-06 21:34:44,655 - root - INFO - lr: 3.5937e-05 gnorm: 6.97 [ 1:17:28<1 day, 18:18:00]
489
+ [titan] 2026-01-06 21:35:26,183 - root - INFO - step: 92 loss: 7.5113 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.39 mfu: 46.28%
490
+ [titan] 2026-01-06 21:35:26,183 - root - INFO - lr: 3.6328e-05 gnorm: 10.19 [ 1:18:10<1 day, 18:12:00]
491
+ [titan] 2026-01-06 21:36:07,712 - root - INFO - step: 93 loss: 7.4875 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.38 mfu: 46.28%
492
+ [titan] 2026-01-06 21:36:07,712 - root - INFO - lr: 3.6719e-05 gnorm: 4.59 [ 1:18:51<1 day, 18:06:06]
493
+ [titan] 2026-01-06 21:36:49,202 - root - INFO - step: 94 loss: 7.8691 memory: 71.94GiB(90.77%) tps: 1,580 tflops: 144.52 mfu: 46.32%
494
+ [titan] 2026-01-06 21:36:49,202 - root - INFO - lr: 3.7109e-05 gnorm: 86.50 [ 1:19:33<1 day, 18:00:18]
495
+ [titan] 2026-01-06 21:37:30,710 - root - INFO - step: 95 loss: 7.7993 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.45 mfu: 46.30%
496
+ [titan] 2026-01-06 21:37:30,710 - root - INFO - lr: 3.7500e-05 gnorm: 62.50 [ 1:20:14<1 day, 17:54:36]
497
+ [titan] 2026-01-06 21:38:12,247 - root - INFO - step: 96 loss: 7.6230 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.35 mfu: 46.27%
498
+ [titan] 2026-01-06 21:38:12,248 - root - INFO - lr: 3.7891e-05 gnorm: 17.38 [ 1:20:56<1 day, 17:49:02]
499
+ [titan] 2026-01-06 21:38:12,248 - root - INFO - Saving the checkpoint (or staging if async is enabled).
500
+ [titan] 2026-01-06 21:38:32,931 - root - INFO - [GC] GC collection invoked by checkpointer. 0.18 seconds.
501
+ [titan] 2026-01-06 21:38:32,931 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 20.68 seconds.
502
+ [titan] 2026-01-06 21:39:14,269 - root - INFO - step: 97 loss: 7.5778 memory: 71.94GiB(90.77%) tps: 1,057 tflops: 96.68 mfu: 30.99%
503
+ [titan] 2026-01-06 21:39:14,269 - root - INFO - lr: 3.8281e-05 gnorm: 17.75 [ 1:21:58<1 day, 17:54:02]
504
+ [titan] 2026-01-06 21:39:55,690 - root - INFO - step: 98 loss: 7.5438 memory: 71.94GiB(90.77%) tps: 1,582 tflops: 144.76 mfu: 46.40%
505
+ [titan] 2026-01-06 21:39:55,690 - root - INFO - lr: 3.8672e-05 gnorm: 11.75 [ 1:22:39<1 day, 17:48:30]
506
+ [titan] 2026-01-06 21:40:37,180 - root - INFO - step: 99 loss: 7.5091 memory: 71.94GiB(90.77%) tps: 1,580 tflops: 144.52 mfu: 46.32%
507
+ [titan] 2026-01-06 21:40:37,180 - root - INFO - lr: 3.9063e-05 gnorm: 7.81 [ 1:23:21<1 day, 17:43:06]
508
+ [titan] 2026-01-06 21:40:37,204 - root - INFO - [GC] Peforming periodical GC collection. 0.02 seconds.
509
+ [titan] 2026-01-06 21:41:18,706 - root - INFO - step: 100 loss: 7.4961 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.39 mfu: 46.28%
510
+ [titan] 2026-01-06 21:41:18,706 - root - INFO - lr: 3.9453e-05 gnorm: 7.59 [ 1:24:02<1 day, 17:37:48]
511
+ [titan] 2026-01-06 21:42:00,228 - root - INFO - step: 101 loss: 7.4848 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.41 mfu: 46.28%
512
+ [titan] 2026-01-06 21:42:00,228 - root - INFO - lr: 3.9844e-05 gnorm: 5.97 [ 1:24:44<1 day, 17:32:35]
513
+ [titan] 2026-01-06 21:42:41,739 - root - INFO - step: 102 loss: 7.5118 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.45 mfu: 46.30%
514
+ [titan] 2026-01-06 21:42:41,739 - root - INFO - lr: 4.0234e-05 gnorm: 8.06 [ 1:25:25<1 day, 17:27:28]
515
+ [titan] 2026-01-06 21:43:23,264 - root - INFO - step: 103 loss: 7.4788 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.39 mfu: 46.28%
516
+ [titan] 2026-01-06 21:43:23,265 - root - INFO - lr: 4.0625e-05 gnorm: 10.06 [ 1:26:07<1 day, 17:22:26]
517
+ [titan] 2026-01-06 21:44:04,785 - root - INFO - step: 104 loss: 7.4560 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.41 mfu: 46.29%
518
+ [titan] 2026-01-06 21:44:04,786 - root - INFO - lr: 4.1016e-05 gnorm: 9.50 [ 1:26:48<1 day, 17:17:29]
519
+ [titan] 2026-01-06 21:44:46,319 - root - INFO - step: 105 loss: 7.4534 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.37 mfu: 46.27%
520
+ [titan] 2026-01-06 21:44:46,319 - root - INFO - lr: 4.1406e-05 gnorm: 8.44 [ 1:27:30<1 day, 17:12:38]
521
+ [titan] 2026-01-06 21:45:27,837 - root - INFO - step: 106 loss: 7.4770 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.42 mfu: 46.29%
522
+ [titan] 2026-01-06 21:45:27,838 - root - INFO - lr: 4.1797e-05 gnorm: 10.56 [ 1:28:11<1 day, 17:07:50]
523
+ [titan] 2026-01-06 21:46:09,374 - root - INFO - step: 107 loss: 7.4382 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.36 mfu: 46.27%
524
+ [titan] 2026-01-06 21:46:09,374 - root - INFO - lr: 4.2188e-05 gnorm: 13.69 [ 1:28:53<1 day, 17:03:08]
525
+ [titan] 2026-01-06 21:46:50,902 - root - INFO - step: 108 loss: 7.4561 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.39 mfu: 46.28%
526
+ [titan] 2026-01-06 21:46:50,902 - root - INFO - lr: 4.2578e-05 gnorm: 8.69 [ 1:29:34<1 day, 16:58:30]
527
+ [titan] 2026-01-06 21:47:32,443 - root - INFO - step: 109 loss: 7.3967 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.34 mfu: 46.26%
528
+ [titan] 2026-01-06 21:47:32,443 - root - INFO - lr: 4.2969e-05 gnorm: 7.31 [ 1:30:16<1 day, 16:53:56]
529
+ [titan] 2026-01-06 21:48:13,976 - root - INFO - step: 110 loss: 7.4334 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.37 mfu: 46.27%
530
+ [titan] 2026-01-06 21:48:13,976 - root - INFO - lr: 4.3359e-05 gnorm: 25.38 [ 1:30:57<1 day, 16:49:27]
531
+ [titan] 2026-01-06 21:48:55,511 - root - INFO - step: 111 loss: 7.4360 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.36 mfu: 46.27%
532
+ [titan] 2026-01-06 21:48:55,511 - root - INFO - lr: 4.3750e-05 gnorm: 10.44 [ 1:31:39<1 day, 16:45:02]
533
+ [titan] 2026-01-06 21:49:37,059 - root - INFO - step: 112 loss: 7.5123 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.32 mfu: 46.26%
534
+ [titan] 2026-01-06 21:49:37,059 - root - INFO - lr: 4.4141e-05 gnorm: 16.88 [ 1:32:21<1 day, 16:40:41]
535
+ [titan] 2026-01-06 21:49:37,059 - root - INFO - Saving the checkpoint (or staging if async is enabled).
536
+ [titan] 2026-01-06 21:49:59,585 - root - INFO - [GC] GC collection invoked by checkpointer. 0.15 seconds.
537
+ [titan] 2026-01-06 21:49:59,585 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 22.53 seconds.
538
+ [titan] 2026-01-06 21:50:40,891 - root - INFO - step: 113 loss: 7.4803 memory: 71.94GiB(90.77%) tps: 1,027 tflops: 93.93 mfu: 30.11%
539
+ [titan] 2026-01-06 21:50:40,892 - root - INFO - lr: 4.4531e-05 gnorm: 13.06 [ 1:33:24<1 day, 16:46:07]
540
+ [titan] 2026-01-06 21:51:22,305 - root - INFO - step: 114 loss: 7.4859 memory: 71.94GiB(90.77%) tps: 1,582 tflops: 144.79 mfu: 46.41%
541
+ [titan] 2026-01-06 21:51:22,305 - root - INFO - lr: 4.4922e-05 gnorm: 16.50 [ 1:34:06<1 day, 16:41:45]
542
+ [titan] 2026-01-06 21:52:03,747 - root - INFO - step: 115 loss: 7.4151 memory: 71.94GiB(90.77%) tps: 1,581 tflops: 144.68 mfu: 46.37%
543
+ [titan] 2026-01-06 21:52:03,748 - root - INFO - lr: 4.5313e-05 gnorm: 13.94 [ 1:34:47<1 day, 16:37:28]
544
+ [titan] 2026-01-06 21:52:45,252 - root - INFO - step: 116 loss: 7.3814 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.47 mfu: 46.30%
545
+ [titan] 2026-01-06 21:52:45,252 - root - INFO - lr: 4.5703e-05 gnorm: 11.69 [ 1:35:29<1 day, 16:33:16]
546
+ [titan] 2026-01-06 21:53:26,760 - root - INFO - step: 117 loss: 7.4033 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.46 mfu: 46.30%
547
+ [titan] 2026-01-06 21:53:26,760 - root - INFO - lr: 4.6094e-05 gnorm: 9.31 [ 1:36:10<1 day, 16:29:07]
548
+ [titan] 2026-01-06 21:54:08,279 - root - INFO - step: 118 loss: 7.4721 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.42 mfu: 46.29%
549
+ [titan] 2026-01-06 21:54:08,279 - root - INFO - lr: 4.6484e-05 gnorm: 20.88 [ 1:36:52<1 day, 16:25:03]
550
+ [titan] 2026-01-06 21:54:49,813 - root - INFO - step: 119 loss: 7.4258 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.36 mfu: 46.27%
551
+ [titan] 2026-01-06 21:54:49,813 - root - INFO - lr: 4.6875e-05 gnorm: 16.62 [ 1:37:33<1 day, 16:21:02]
552
+ [titan] 2026-01-06 21:55:31,360 - root - INFO - step: 120 loss: 7.3951 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.32 mfu: 46.26%
553
+ [titan] 2026-01-06 21:55:31,360 - root - INFO - lr: 4.7266e-05 gnorm: 11.38 [ 1:38:15<1 day, 16:17:04]
554
+ [titan] 2026-01-06 21:56:12,904 - root - INFO - step: 121 loss: 7.3984 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.33 mfu: 46.26%
555
+ [titan] 2026-01-06 21:56:12,904 - root - INFO - lr: 4.7656e-05 gnorm: 10.19 [ 1:38:56<1 day, 16:13:10]
556
+ [titan] 2026-01-06 21:56:54,444 - root - INFO - step: 122 loss: 7.5098 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.34 mfu: 46.26%
557
+ [titan] 2026-01-06 21:56:54,444 - root - INFO - lr: 4.8047e-05 gnorm: 19.38 [ 1:39:38<1 day, 16:09:19]
558
+ [titan] 2026-01-06 21:57:35,983 - root - INFO - step: 123 loss: 7.4071 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.35 mfu: 46.27%
559
+ [titan] 2026-01-06 21:57:35,983 - root - INFO - lr: 4.8438e-05 gnorm: 13.25 [ 1:40:19<1 day, 16:05:31]
560
+ [titan] 2026-01-06 21:58:17,525 - root - INFO - step: 124 loss: 7.4271 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.34 mfu: 46.26%
561
+ [titan] 2026-01-06 21:58:17,525 - root - INFO - lr: 4.8828e-05 gnorm: 11.88 [ 1:41:01<1 day, 16:01:46]
562
+ [titan] 2026-01-06 21:58:59,075 - root - INFO - step: 125 loss: 7.3603 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.31 mfu: 46.25%
563
+ [titan] 2026-01-06 21:58:59,075 - root - INFO - lr: 4.9219e-05 gnorm: 11.50 [ 1:41:43<1 day, 15:58:05]
564
+ [titan] 2026-01-06 21:59:40,618 - root - INFO - step: 126 loss: 7.3625 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.33 mfu: 46.26%
565
+ [titan] 2026-01-06 21:59:40,619 - root - INFO - lr: 4.9609e-05 gnorm: 9.88 [ 1:42:24<1 day, 15:54:26]
566
+ [titan] 2026-01-06 22:00:22,155 - root - INFO - step: 127 loss: 7.3691 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.36 mfu: 46.27%
567
+ [titan] 2026-01-06 22:00:22,156 - root - INFO - lr: 5.0000e-05 gnorm: 11.88 [ 1:43:06<1 day, 15:50:49]
568
+ [titan] 2026-01-06 22:01:03,694 - root - INFO - step: 128 loss: 7.3331 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.35 mfu: 46.27%
569
+ [titan] 2026-01-06 22:01:03,694 - root - INFO - lr: 5.0391e-05 gnorm: 11.56 [ 1:43:47<1 day, 15:47:16]
570
+ [titan] 2026-01-06 22:01:03,694 - root - INFO - Saving the checkpoint (or staging if async is enabled).
571
+ [titan] 2026-01-06 22:01:24,122 - root - INFO - [GC] GC collection invoked by checkpointer. 0.25 seconds.
572
+ [titan] 2026-01-06 22:01:24,122 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 20.43 seconds.
573
+ [titan] 2026-01-06 22:02:05,453 - root - INFO - step: 129 loss: 7.2878 memory: 71.94GiB(90.77%) tps: 1,061 tflops: 97.09 mfu: 31.12%
574
+ [titan] 2026-01-06 22:02:05,454 - root - INFO - lr: 5.0781e-05 gnorm: 6.16 [ 1:44:49<1 day, 15:51:26]
575
+ [titan] 2026-01-06 22:02:46,875 - root - INFO - step: 130 loss: 7.7017 memory: 71.94GiB(90.77%) tps: 1,582 tflops: 144.76 mfu: 46.40%
576
+ [titan] 2026-01-06 22:02:46,876 - root - INFO - lr: 5.1172e-05 gnorm: 70.00 [ 1:45:30<1 day, 15:47:51]
577
+ [titan] 2026-01-06 22:03:28,339 - root - INFO - step: 131 loss: 7.5220 memory: 71.94GiB(90.77%) tps: 1,581 tflops: 144.61 mfu: 46.35%
578
+ [titan] 2026-01-06 22:03:28,339 - root - INFO - lr: 5.1562e-05 gnorm: 44.75 [ 1:46:12<1 day, 15:44:20]
579
+ [titan] 2026-01-06 22:04:09,859 - root - INFO - step: 132 loss: 7.4566 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.42 mfu: 46.29%
580
+ [titan] 2026-01-06 22:04:09,859 - root - INFO - lr: 5.1953e-05 gnorm: 13.50 [ 1:46:53<1 day, 15:40:53]
581
+ [titan] 2026-01-06 22:04:51,387 - root - INFO - step: 133 loss: 7.4026 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.39 mfu: 46.28%
582
+ [titan] 2026-01-06 22:04:51,387 - root - INFO - lr: 5.2344e-05 gnorm: 10.12 [ 1:47:35<1 day, 15:37:28]
583
+ [titan] 2026-01-06 22:05:32,919 - root - INFO - step: 134 loss: 7.4092 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.37 mfu: 46.27%
584
+ [titan] 2026-01-06 22:05:32,920 - root - INFO - lr: 5.2734e-05 gnorm: 14.88 [ 1:48:16<1 day, 15:34:06]
585
+ [titan] 2026-01-06 22:06:14,471 - root - INFO - step: 135 loss: 7.3827 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.30 mfu: 46.25%
586
+ [titan] 2026-01-06 22:06:14,471 - root - INFO - lr: 5.3125e-05 gnorm: 18.88 [ 1:48:58<1 day, 15:30:47]
587
+ [titan] 2026-01-06 22:06:56,027 - root - INFO - step: 136 loss: 7.4021 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.29 mfu: 46.25%
588
+ [titan] 2026-01-06 22:06:56,027 - root - INFO - lr: 5.3516e-05 gnorm: 12.81 [ 1:49:39<1 day, 15:27:30]
589
+ [titan] 2026-01-06 22:07:37,581 - root - INFO - step: 137 loss: 7.4064 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.30 mfu: 46.25%
590
+ [titan] 2026-01-06 22:07:37,581 - root - INFO - lr: 5.3906e-05 gnorm: 7.19 [ 1:50:21<1 day, 15:24:15]
591
+ [titan] 2026-01-06 22:08:19,129 - root - INFO - step: 138 loss: 7.4774 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.32 mfu: 46.26%
592
+ [titan] 2026-01-06 22:08:19,129 - root - INFO - lr: 5.4297e-05 gnorm: 22.62 [ 1:51:03<1 day, 15:21:02]
593
+ [titan] 2026-01-06 22:09:00,687 - root - INFO - step: 139 loss: 7.4281 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.28 mfu: 46.24%
594
+ [titan] 2026-01-06 22:09:00,688 - root - INFO - lr: 5.4688e-05 gnorm: 11.00 [ 1:51:44<1 day, 15:17:52]
595
+ [titan] 2026-01-06 22:09:42,228 - root - INFO - step: 140 loss: 7.5633 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.34 mfu: 46.26%
596
+ [titan] 2026-01-06 22:09:42,228 - root - INFO - lr: 5.5078e-05 gnorm: 19.75 [ 1:52:26<1 day, 15:14:44]
597
+ [titan] 2026-01-06 22:10:23,790 - root - INFO - step: 141 loss: 7.5423 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.27 mfu: 46.24%
598
+ [titan] 2026-01-06 22:10:23,790 - root - INFO - lr: 5.5469e-05 gnorm: 17.25 [ 1:53:07<1 day, 15:11:38]
599
+ [titan] 2026-01-06 22:11:05,349 - root - INFO - step: 142 loss: 7.4047 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.28 mfu: 46.24%
600
+ [titan] 2026-01-06 22:11:05,350 - root - INFO - lr: 5.5859e-05 gnorm: 9.94 [ 1:53:49<1 day, 15:08:34]
601
+ [titan] 2026-01-06 22:11:46,904 - root - INFO - step: 143 loss: 7.5261 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.29 mfu: 46.25%
602
+ [titan] 2026-01-06 22:11:46,905 - root - INFO - lr: 5.6250e-05 gnorm: 25.75 [ 1:54:30<1 day, 15:05:32]
603
+ [titan] 2026-01-06 22:12:28,461 - root - INFO - step: 144 loss: 7.4217 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.29 mfu: 46.25%
604
+ [titan] 2026-01-06 22:12:28,461 - root - INFO - lr: 5.6641e-05 gnorm: 18.00 [ 1:55:12<1 day, 15:02:32]
605
+ [titan] 2026-01-06 22:12:28,461 - root - INFO - Saving the checkpoint (or staging if async is enabled).
606
+ [titan] 2026-01-06 22:12:50,175 - root - INFO - [GC] GC collection invoked by checkpointer. 0.19 seconds.
607
+ [titan] 2026-01-06 22:12:50,175 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 21.71 seconds.
608
+ [titan] 2026-01-06 22:13:31,510 - root - INFO - step: 145 loss: 7.3958 memory: 71.94GiB(90.77%) tps: 1,039 tflops: 95.10 mfu: 30.48%
609
+ [titan] 2026-01-06 22:13:31,510 - root - INFO - lr: 5.7031e-05 gnorm: 11.69 [ 1:56:15<1 day, 15:06:48]
610
+ [titan] 2026-01-06 22:14:12,944 - root - INFO - step: 146 loss: 7.4073 memory: 71.94GiB(90.77%) tps: 1,582 tflops: 144.71 mfu: 46.38%
611
+ [titan] 2026-01-06 22:14:12,944 - root - INFO - lr: 5.7422e-05 gnorm: 11.25 [ 1:56:56<1 day, 15:03:46]
612
+ [titan] 2026-01-06 22:14:54,370 - root - INFO - step: 147 loss: 7.3301 memory: 71.94GiB(90.77%) tps: 1,582 tflops: 144.74 mfu: 46.39%
613
+ [titan] 2026-01-06 22:14:54,371 - root - INFO - lr: 5.7813e-05 gnorm: 7.34 [ 1:57:38<1 day, 15:00:46]
614
+ [titan] 2026-01-06 22:15:35,825 - root - INFO - step: 148 loss: 7.3624 memory: 71.94GiB(90.77%) tps: 1,581 tflops: 144.64 mfu: 46.36%
615
+ [titan] 2026-01-06 22:15:35,825 - root - INFO - lr: 5.8203e-05 gnorm: 17.38 [ 1:58:19<1 day, 14:57:48]
616
+ [titan] 2026-01-06 22:16:17,356 - root - INFO - step: 149 loss: 7.2913 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.37 mfu: 46.27%
617
+ [titan] 2026-01-06 22:16:17,357 - root - INFO - lr: 5.8594e-05 gnorm: 3.80 [ 1:59:01<1 day, 14:54:54]
618
+ [titan] 2026-01-06 22:16:17,390 - root - INFO - [GC] Peforming periodical GC collection. 0.03 seconds.
619
+ [titan] 2026-01-06 22:16:58,923 - root - INFO - step: 150 loss: 7.3146 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.25 mfu: 46.23%
620
+ [titan] 2026-01-06 22:16:58,923 - root - INFO - lr: 5.8984e-05 gnorm: 7.06 [ 1:59:42<1 day, 14:52:02]
logs/none_4cvjdbqa/attempt_0/1/stdout.log ADDED
File without changes
logs/none_4cvjdbqa/attempt_0/2/stderr.log ADDED
@@ -0,0 +1,620 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [titan] 2026-01-06 20:23:28,613 - root - INFO - Starting job: default job
2
+ [titan] 2026-01-06 20:23:28,613 - root - INFO - {
3
+ "activation_checkpoint": {
4
+ "mode": "none",
5
+ "selective_ac_option": "2"
6
+ },
7
+ "activation_offload": {
8
+ "mode": "none"
9
+ },
10
+ "checkpoint": {
11
+ "async_mode": "disabled",
12
+ "create_seed_checkpoint": false,
13
+ "enable_checkpoint": true,
14
+ "exclude_from_loading": [],
15
+ "export_dtype": "float32",
16
+ "folder": "checkpoint",
17
+ "initial_load_model_weights_only": true,
18
+ "initial_load_path": null,
19
+ "interval": 16,
20
+ "interval_type": "steps",
21
+ "keep_latest_k": 0,
22
+ "last_save_model_weights_only": false,
23
+ "load_step": -1,
24
+ "model_weights_only": false
25
+ },
26
+ "comm": {
27
+ "init_timeout_seconds": 300,
28
+ "trace_buf_size": 20000,
29
+ "train_timeout_seconds": 100
30
+ },
31
+ "experimental": {
32
+ "context_parallel_degree": 1,
33
+ "context_parallel_rotate_method": "allgather",
34
+ "custom_model_path": "",
35
+ "enable_async_tensor_parallel": false,
36
+ "enable_compiled_autograd": false,
37
+ "pipeline_parallel_degree": 1,
38
+ "pipeline_parallel_microbatches": null,
39
+ "pipeline_parallel_schedule": "1F1B",
40
+ "pipeline_parallel_schedule_csv": "",
41
+ "pipeline_parallel_split_points": []
42
+ },
43
+ "fault_tolerance": {
44
+ "enable": false,
45
+ "group_size": 0,
46
+ "min_replica_size": 1,
47
+ "replica_id": 0
48
+ },
49
+ "float8": {
50
+ "enable_fsdp_float8_all_gather": false,
51
+ "force_recompute_fp8_weight_in_bwd": false,
52
+ "precompute_float8_dynamic_scale_for_fsdp": false,
53
+ "recipe_name": null
54
+ },
55
+ "job": {
56
+ "config_file": "flame/models/fla.toml",
57
+ "description": "default job",
58
+ "dump_folder": "/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B",
59
+ "print_args": true,
60
+ "use_for_integration_test": false
61
+ },
62
+ "lr_scheduler": {
63
+ "decay_ratio": null,
64
+ "decay_type": "cosine",
65
+ "lr_min": 0.1,
66
+ "warmup_steps": 1024
67
+ },
68
+ "memory_estimation": {
69
+ "disable_fake_mode": false,
70
+ "enabled": false
71
+ },
72
+ "metrics": {
73
+ "disable_color_printing": false,
74
+ "enable_tensorboard": false,
75
+ "enable_wandb": true,
76
+ "log_freq": 1,
77
+ "save_for_all_ranks": false,
78
+ "save_tb_folder": "tb"
79
+ },
80
+ "model": {
81
+ "config": "/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/config.json",
82
+ "converters": [],
83
+ "name": "fla",
84
+ "print_after_conversion": false,
85
+ "tokenizer_path": "/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B"
86
+ },
87
+ "optimizer": {
88
+ "beta1": 0.9,
89
+ "beta2": 0.95,
90
+ "early_step_in_backward": false,
91
+ "eps": 1e-15,
92
+ "implementation": "fused",
93
+ "lr": 0.0004,
94
+ "name": "AdamW",
95
+ "weight_decay": 0.1
96
+ },
97
+ "profiling": {
98
+ "enable_memory_snapshot": false,
99
+ "enable_profiling": true,
100
+ "profile_freq": 512,
101
+ "save_memory_snapshot_folder": "memory_snapshot",
102
+ "save_traces_folder": "profile_trace"
103
+ },
104
+ "training": {
105
+ "batch_size": 2,
106
+ "compile": true,
107
+ "context_len": 2048,
108
+ "data_dir": null,
109
+ "data_files": null,
110
+ "data_parallel_replicate_degree": 1,
111
+ "data_parallel_shard_degree": 8,
112
+ "data_probs": null,
113
+ "dataset": "/mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu",
114
+ "dataset_name": "default",
115
+ "dataset_split": "train",
116
+ "deterministic": false,
117
+ "disable_loss_parallel": true,
118
+ "enable_cpu_offload": false,
119
+ "fsdp_reshard_after_forward": "default",
120
+ "gc_freq": 50,
121
+ "gradient_accumulation_steps": 16,
122
+ "max_norm": 1.0,
123
+ "mixed_precision_param": "bfloat16",
124
+ "mixed_precision_reduce": "float32",
125
+ "num_workers": 8,
126
+ "persistent_workers": false,
127
+ "pin_memory": false,
128
+ "prefetch_factor": 2,
129
+ "seed": 42,
130
+ "seq_len": 2048,
131
+ "skip_nan_inf": true,
132
+ "steps": 3072,
133
+ "streaming": true,
134
+ "tensor_parallel_degree": 1,
135
+ "varlen": false
136
+ }
137
+ }
138
+ [titan] 2026-01-06 20:23:28,614 - root - INFO - [GC] Initial GC collection. 0.00 seconds.
139
+ [titan] 2026-01-06 20:23:30,013 - root - WARNING - ENV[TORCH_NCCL_ASYNC_ERROR_HANDLING] = 1 will be overridden to 3 based on job config
140
+ [titan] 2026-01-06 20:23:30,016 - root - INFO - CUDA capacity: NVIDIA A100-SXM4-80GB with 79.25GiB memory
141
+ [titan] 2026-01-06 20:23:30,018 - root - WARNING - Error running lspci: [Errno 2] No such file or directory: 'lspci', fallback to use device_name
142
+ [titan] 2026-01-06 20:23:30,018 - root - INFO - Peak FLOPS used for computing MFU: 3.120e+14
143
+ [titan] 2026-01-06 20:23:30,018 - root - INFO - Building 1-D device mesh with ['dp_shard'], [8]
144
+ [titan] 2026-01-06 20:23:30,051 - root - INFO - Loading tokenizer...
145
+ The tokenizer you are loading from '/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.
146
+ [titan] 2026-01-06 20:23:30,412 - root - INFO - Qwen2TokenizerFast(name_or_path='/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B', vocab_size=151643, model_max_length=10000000000, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'eos_token': '<|im_end|>', 'pad_token': '<|endoftext|>', 'additional_special_tokens': ['<|im_start|>', '<|im_end|>', '<|object_ref_start|>', '<|object_ref_end|>', '<|box_start|>', '<|box_end|>', '<|quad_start|>', '<|quad_end|>', '<|vision_start|>', '<|vision_end|>', '<|vision_pad|>', '<|image_pad|>', '<|video_pad|>']}, clean_up_tokenization_spaces=False, added_tokens_decoder={
147
+ 151643: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
148
+ 151644: AddedToken("<|im_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
149
+ 151645: AddedToken("<|im_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
150
+ 151646: AddedToken("<|object_ref_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
151
+ 151647: AddedToken("<|object_ref_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
152
+ 151648: AddedToken("<|box_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
153
+ 151649: AddedToken("<|box_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
154
+ 151650: AddedToken("<|quad_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
155
+ 151651: AddedToken("<|quad_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
156
+ 151652: AddedToken("<|vision_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
157
+ 151653: AddedToken("<|vision_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
158
+ 151654: AddedToken("<|vision_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
159
+ 151655: AddedToken("<|image_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
160
+ 151656: AddedToken("<|video_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
161
+ 151657: AddedToken("<tool_call>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
162
+ 151658: AddedToken("</tool_call>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
163
+ 151659: AddedToken("<|fim_prefix|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
164
+ 151660: AddedToken("<|fim_middle|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
165
+ 151661: AddedToken("<|fim_suffix|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
166
+ 151662: AddedToken("<|fim_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
167
+ 151663: AddedToken("<|repo_name|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
168
+ 151664: AddedToken("<|file_sep|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
169
+ 151665: AddedToken("<tool_response>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
170
+ 151666: AddedToken("</tool_response>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
171
+ 151667: AddedToken("<think>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
172
+ 151668: AddedToken("</think>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
173
+ }
174
+ )
175
+ [titan] 2026-01-06 20:23:30,412 - root - INFO - Loading dataset /mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu:default
176
+ `trust_remote_code` is not supported anymore.
177
+ Please check that the Hugging Face dataset '/mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu' isn't based on a loading script and remove `trust_remote_code`.
178
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
179
+ [titan] 2026-01-06 20:23:30,412 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
180
+ Please check that the Hugging Face dataset '/mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu' isn't based on a loading script and remove `trust_remote_code`.
181
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
182
+ [titan] 2026-01-06 20:23:30,977 - root - INFO - Shuffling the dataset with seed 42
183
+ [titan] 2026-01-06 20:23:30,978 - root - INFO - IterableDataset({
184
+ features: ['text', 'id', 'dump', 'url', 'file_path', 'language', 'language_score', 'token_count', 'score', 'int_score'],
185
+ num_shards: 360
186
+ })
187
+ [titan] 2026-01-06 20:23:30,978 - root - INFO - Building dataloader...
188
+ [titan] 2026-01-06 20:23:30,980 - root - INFO - Loading model config from /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/config.json
189
+ [titan] 2026-01-06 20:23:30,981 - root - INFO - Building model from the config
190
+ GSAConfig {
191
+ "architectures": [
192
+ "GSAForCausalLM"
193
+ ],
194
+ "attn": null,
195
+ "bos_token_id": 151643,
196
+ "clamp_max": null,
197
+ "clamp_min": null,
198
+ "conv_size": 4,
199
+ "dtype": "bfloat16",
200
+ "elementwise_affine": false,
201
+ "eos_token_id": 151645,
202
+ "expand_k": 1,
203
+ "expand_v": 1,
204
+ "feature_map": "swish",
205
+ "fuse_cross_entropy": true,
206
+ "fuse_linear_cross_entropy": false,
207
+ "fuse_norm": true,
208
+ "fuse_swiglu": true,
209
+ "gate_logit_normalizer": 8,
210
+ "hidden_act": "swish",
211
+ "hidden_ratio": 4,
212
+ "hidden_size": 5120,
213
+ "initializer_range": 0.02,
214
+ "intermediate_size": 17408,
215
+ "max_position_embeddings": 40960,
216
+ "model_type": "gsa",
217
+ "norm_eps": 1e-06,
218
+ "num_heads": 40,
219
+ "num_hidden_layers": 40,
220
+ "num_kv_heads": 8,
221
+ "num_slots": 256,
222
+ "rope_theta": 1000000,
223
+ "share_conv_kernel": true,
224
+ "tie_word_embeddings": true,
225
+ "transformers_version": "4.57.3",
226
+ "use_cache": true,
227
+ "use_l2warp": false,
228
+ "use_norm": true,
229
+ "use_output_gate": true,
230
+ "use_rope": false,
231
+ "use_short_conv": false,
232
+ "vocab_size": 151936
233
+ }
234
+ 
235
+ [titan] 2026-01-06 20:23:31,128 - root - INFO - 
236
+ GSAForCausalLM(
237
+ (model): GSAModel(
238
+ (embeddings): Embedding(151936, 5120)
239
+ (layers): ModuleList(
240
+ (0-39): 40 x GSABlock(
241
+ (attn_norm): RMSNorm(5120, eps=1e-06)
242
+ (attn): GatedSlotAttention(
243
+ (feature_map): SwishFeatureMap()
244
+ (q_proj): Linear(in_features=5120, out_features=5120, bias=False)
245
+ (k_proj): Linear(in_features=5120, out_features=1024, bias=False)
246
+ (v_proj): Linear(in_features=5120, out_features=1024, bias=False)
247
+ (f_proj): Linear(in_features=5120, out_features=2048, bias=False)
248
+ (g_norm): RMSNorm(5120, elementwise_affine=False, eps=1e-06)
249
+ (o_proj): Linear(in_features=5120, out_features=5120, bias=False)
250
+ )
251
+ (mlp_norm): RMSNorm(5120, eps=1e-06)
252
+ (mlp): GatedMLP(
253
+ (gate_proj): Linear(in_features=5120, out_features=17408, bias=False)
254
+ (up_proj): Linear(in_features=5120, out_features=17408, bias=False)
255
+ (down_proj): Linear(in_features=17408, out_features=5120, bias=False)
256
+ (swiglu_linear): SwiGLULinear()
257
+ )
258
+ )
259
+ )
260
+ (norm): RMSNorm(5120, eps=1e-06)
261
+ )
262
+ (lm_head): Linear(in_features=5120, out_features=151936, bias=False)
263
+ )
264
+
265
+ [titan] 2026-01-06 20:23:31,184 - root - INFO - Compiling each block with torch.compile
266
+ [titan] 2026-01-06 20:23:31,184 - root - INFO - Compiling the embedding, norm, and lm_head layers with torch.compile
267
+ [titan] 2026-01-06 20:23:31,185 - root - INFO - Compiling the entire model with torch.compile
268
+ [titan] 2026-01-06 20:23:31,335 - root - INFO - Applied FSDP to the model
269
+ [titan] 2026-01-06 20:23:31,714 - root - INFO - CUDA memory usage for model: 3.56GiB(4.49%)
270
+ [titan] 2026-01-06 20:23:31,762 - root - INFO - Checkpointing active. Checkpoints will be loaded from and saved to /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/checkpoint
271
+ [titan] 2026-01-06 20:23:31,763 - root - INFO - Loading the checkpoint from /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/checkpoint/step-1.
272
+ [titan] 2026-01-06 20:24:20,502 - root - INFO - [GC] GC collection for checkpoint loading. 0.04 seconds.
273
+ [titan] 2026-01-06 20:24:20,502 - root - INFO - Finished loading the checkpoint in 48.74 seconds.
274
+ [titan] 2026-01-06 20:24:20,704 - root - INFO - CUDA capacity: NVIDIA A100-SXM4-80GB with 79.25GiB memory
275
+ [titan] 2026-01-06 20:24:20,707 - root - WARNING - Error running lspci: [Errno 2] No such file or directory: 'lspci', fallback to use device_name
276
+ [titan] 2026-01-06 20:24:23,476 - root - INFO - Mixed precision training is handled by fully_shard
277
+ [titan] 2026-01-06 20:24:23,476 - root - INFO - ***** Running training *****
278
+ [titan] 2026-01-06 20:24:23,476 - root - INFO -  Training starts at step 2
279
+ [titan] 2026-01-06 20:24:23,476 - root - INFO -  Number of tokens per sequence = 2,048
280
+ [titan] 2026-01-06 20:24:23,476 - root - INFO -  Gradient Accumulation steps = 16
281
+ [titan] 2026-01-06 20:24:23,476 - root - INFO -  Instantaneous batch size (per device) = 2
282
+ [titan] 2026-01-06 20:24:23,476 - root - INFO -  Global batch size (w. parallel, distributed & accumulation) = 256 (524,288 tokens)
283
+ [titan] 2026-01-06 20:24:23,476 - root - INFO -  Total optimization steps = 3,072 (1,610,612,736 tokens)
284
+ [titan] 2026-01-06 20:24:23,476 - root - INFO -  Warmup steps = 1,024 (536,870,912 tokens)
285
+ [titan] 2026-01-06 20:24:23,476 - root - INFO -  Number of parameters = 14,409,815,040 
286
+ [titan] 2026-01-06 20:24:23,477 - root - INFO - Profiling active. Traces will be saved at /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/profile_trace
287
+ /mnt/scratch/home/carlos/leman-labs/flame/.venv/lib/python3.13/site-packages/torch/_dynamo/variables/functions.py:1692: UserWarning: Dynamo detected a call to a `functools.lru_cache`-wrapped function. Dynamo ignores the cache wrapper and directly traces the wrapped function. Silent incorrectness is only a *potential* risk, not something we have observed. Enable TORCH_LOGS="+dynamo" for a DEBUG stack trace.
288
+ torch._dynamo.utils.warn_once(msg)
289
+ /mnt/scratch/home/carlos/leman-labs/flame/.venv/lib/python3.13/site-packages/torch/_dynamo/variables/functions.py:1598: UserWarning: Dynamo does not know how to trace the builtin `cuda_utils.get_device_properties.` This function is either a Python builtin (e.g. _warnings.warn) or a third-party C/C++ Python extension (perhaps created with pybind).
290
+ If it is a Python builtin, please file an issue on GitHub so the PyTorch team can add support for it and see the next case for a workaround.
291
+ If it is a third-party C/C++ Python extension, please either wrap it into a PyTorch-understood custom operator (see https://pytorch.org/tutorials/advanced/custom_ops_landing_page.html for more details) or, if it is traceable, use `torch.compiler.allow_in_graph`.
292
+ torch._dynamo.utils.warn_once(explanation + "\n" + "\n".join(hints))
293
+ [titan] 2026-01-06 20:31:17,558 - root - INFO - step: 2 loss: 14.3989 memory: 71.94GiB(90.77%) tps: 157 tflops: 14.38 mfu: 4.61%
294
+ [titan] 2026-01-06 20:31:17,558 - root - INFO - lr: 1.1719e-06 gnorm: 127.00 [ 0:14:01<14 days, 22:48:25]
295
+ [titan] 2026-01-06 20:31:58,854 - root - INFO - step: 3 loss: 14.3925 memory: 71.94GiB(90.77%) tps: 1,587 tflops: 145.20 mfu: 46.54%
296
+ [titan] 2026-01-06 20:31:58,854 - root - INFO - lr: 1.5625e-06 gnorm: 126.00 [ 0:14:42<10 days, 10:51:41]
297
+ [titan] 2026-01-06 20:32:40,204 - root - INFO - step: 4 loss: 14.2932 memory: 71.94GiB(90.77%) tps: 1,585 tflops: 145.01 mfu: 46.48%
298
+ [titan] 2026-01-06 20:32:40,205 - root - INFO - lr: 1.9531e-06 gnorm: 125.50 [ 0:15:24<8 days, 4:53:40]
299
+ [titan] 2026-01-06 20:33:21,589 - root - INFO - step: 5 loss: 14.2679 memory: 71.94GiB(90.77%) tps: 1,584 tflops: 144.89 mfu: 46.44%
300
+ [titan] 2026-01-06 20:33:21,590 - root - INFO - lr: 2.3438e-06 gnorm: 123.50 [ 0:16:05<6 days, 20:30:56]
301
+ [titan] 2026-01-06 20:34:03,035 - root - INFO - step: 6 loss: 13.9921 memory: 71.94GiB(90.77%) tps: 1,581 tflops: 144.67 mfu: 46.37%
302
+ [titan] 2026-01-06 20:34:03,035 - root - INFO - lr: 2.7344e-06 gnorm: 117.50 [ 0:16:46<5 days, 22:56:04]
303
+ [titan] 2026-01-06 20:34:44,524 - root - INFO - step: 7 loss: 13.8102 memory: 71.94GiB(90.77%) tps: 1,580 tflops: 144.52 mfu: 46.32%
304
+ [titan] 2026-01-06 20:34:44,524 - root - INFO - lr: 3.1250e-06 gnorm: 112.50 [ 0:17:28<5 days, 7:31:17]
305
+ [titan] 2026-01-06 20:35:25,989 - root - INFO - step: 8 loss: 13.5609 memory: 71.94GiB(90.77%) tps: 1,581 tflops: 144.60 mfu: 46.35%
306
+ [titan] 2026-01-06 20:35:25,990 - root - INFO - lr: 3.5156e-06 gnorm: 106.50 [ 0:18:09<4 days, 19:57:23]
307
+ [titan] 2026-01-06 20:36:07,480 - root - INFO - step: 9 loss: 13.3683 memory: 71.94GiB(90.77%) tps: 1,580 tflops: 144.52 mfu: 46.32%
308
+ [titan] 2026-01-06 20:36:07,480 - root - INFO - lr: 3.9063e-06 gnorm: 101.00 [ 0:18:51<4 days, 10:57:40]
309
+ [titan] 2026-01-06 20:36:48,975 - root - INFO - step: 10 loss: 13.1018 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.50 mfu: 46.32%
310
+ [titan] 2026-01-06 20:36:48,975 - root - INFO - lr: 4.2969e-06 gnorm: 94.00 [ 0:19:32<4 days, 3:45:46]
311
+ [titan] 2026-01-06 20:37:30,471 - root - INFO - step: 11 loss: 12.5407 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.50 mfu: 46.31%
312
+ [titan] 2026-01-06 20:37:30,471 - root - INFO - lr: 4.6875e-06 gnorm: 82.00 [ 0:20:14<3 days, 21:52:17]
313
+ [titan] 2026-01-06 20:38:11,960 - root - INFO - step: 12 loss: 12.0106 memory: 71.94GiB(90.77%) tps: 1,580 tflops: 144.52 mfu: 46.32%
314
+ [titan] 2026-01-06 20:38:11,960 - root - INFO - lr: 5.0781e-06 gnorm: 71.50 [ 0:20:55<3 days, 16:57:34]
315
+ [titan] 2026-01-06 20:38:53,463 - root - INFO - step: 13 loss: 11.5957 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.48 mfu: 46.31%
316
+ [titan] 2026-01-06 20:38:53,463 - root - INFO - lr: 5.4687e-06 gnorm: 68.00 [ 0:21:37<3 days, 12:48:08]
317
+ [titan] 2026-01-06 20:39:34,955 - root - INFO - step: 14 loss: 11.2380 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.51 mfu: 46.32%
318
+ [titan] 2026-01-06 20:39:34,955 - root - INFO - lr: 5.8594e-06 gnorm: 63.25 [ 0:22:18<3 days, 9:14:12]
319
+ [titan] 2026-01-06 20:40:16,456 - root - INFO - step: 15 loss: 10.9153 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.48 mfu: 46.31%
320
+ [titan] 2026-01-06 20:40:16,457 - root - INFO - lr: 6.2500e-06 gnorm: 55.50 [ 0:23:00<3 days, 6:08:44]
321
+ [titan] 2026-01-06 20:40:57,974 - root - INFO - step: 16 loss: 10.6864 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.42 mfu: 46.29%
322
+ [titan] 2026-01-06 20:40:57,974 - root - INFO - lr: 6.6406e-06 gnorm: 57.00 [ 0:23:41<3 days, 3:26:25]
323
+ [titan] 2026-01-06 20:40:57,974 - root - INFO - Saving the checkpoint (or staging if async is enabled).
324
+ [titan] 2026-01-06 20:41:27,085 - root - INFO - [GC] GC collection invoked by checkpointer. 0.64 seconds.
325
+ [titan] 2026-01-06 20:41:27,085 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 29.11 seconds.
326
+ [titan] 2026-01-06 20:42:08,985 - root - INFO - step: 17 loss: 10.3828 memory: 71.94GiB(90.77%) tps: 923 tflops: 84.44 mfu: 27.06%
327
+ [titan] 2026-01-06 20:42:08,985 - root - INFO - lr: 7.0313e-06 gnorm: 42.50 [ 0:24:52<3 days, 2:31:26]
328
+ [titan] 2026-01-06 20:42:50,422 - root - INFO - step: 18 loss: 10.1659 memory: 71.94GiB(90.77%) tps: 1,582 tflops: 144.71 mfu: 46.38%
329
+ [titan] 2026-01-06 20:42:50,422 - root - INFO - lr: 7.4219e-06 gnorm: 32.50 [ 0:25:34<3 days, 0:18:49]
330
+ [titan] 2026-01-06 20:43:31,924 - root - INFO - step: 19 loss: 9.9749 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.47 mfu: 46.31%
331
+ [titan] 2026-01-06 20:43:31,925 - root - INFO - lr: 7.8125e-06 gnorm: 26.88 [ 0:26:15<2 days, 22:20:15]
332
+ [titan] 2026-01-06 20:44:13,451 - root - INFO - step: 20 loss: 9.8084 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.39 mfu: 46.28%
333
+ [titan] 2026-01-06 20:44:13,451 - root - INFO - lr: 8.2031e-06 gnorm: 25.62 [ 0:26:57<2 days, 20:33:33]
334
+ [titan] 2026-01-06 20:44:54,967 - root - INFO - step: 21 loss: 9.6201 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.43 mfu: 46.29%
335
+ [titan] 2026-01-06 20:44:54,968 - root - INFO - lr: 8.5938e-06 gnorm: 26.88 [ 0:27:38<2 days, 18:56:54]
336
+ [titan] 2026-01-06 20:45:36,491 - root - INFO - step: 22 loss: 9.4905 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.40 mfu: 46.28%
337
+ [titan] 2026-01-06 20:45:36,491 - root - INFO - lr: 8.9844e-06 gnorm: 25.50 [ 0:28:20<2 days, 17:29:00]
338
+ [titan] 2026-01-06 20:46:18,035 - root - INFO - step: 23 loss: 9.2526 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.33 mfu: 46.26%
339
+ [titan] 2026-01-06 20:46:18,035 - root - INFO - lr: 9.3750e-06 gnorm: 19.12 [ 0:29:01<2 days, 16:08:44]
340
+ [titan] 2026-01-06 20:46:59,563 - root - INFO - step: 24 loss: 9.0528 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.39 mfu: 46.28%
341
+ [titan] 2026-01-06 20:46:59,563 - root - INFO - lr: 9.7656e-06 gnorm: 17.00 [ 0:29:43<2 days, 14:55:04]
342
+ [titan] 2026-01-06 20:47:41,099 - root - INFO - step: 25 loss: 8.8601 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.36 mfu: 46.27%
343
+ [titan] 2026-01-06 20:47:41,099 - root - INFO - lr: 1.0156e-05 gnorm: 14.06 [ 0:30:25<2 days, 13:47:15]
344
+ [titan] 2026-01-06 20:48:22,630 - root - INFO - step: 26 loss: 8.7360 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.38 mfu: 46.27%
345
+ [titan] 2026-01-06 20:48:22,630 - root - INFO - lr: 1.0547e-05 gnorm: 15.44 [ 0:31:06<2 days, 12:44:35]
346
+ [titan] 2026-01-06 20:49:04,178 - root - INFO - step: 27 loss: 8.6182 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.31 mfu: 46.25%
347
+ [titan] 2026-01-06 20:49:04,178 - root - INFO - lr: 1.0937e-05 gnorm: 10.25 [ 0:31:48<2 days, 11:46:32]
348
+ [titan] 2026-01-06 20:49:45,725 - root - INFO - step: 28 loss: 8.5142 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.32 mfu: 46.26%
349
+ [titan] 2026-01-06 20:49:45,725 - root - INFO - lr: 1.1328e-05 gnorm: 9.00 [ 0:32:29<2 days, 10:52:35]
350
+ [titan] 2026-01-06 20:50:27,274 - root - INFO - step: 29 loss: 8.4770 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.31 mfu: 46.25%
351
+ [titan] 2026-01-06 20:50:27,274 - root - INFO - lr: 1.1719e-05 gnorm: 9.44 [ 0:33:11<2 days, 10:02:19]
352
+ [titan] 2026-01-06 20:51:08,813 - root - INFO - step: 30 loss: 8.3888 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.35 mfu: 46.27%
353
+ [titan] 2026-01-06 20:51:08,813 - root - INFO - lr: 1.2109e-05 gnorm: 7.06 [ 0:33:52<2 days, 9:15:20]
354
+ [titan] 2026-01-06 20:51:50,370 - root - INFO - step: 31 loss: 8.3098 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.29 mfu: 46.25%
355
+ [titan] 2026-01-06 20:51:50,370 - root - INFO - lr: 1.2500e-05 gnorm: 5.38 [ 0:34:34<2 days, 8:31:22]
356
+ [titan] 2026-01-06 20:52:31,910 - root - INFO - step: 32 loss: 8.2507 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.35 mfu: 46.26%
357
+ [titan] 2026-01-06 20:52:31,910 - root - INFO - lr: 1.2891e-05 gnorm: 6.97 [ 0:35:15<2 days, 7:50:04]
358
+ [titan] 2026-01-06 20:52:31,910 - root - INFO - Saving the checkpoint (or staging if async is enabled).
359
+ [titan] 2026-01-06 20:52:52,182 - root - INFO - [GC] GC collection invoked by checkpointer. 0.19 seconds.
360
+ [titan] 2026-01-06 20:52:52,182 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 20.27 seconds.
361
+ [titan] 2026-01-06 20:53:33,590 - root - INFO - step: 33 loss: 8.1782 memory: 71.94GiB(90.77%) tps: 1,063 tflops: 97.21 mfu: 31.16%
362
+ [titan] 2026-01-06 20:53:33,591 - root - INFO - lr: 1.3281e-05 gnorm: 4.94 [ 0:36:17<2 days, 7:42:09]
363
+ [titan] 2026-01-06 20:54:15,059 - root - INFO - step: 34 loss: 8.1399 memory: 71.94GiB(90.77%) tps: 1,580 tflops: 144.59 mfu: 46.34%
364
+ [titan] 2026-01-06 20:54:15,059 - root - INFO - lr: 1.3672e-05 gnorm: 4.62 [ 0:36:58<2 days, 7:04:33]
365
+ [titan] 2026-01-06 20:54:56,546 - root - INFO - step: 35 loss: 8.1046 memory: 71.94GiB(90.77%) tps: 1,580 tflops: 144.53 mfu: 46.32%
366
+ [titan] 2026-01-06 20:54:56,546 - root - INFO - lr: 1.4063e-05 gnorm: 4.69 [ 0:37:40<2 days, 6:29:04]
367
+ [titan] 2026-01-06 20:55:38,070 - root - INFO - step: 36 loss: 8.0122 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.40 mfu: 46.28%
368
+ [titan] 2026-01-06 20:55:38,070 - root - INFO - lr: 1.4453e-05 gnorm: 2.75 [ 0:38:21<2 days, 5:55:35]
369
+ [titan] 2026-01-06 20:56:19,603 - root - INFO - step: 37 loss: 8.0874 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.37 mfu: 46.27%
370
+ [titan] 2026-01-06 20:56:19,603 - root - INFO - lr: 1.4844e-05 gnorm: 4.84 [ 0:39:03<2 days, 5:23:52]
371
+ [titan] 2026-01-06 20:57:01,137 - root - INFO - step: 38 loss: 8.0173 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.37 mfu: 46.27%
372
+ [titan] 2026-01-06 20:57:01,138 - root - INFO - lr: 1.5234e-05 gnorm: 3.98 [ 0:39:45<2 days, 4:53:48]
373
+ [titan] 2026-01-06 20:57:42,670 - root - INFO - step: 39 loss: 8.0002 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.37 mfu: 46.27%
374
+ [titan] 2026-01-06 20:57:42,671 - root - INFO - lr: 1.5625e-05 gnorm: 3.81 [ 0:40:26<2 days, 4:25:14]
375
+ [titan] 2026-01-06 20:58:24,204 - root - INFO - step: 40 loss: 7.9606 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.37 mfu: 46.27%
376
+ [titan] 2026-01-06 20:58:24,204 - root - INFO - lr: 1.6016e-05 gnorm: 2.86 [ 0:41:08<2 days, 3:58:04]
377
+ [titan] 2026-01-06 20:59:05,739 - root - INFO - step: 41 loss: 7.9773 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.36 mfu: 46.27%
378
+ [titan] 2026-01-06 20:59:05,739 - root - INFO - lr: 1.6406e-05 gnorm: 3.56 [ 0:41:49<2 days, 3:32:11]
379
+ [titan] 2026-01-06 20:59:47,255 - root - INFO - step: 42 loss: 7.9890 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.43 mfu: 46.29%
380
+ [titan] 2026-01-06 20:59:47,256 - root - INFO - lr: 1.6797e-05 gnorm: 4.75 [ 0:42:31<2 days, 3:07:29]
381
+ [titan] 2026-01-06 21:00:28,788 - root - INFO - step: 43 loss: 7.9018 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.37 mfu: 46.27%
382
+ [titan] 2026-01-06 21:00:28,789 - root - INFO - lr: 1.7188e-05 gnorm: 3.48 [ 0:43:12<2 days, 2:43:55]
383
+ [titan] 2026-01-06 21:01:10,328 - root - INFO - step: 44 loss: 7.8441 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.35 mfu: 46.26%
384
+ [titan] 2026-01-06 21:01:10,328 - root - INFO - lr: 1.7578e-05 gnorm: 3.89 [ 0:43:54<2 days, 2:21:24]
385
+ [titan] 2026-01-06 21:01:51,869 - root - INFO - step: 45 loss: 7.8679 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.34 mfu: 46.26%
386
+ [titan] 2026-01-06 21:01:51,869 - root - INFO - lr: 1.7969e-05 gnorm: 6.41 [ 0:44:35<2 days, 1:59:51]
387
+ [titan] 2026-01-06 21:02:33,408 - root - INFO - step: 46 loss: 7.7830 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.35 mfu: 46.26%
388
+ [titan] 2026-01-06 21:02:33,408 - root - INFO - lr: 1.8359e-05 gnorm: 3.52 [ 0:45:17<2 days, 1:39:13]
389
+ [titan] 2026-01-06 21:03:14,961 - root - INFO - step: 47 loss: 7.8372 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.30 mfu: 46.25%
390
+ [titan] 2026-01-06 21:03:14,961 - root - INFO - lr: 1.8750e-05 gnorm: 2.22 [ 0:45:58<2 days, 1:19:26]
391
+ [titan] 2026-01-06 21:03:56,497 - root - INFO - step: 48 loss: 7.8147 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.36 mfu: 46.27%
392
+ [titan] 2026-01-06 21:03:56,497 - root - INFO - lr: 1.9141e-05 gnorm: 3.70 [ 0:46:40<2 days, 1:00:26]
393
+ [titan] 2026-01-06 21:03:56,497 - root - INFO - Saving the checkpoint (or staging if async is enabled).
394
+ [titan] 2026-01-06 21:04:16,571 - root - INFO - [GC] GC collection invoked by checkpointer. 0.19 seconds.
395
+ [titan] 2026-01-06 21:04:16,571 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 20.07 seconds.
396
+ [titan] 2026-01-06 21:04:57,970 - root - INFO - step: 49 loss: 7.6970 memory: 71.94GiB(90.77%) tps: 1,066 tflops: 97.54 mfu: 31.26%
397
+ [titan] 2026-01-06 21:04:57,970 - root - INFO - lr: 1.9531e-05 gnorm: 5.28 [ 0:47:41<2 days, 1:02:41]
398
+ [titan] 2026-01-06 21:04:57,982 - root - INFO - [GC] Peforming periodical GC collection. 0.01 seconds.
399
+ [titan] 2026-01-06 21:05:39,421 - root - INFO - step: 50 loss: 7.7536 memory: 71.94GiB(90.77%) tps: 1,581 tflops: 144.65 mfu: 46.36%
400
+ [titan] 2026-01-06 21:05:39,421 - root - INFO - lr: 1.9922e-05 gnorm: 4.06 [ 0:48:23<2 days, 0:44:38]
401
+ [titan] 2026-01-06 21:06:20,891 - root - INFO - step: 51 loss: 7.7578 memory: 71.94GiB(90.77%) tps: 1,580 tflops: 144.59 mfu: 46.34%
402
+ [titan] 2026-01-06 21:06:20,891 - root - INFO - lr: 2.0313e-05 gnorm: 5.03 [ 0:49:04<2 days, 0:27:16]
403
+ [titan] 2026-01-06 21:07:02,402 - root - INFO - step: 52 loss: 7.7586 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.45 mfu: 46.30%
404
+ [titan] 2026-01-06 21:07:02,402 - root - INFO - lr: 2.0703e-05 gnorm: 2.52 [ 0:49:46<2 days, 0:10:36]
405
+ [titan] 2026-01-06 21:07:43,930 - root - INFO - step: 53 loss: 7.7823 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.38 mfu: 46.28%
406
+ [titan] 2026-01-06 21:07:43,930 - root - INFO - lr: 2.1094e-05 gnorm: 11.69 [ 0:50:27<1 day, 23:54:33]
407
+ [titan] 2026-01-06 21:08:25,460 - root - INFO - step: 54 loss: 7.7454 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.38 mfu: 46.28%
408
+ [titan] 2026-01-06 21:08:25,460 - root - INFO - lr: 2.1484e-05 gnorm: 10.25 [ 0:51:09<1 day, 23:39:04]
409
+ [titan] 2026-01-06 21:09:07,002 - root - INFO - step: 55 loss: 7.6959 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.34 mfu: 46.26%
410
+ [titan] 2026-01-06 21:09:07,002 - root - INFO - lr: 2.1875e-05 gnorm: 3.77 [ 0:51:50<1 day, 23:24:08]
411
+ [titan] 2026-01-06 21:09:48,535 - root - INFO - step: 56 loss: 7.7100 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.37 mfu: 46.27%
412
+ [titan] 2026-01-06 21:09:48,536 - root - INFO - lr: 2.2266e-05 gnorm: 5.50 [ 0:52:32<1 day, 23:09:42]
413
+ [titan] 2026-01-06 21:10:30,084 - root - INFO - step: 57 loss: 7.6427 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.32 mfu: 46.26%
414
+ [titan] 2026-01-06 21:10:30,084 - root - INFO - lr: 2.2656e-05 gnorm: 3.45 [ 0:53:14<1 day, 22:55:46]
415
+ [titan] 2026-01-06 21:11:11,627 - root - INFO - step: 58 loss: 7.7081 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.33 mfu: 46.26%
416
+ [titan] 2026-01-06 21:11:11,628 - root - INFO - lr: 2.3047e-05 gnorm: 7.88 [ 0:53:55<1 day, 22:42:16]
417
+ [titan] 2026-01-06 21:11:53,169 - root - INFO - step: 59 loss: 7.6955 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.34 mfu: 46.26%
418
+ [titan] 2026-01-06 21:11:53,169 - root - INFO - lr: 2.3438e-05 gnorm: 7.16 [ 0:54:37<1 day, 22:29:13]
419
+ [titan] 2026-01-06 21:12:34,708 - root - INFO - step: 60 loss: 7.6458 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.35 mfu: 46.27%
420
+ [titan] 2026-01-06 21:12:34,708 - root - INFO - lr: 2.3828e-05 gnorm: 3.22 [ 0:55:18<1 day, 22:16:35]
421
+ [titan] 2026-01-06 21:13:16,244 - root - INFO - step: 61 loss: 7.6709 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.36 mfu: 46.27%
422
+ [titan] 2026-01-06 21:13:16,244 - root - INFO - lr: 2.4219e-05 gnorm: 7.56 [ 0:56:00<1 day, 22:04:19]
423
+ [titan] 2026-01-06 21:13:57,793 - root - INFO - step: 62 loss: 7.6777 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.31 mfu: 46.25%
424
+ [titan] 2026-01-06 21:13:57,793 - root - INFO - lr: 2.4609e-05 gnorm: 5.00 [ 0:56:41<1 day, 21:52:27]
425
+ [titan] 2026-01-06 21:14:39,339 - root - INFO - step: 63 loss: 7.6421 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.32 mfu: 46.26%
426
+ [titan] 2026-01-06 21:14:39,340 - root - INFO - lr: 2.5000e-05 gnorm: 6.81 [ 0:57:23<1 day, 21:40:56]
427
+ [titan] 2026-01-06 21:15:20,872 - root - INFO - step: 64 loss: 7.6401 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.37 mfu: 46.27%
428
+ [titan] 2026-01-06 21:15:20,872 - root - INFO - lr: 2.5391e-05 gnorm: 6.72 [ 0:58:04<1 day, 21:29:45]
429
+ [titan] 2026-01-06 21:15:20,872 - root - INFO - Saving the checkpoint (or staging if async is enabled).
430
+ [titan] 2026-01-06 21:15:41,931 - root - INFO - [GC] GC collection invoked by checkpointer. 0.17 seconds.
431
+ [titan] 2026-01-06 21:15:41,932 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 21.06 seconds.
432
+ [titan] 2026-01-06 21:16:23,249 - root - INFO - step: 65 loss: 7.6475 memory: 71.94GiB(90.77%) tps: 1,051 tflops: 96.13 mfu: 30.81%
433
+ [titan] 2026-01-06 21:16:23,249 - root - INFO - lr: 2.5781e-05 gnorm: 5.00 [ 0:59:07<1 day, 21:34:57]
434
+ [titan] 2026-01-06 21:17:04,689 - root - INFO - step: 66 loss: 7.7008 memory: 71.94GiB(90.77%) tps: 1,581 tflops: 144.69 mfu: 46.38%
435
+ [titan] 2026-01-06 21:17:04,689 - root - INFO - lr: 2.6172e-05 gnorm: 9.69 [ 0:59:48<1 day, 21:24:04]
436
+ [titan] 2026-01-06 21:17:46,153 - root - INFO - step: 67 loss: 7.6772 memory: 71.94GiB(90.77%) tps: 1,581 tflops: 144.61 mfu: 46.35%
437
+ [titan] 2026-01-06 21:17:46,153 - root - INFO - lr: 2.6563e-05 gnorm: 8.06 [ 1:00:30<1 day, 21:13:31]
438
+ [titan] 2026-01-06 21:18:27,650 - root - INFO - step: 68 loss: 7.6251 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.49 mfu: 46.31%
439
+ [titan] 2026-01-06 21:18:27,650 - root - INFO - lr: 2.6953e-05 gnorm: 7.88 [ 1:01:11<1 day, 21:03:16]
440
+ [titan] 2026-01-06 21:19:09,166 - root - INFO - step: 69 loss: 7.6183 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.43 mfu: 46.29%
441
+ [titan] 2026-01-06 21:19:09,166 - root - INFO - lr: 2.7344e-05 gnorm: 4.00 [ 1:01:53<1 day, 20:53:19]
442
+ [titan] 2026-01-06 21:19:50,686 - root - INFO - step: 70 loss: 7.6535 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.41 mfu: 46.29%
443
+ [titan] 2026-01-06 21:19:50,686 - root - INFO - lr: 2.7734e-05 gnorm: 17.75 [ 1:02:34<1 day, 20:43:38]
444
+ [titan] 2026-01-06 21:20:32,220 - root - INFO - step: 71 loss: 7.6713 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.37 mfu: 46.27%
445
+ [titan] 2026-01-06 21:20:32,220 - root - INFO - lr: 2.8125e-05 gnorm: 15.69 [ 1:03:16<1 day, 20:34:13]
446
+ [titan] 2026-01-06 21:21:13,759 - root - INFO - step: 72 loss: 7.5969 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.35 mfu: 46.27%
447
+ [titan] 2026-01-06 21:21:13,759 - root - INFO - lr: 2.8516e-05 gnorm: 5.00 [ 1:03:57<1 day, 20:25:03]
448
+ [titan] 2026-01-06 21:21:55,296 - root - INFO - step: 73 loss: 7.6514 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.35 mfu: 46.27%
449
+ [titan] 2026-01-06 21:21:55,296 - root - INFO - lr: 2.8906e-05 gnorm: 7.84 [ 1:04:39<1 day, 20:16:06]
450
+ [titan] 2026-01-06 21:22:36,834 - root - INFO - step: 74 loss: 7.6118 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.35 mfu: 46.27%
451
+ [titan] 2026-01-06 21:22:36,834 - root - INFO - lr: 2.9297e-05 gnorm: 5.53 [ 1:05:20<1 day, 20:07:23]
452
+ [titan] 2026-01-06 21:23:18,373 - root - INFO - step: 75 loss: 7.6545 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.35 mfu: 46.26%
453
+ [titan] 2026-01-06 21:23:18,374 - root - INFO - lr: 2.9687e-05 gnorm: 14.88 [ 1:06:02<1 day, 19:58:52]
454
+ [titan] 2026-01-06 21:23:59,909 - root - INFO - step: 76 loss: 7.6091 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.36 mfu: 46.27%
455
+ [titan] 2026-01-06 21:23:59,909 - root - INFO - lr: 3.0078e-05 gnorm: 15.25 [ 1:06:43<1 day, 19:50:34]
456
+ [titan] 2026-01-06 21:24:41,441 - root - INFO - step: 77 loss: 7.5815 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.37 mfu: 46.27%
457
+ [titan] 2026-01-06 21:24:41,442 - root - INFO - lr: 3.0469e-05 gnorm: 4.84 [ 1:07:25<1 day, 19:42:28]
458
+ [titan] 2026-01-06 21:25:22,983 - root - INFO - step: 78 loss: 7.6119 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.34 mfu: 46.26%
459
+ [titan] 2026-01-06 21:25:22,983 - root - INFO - lr: 3.0859e-05 gnorm: 9.06 [ 1:08:06<1 day, 19:34:33]
460
+ [titan] 2026-01-06 21:26:04,516 - root - INFO - step: 79 loss: 7.6418 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.37 mfu: 46.27%
461
+ [titan] 2026-01-06 21:26:04,516 - root - INFO - lr: 3.1250e-05 gnorm: 8.25 [ 1:08:48<1 day, 19:26:49]
462
+ [titan] 2026-01-06 21:26:46,049 - root - INFO - step: 80 loss: 7.5575 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.37 mfu: 46.27%
463
+ [titan] 2026-01-06 21:26:46,050 - root - INFO - lr: 3.1641e-05 gnorm: 6.97 [ 1:09:29<1 day, 19:19:16]
464
+ [titan] 2026-01-06 21:26:46,050 - root - INFO - Saving the checkpoint (or staging if async is enabled).
465
+ [titan] 2026-01-06 21:27:08,315 - root - INFO - [GC] GC collection invoked by checkpointer. 0.18 seconds.
466
+ [titan] 2026-01-06 21:27:08,316 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 22.27 seconds.
467
+ [titan] 2026-01-06 21:27:49,686 - root - INFO - step: 81 loss: 7.6005 memory: 71.94GiB(90.77%) tps: 1,030 tflops: 94.22 mfu: 30.20%
468
+ [titan] 2026-01-06 21:27:49,686 - root - INFO - lr: 3.2031e-05 gnorm: 7.19 [ 1:10:33<1 day, 19:25:29]
469
+ [titan] 2026-01-06 21:28:31,108 - root - INFO - step: 82 loss: 7.5774 memory: 71.94GiB(90.77%) tps: 1,582 tflops: 144.76 mfu: 46.40%
470
+ [titan] 2026-01-06 21:28:31,108 - root - INFO - lr: 3.2422e-05 gnorm: 5.62 [ 1:11:15<1 day, 19:18:01]
471
+ [titan] 2026-01-06 21:29:12,555 - root - INFO - step: 83 loss: 7.6207 memory: 71.94GiB(90.77%) tps: 1,581 tflops: 144.67 mfu: 46.37%
472
+ [titan] 2026-01-06 21:29:12,555 - root - INFO - lr: 3.2813e-05 gnorm: 4.69 [ 1:11:56<1 day, 19:10:44]
473
+ [titan] 2026-01-06 21:29:54,023 - root - INFO - step: 84 loss: 7.5734 memory: 71.94GiB(90.77%) tps: 1,580 tflops: 144.59 mfu: 46.34%
474
+ [titan] 2026-01-06 21:29:54,024 - root - INFO - lr: 3.3203e-05 gnorm: 10.75 [ 1:12:37<1 day, 19:03:37]
475
+ [titan] 2026-01-06 21:30:35,519 - root - INFO - step: 85 loss: 7.5241 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.50 mfu: 46.31%
476
+ [titan] 2026-01-06 21:30:35,520 - root - INFO - lr: 3.3594e-05 gnorm: 8.69 [ 1:13:19<1 day, 18:56:41]
477
+ [titan] 2026-01-06 21:31:17,030 - root - INFO - step: 86 loss: 7.5827 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.45 mfu: 46.30%
478
+ [titan] 2026-01-06 21:31:17,030 - root - INFO - lr: 3.3984e-05 gnorm: 7.22 [ 1:14:00<1 day, 18:49:53]
479
+ [titan] 2026-01-06 21:31:58,543 - root - INFO - step: 87 loss: 7.5505 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.44 mfu: 46.29%
480
+ [titan] 2026-01-06 21:31:58,543 - root - INFO - lr: 3.4375e-05 gnorm: 7.91 [ 1:14:42<1 day, 18:43:14]
481
+ [titan] 2026-01-06 21:32:40,071 - root - INFO - step: 88 loss: 7.5143 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.39 mfu: 46.28%
482
+ [titan] 2026-01-06 21:32:40,071 - root - INFO - lr: 3.4766e-05 gnorm: 8.00 [ 1:15:23<1 day, 18:36:43]
483
+ [titan] 2026-01-06 21:33:21,599 - root - INFO - step: 89 loss: 7.5199 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.39 mfu: 46.28%
484
+ [titan] 2026-01-06 21:33:21,599 - root - INFO - lr: 3.5156e-05 gnorm: 8.62 [ 1:16:05<1 day, 18:30:21]
485
+ [titan] 2026-01-06 21:34:03,122 - root - INFO - step: 90 loss: 7.4785 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.40 mfu: 46.28%
486
+ [titan] 2026-01-06 21:34:03,122 - root - INFO - lr: 3.5547e-05 gnorm: 8.12 [ 1:16:47<1 day, 18:24:06]
487
+ [titan] 2026-01-06 21:34:44,655 - root - INFO - step: 91 loss: 7.5003 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.37 mfu: 46.27%
488
+ [titan] 2026-01-06 21:34:44,655 - root - INFO - lr: 3.5937e-05 gnorm: 6.97 [ 1:17:28<1 day, 18:17:58]
489
+ [titan] 2026-01-06 21:35:26,183 - root - INFO - step: 92 loss: 7.5113 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.39 mfu: 46.28%
490
+ [titan] 2026-01-06 21:35:26,183 - root - INFO - lr: 3.6328e-05 gnorm: 10.19 [ 1:18:10<1 day, 18:11:58]
491
+ [titan] 2026-01-06 21:36:07,712 - root - INFO - step: 93 loss: 7.4875 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.38 mfu: 46.28%
492
+ [titan] 2026-01-06 21:36:07,712 - root - INFO - lr: 3.6719e-05 gnorm: 4.59 [ 1:18:51<1 day, 18:06:04]
493
+ [titan] 2026-01-06 21:36:49,202 - root - INFO - step: 94 loss: 7.8691 memory: 71.94GiB(90.77%) tps: 1,580 tflops: 144.52 mfu: 46.32%
494
+ [titan] 2026-01-06 21:36:49,202 - root - INFO - lr: 3.7109e-05 gnorm: 86.50 [ 1:19:33<1 day, 18:00:16]
495
+ [titan] 2026-01-06 21:37:30,710 - root - INFO - step: 95 loss: 7.7993 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.45 mfu: 46.30%
496
+ [titan] 2026-01-06 21:37:30,710 - root - INFO - lr: 3.7500e-05 gnorm: 62.50 [ 1:20:14<1 day, 17:54:34]
497
+ [titan] 2026-01-06 21:38:12,247 - root - INFO - step: 96 loss: 7.6230 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.35 mfu: 46.27%
498
+ [titan] 2026-01-06 21:38:12,247 - root - INFO - lr: 3.7891e-05 gnorm: 17.38 [ 1:20:56<1 day, 17:49:00]
499
+ [titan] 2026-01-06 21:38:12,248 - root - INFO - Saving the checkpoint (or staging if async is enabled).
500
+ [titan] 2026-01-06 21:38:32,928 - root - INFO - [GC] GC collection invoked by checkpointer. 0.19 seconds.
501
+ [titan] 2026-01-06 21:38:32,928 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 20.68 seconds.
502
+ [titan] 2026-01-06 21:39:14,269 - root - INFO - step: 97 loss: 7.5778 memory: 71.94GiB(90.77%) tps: 1,057 tflops: 96.68 mfu: 30.99%
503
+ [titan] 2026-01-06 21:39:14,269 - root - INFO - lr: 3.8281e-05 gnorm: 17.75 [ 1:21:58<1 day, 17:54:00]
504
+ [titan] 2026-01-06 21:39:55,690 - root - INFO - step: 98 loss: 7.5438 memory: 71.94GiB(90.77%) tps: 1,582 tflops: 144.76 mfu: 46.40%
505
+ [titan] 2026-01-06 21:39:55,690 - root - INFO - lr: 3.8672e-05 gnorm: 11.75 [ 1:22:39<1 day, 17:48:28]
506
+ [titan] 2026-01-06 21:40:37,179 - root - INFO - step: 99 loss: 7.5091 memory: 71.94GiB(90.77%) tps: 1,580 tflops: 144.52 mfu: 46.32%
507
+ [titan] 2026-01-06 21:40:37,180 - root - INFO - lr: 3.9063e-05 gnorm: 7.81 [ 1:23:21<1 day, 17:43:04]
508
+ [titan] 2026-01-06 21:40:37,201 - root - INFO - [GC] Peforming periodical GC collection. 0.02 seconds.
509
+ [titan] 2026-01-06 21:41:18,706 - root - INFO - step: 100 loss: 7.4961 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.39 mfu: 46.28%
510
+ [titan] 2026-01-06 21:41:18,706 - root - INFO - lr: 3.9453e-05 gnorm: 7.59 [ 1:24:02<1 day, 17:37:46]
511
+ [titan] 2026-01-06 21:42:00,228 - root - INFO - step: 101 loss: 7.4848 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.41 mfu: 46.28%
512
+ [titan] 2026-01-06 21:42:00,228 - root - INFO - lr: 3.9844e-05 gnorm: 5.97 [ 1:24:44<1 day, 17:32:33]
513
+ [titan] 2026-01-06 21:42:41,739 - root - INFO - step: 102 loss: 7.5118 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.45 mfu: 46.30%
514
+ [titan] 2026-01-06 21:42:41,739 - root - INFO - lr: 4.0234e-05 gnorm: 8.06 [ 1:25:25<1 day, 17:27:26]
515
+ [titan] 2026-01-06 21:43:23,265 - root - INFO - step: 103 loss: 7.4788 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.39 mfu: 46.28%
516
+ [titan] 2026-01-06 21:43:23,265 - root - INFO - lr: 4.0625e-05 gnorm: 10.06 [ 1:26:07<1 day, 17:22:24]
517
+ [titan] 2026-01-06 21:44:04,785 - root - INFO - step: 104 loss: 7.4560 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.41 mfu: 46.29%
518
+ [titan] 2026-01-06 21:44:04,786 - root - INFO - lr: 4.1016e-05 gnorm: 9.50 [ 1:26:48<1 day, 17:17:27]
519
+ [titan] 2026-01-06 21:44:46,319 - root - INFO - step: 105 loss: 7.4534 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.37 mfu: 46.27%
520
+ [titan] 2026-01-06 21:44:46,319 - root - INFO - lr: 4.1406e-05 gnorm: 8.44 [ 1:27:30<1 day, 17:12:36]
521
+ [titan] 2026-01-06 21:45:27,838 - root - INFO - step: 106 loss: 7.4770 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.42 mfu: 46.29%
522
+ [titan] 2026-01-06 21:45:27,838 - root - INFO - lr: 4.1797e-05 gnorm: 10.56 [ 1:28:11<1 day, 17:07:48]
523
+ [titan] 2026-01-06 21:46:09,374 - root - INFO - step: 107 loss: 7.4382 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.36 mfu: 46.27%
524
+ [titan] 2026-01-06 21:46:09,374 - root - INFO - lr: 4.2188e-05 gnorm: 13.69 [ 1:28:53<1 day, 17:03:06]
525
+ [titan] 2026-01-06 21:46:50,902 - root - INFO - step: 108 loss: 7.4561 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.39 mfu: 46.28%
526
+ [titan] 2026-01-06 21:46:50,902 - root - INFO - lr: 4.2578e-05 gnorm: 8.69 [ 1:29:34<1 day, 16:58:28]
527
+ [titan] 2026-01-06 21:47:32,443 - root - INFO - step: 109 loss: 7.3967 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.34 mfu: 46.26%
528
+ [titan] 2026-01-06 21:47:32,443 - root - INFO - lr: 4.2969e-05 gnorm: 7.31 [ 1:30:16<1 day, 16:53:55]
529
+ [titan] 2026-01-06 21:48:13,976 - root - INFO - step: 110 loss: 7.4334 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.37 mfu: 46.27%
530
+ [titan] 2026-01-06 21:48:13,976 - root - INFO - lr: 4.3359e-05 gnorm: 25.38 [ 1:30:57<1 day, 16:49:25]
531
+ [titan] 2026-01-06 21:48:55,511 - root - INFO - step: 111 loss: 7.4360 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.36 mfu: 46.27%
532
+ [titan] 2026-01-06 21:48:55,511 - root - INFO - lr: 4.3750e-05 gnorm: 10.44 [ 1:31:39<1 day, 16:45:00]
533
+ [titan] 2026-01-06 21:49:37,059 - root - INFO - step: 112 loss: 7.5123 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.32 mfu: 46.26%
534
+ [titan] 2026-01-06 21:49:37,059 - root - INFO - lr: 4.4141e-05 gnorm: 16.88 [ 1:32:20<1 day, 16:40:39]
535
+ [titan] 2026-01-06 21:49:37,059 - root - INFO - Saving the checkpoint (or staging if async is enabled).
536
+ [titan] 2026-01-06 21:49:59,573 - root - INFO - [GC] GC collection invoked by checkpointer. 0.14 seconds.
537
+ [titan] 2026-01-06 21:49:59,574 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 22.51 seconds.
538
+ [titan] 2026-01-06 21:50:40,891 - root - INFO - step: 113 loss: 7.4803 memory: 71.94GiB(90.77%) tps: 1,027 tflops: 93.93 mfu: 30.11%
539
+ [titan] 2026-01-06 21:50:40,892 - root - INFO - lr: 4.4531e-05 gnorm: 13.06 [ 1:33:24<1 day, 16:46:06]
540
+ [titan] 2026-01-06 21:51:22,305 - root - INFO - step: 114 loss: 7.4859 memory: 71.94GiB(90.77%) tps: 1,582 tflops: 144.79 mfu: 46.41%
541
+ [titan] 2026-01-06 21:51:22,305 - root - INFO - lr: 4.4922e-05 gnorm: 16.50 [ 1:34:06<1 day, 16:41:44]
542
+ [titan] 2026-01-06 21:52:03,747 - root - INFO - step: 115 loss: 7.4151 memory: 71.94GiB(90.77%) tps: 1,581 tflops: 144.68 mfu: 46.37%
543
+ [titan] 2026-01-06 21:52:03,748 - root - INFO - lr: 4.5313e-05 gnorm: 13.94 [ 1:34:47<1 day, 16:37:26]
544
+ [titan] 2026-01-06 21:52:45,252 - root - INFO - step: 116 loss: 7.3814 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.47 mfu: 46.30%
545
+ [titan] 2026-01-06 21:52:45,252 - root - INFO - lr: 4.5703e-05 gnorm: 11.69 [ 1:35:29<1 day, 16:33:14]
546
+ [titan] 2026-01-06 21:53:26,759 - root - INFO - step: 117 loss: 7.4033 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.46 mfu: 46.30%
547
+ [titan] 2026-01-06 21:53:26,760 - root - INFO - lr: 4.6094e-05 gnorm: 9.31 [ 1:36:10<1 day, 16:29:06]
548
+ [titan] 2026-01-06 21:54:08,279 - root - INFO - step: 118 loss: 7.4721 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.42 mfu: 46.29%
549
+ [titan] 2026-01-06 21:54:08,279 - root - INFO - lr: 4.6484e-05 gnorm: 20.88 [ 1:36:52<1 day, 16:25:01]
550
+ [titan] 2026-01-06 21:54:49,813 - root - INFO - step: 119 loss: 7.4258 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.37 mfu: 46.27%
551
+ [titan] 2026-01-06 21:54:49,813 - root - INFO - lr: 4.6875e-05 gnorm: 16.62 [ 1:37:33<1 day, 16:21:00]
552
+ [titan] 2026-01-06 21:55:31,360 - root - INFO - step: 120 loss: 7.3951 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.32 mfu: 46.26%
553
+ [titan] 2026-01-06 21:55:31,360 - root - INFO - lr: 4.7266e-05 gnorm: 11.38 [ 1:38:15<1 day, 16:17:03]
554
+ [titan] 2026-01-06 21:56:12,904 - root - INFO - step: 121 loss: 7.3984 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.33 mfu: 46.26%
555
+ [titan] 2026-01-06 21:56:12,904 - root - INFO - lr: 4.7656e-05 gnorm: 10.19 [ 1:38:56<1 day, 16:13:09]
556
+ [titan] 2026-01-06 21:56:54,444 - root - INFO - step: 122 loss: 7.5098 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.34 mfu: 46.26%
557
+ [titan] 2026-01-06 21:56:54,444 - root - INFO - lr: 4.8047e-05 gnorm: 19.38 [ 1:39:38<1 day, 16:09:18]
558
+ [titan] 2026-01-06 21:57:35,983 - root - INFO - step: 123 loss: 7.4071 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.35 mfu: 46.27%
559
+ [titan] 2026-01-06 21:57:35,983 - root - INFO - lr: 4.8438e-05 gnorm: 13.25 [ 1:40:19<1 day, 16:05:30]
560
+ [titan] 2026-01-06 21:58:17,525 - root - INFO - step: 124 loss: 7.4271 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.34 mfu: 46.26%
561
+ [titan] 2026-01-06 21:58:17,525 - root - INFO - lr: 4.8828e-05 gnorm: 11.88 [ 1:41:01<1 day, 16:01:45]
562
+ [titan] 2026-01-06 21:58:59,075 - root - INFO - step: 125 loss: 7.3603 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.31 mfu: 46.25%
563
+ [titan] 2026-01-06 21:58:59,075 - root - INFO - lr: 4.9219e-05 gnorm: 11.50 [ 1:41:42<1 day, 15:58:03]
564
+ [titan] 2026-01-06 21:59:40,618 - root - INFO - step: 126 loss: 7.3625 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.33 mfu: 46.26%
565
+ [titan] 2026-01-06 21:59:40,619 - root - INFO - lr: 4.9609e-05 gnorm: 9.88 [ 1:42:24<1 day, 15:54:24]
566
+ [titan] 2026-01-06 22:00:22,155 - root - INFO - step: 127 loss: 7.3691 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.36 mfu: 46.27%
567
+ [titan] 2026-01-06 22:00:22,155 - root - INFO - lr: 5.0000e-05 gnorm: 11.88 [ 1:43:06<1 day, 15:50:48]
568
+ [titan] 2026-01-06 22:01:03,694 - root - INFO - step: 128 loss: 7.3331 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.35 mfu: 46.27%
569
+ [titan] 2026-01-06 22:01:03,694 - root - INFO - lr: 5.0391e-05 gnorm: 11.56 [ 1:43:47<1 day, 15:47:14]
570
+ [titan] 2026-01-06 22:01:03,694 - root - INFO - Saving the checkpoint (or staging if async is enabled).
571
+ [titan] 2026-01-06 22:01:24,068 - root - INFO - [GC] GC collection invoked by checkpointer. 0.20 seconds.
572
+ [titan] 2026-01-06 22:01:24,068 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 20.37 seconds.
573
+ [titan] 2026-01-06 22:02:05,453 - root - INFO - step: 129 loss: 7.2878 memory: 71.94GiB(90.77%) tps: 1,061 tflops: 97.09 mfu: 31.12%
574
+ [titan] 2026-01-06 22:02:05,454 - root - INFO - lr: 5.0781e-05 gnorm: 6.16 [ 1:44:49<1 day, 15:51:24]
575
+ [titan] 2026-01-06 22:02:46,875 - root - INFO - step: 130 loss: 7.7017 memory: 71.94GiB(90.77%) tps: 1,582 tflops: 144.76 mfu: 46.40%
576
+ [titan] 2026-01-06 22:02:46,876 - root - INFO - lr: 5.1172e-05 gnorm: 70.00 [ 1:45:30<1 day, 15:47:50]
577
+ [titan] 2026-01-06 22:03:28,339 - root - INFO - step: 131 loss: 7.5220 memory: 71.94GiB(90.77%) tps: 1,581 tflops: 144.61 mfu: 46.35%
578
+ [titan] 2026-01-06 22:03:28,339 - root - INFO - lr: 5.1562e-05 gnorm: 44.75 [ 1:46:12<1 day, 15:44:18]
579
+ [titan] 2026-01-06 22:04:09,859 - root - INFO - step: 132 loss: 7.4566 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.42 mfu: 46.29%
580
+ [titan] 2026-01-06 22:04:09,859 - root - INFO - lr: 5.1953e-05 gnorm: 13.50 [ 1:46:53<1 day, 15:40:51]
581
+ [titan] 2026-01-06 22:04:51,387 - root - INFO - step: 133 loss: 7.4026 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.39 mfu: 46.28%
582
+ [titan] 2026-01-06 22:04:51,387 - root - INFO - lr: 5.2344e-05 gnorm: 10.12 [ 1:47:35<1 day, 15:37:27]
583
+ [titan] 2026-01-06 22:05:32,919 - root - INFO - step: 134 loss: 7.4092 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.37 mfu: 46.27%
584
+ [titan] 2026-01-06 22:05:32,920 - root - INFO - lr: 5.2734e-05 gnorm: 14.88 [ 1:48:16<1 day, 15:34:04]
585
+ [titan] 2026-01-06 22:06:14,471 - root - INFO - step: 135 loss: 7.3827 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.30 mfu: 46.25%
586
+ [titan] 2026-01-06 22:06:14,471 - root - INFO - lr: 5.3125e-05 gnorm: 18.88 [ 1:48:58<1 day, 15:30:45]
587
+ [titan] 2026-01-06 22:06:56,027 - root - INFO - step: 136 loss: 7.4021 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.29 mfu: 46.25%
588
+ [titan] 2026-01-06 22:06:56,028 - root - INFO - lr: 5.3516e-05 gnorm: 12.81 [ 1:49:39<1 day, 15:27:28]
589
+ [titan] 2026-01-06 22:07:37,581 - root - INFO - step: 137 loss: 7.4064 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.30 mfu: 46.25%
590
+ [titan] 2026-01-06 22:07:37,581 - root - INFO - lr: 5.3906e-05 gnorm: 7.19 [ 1:50:21<1 day, 15:24:14]
591
+ [titan] 2026-01-06 22:08:19,129 - root - INFO - step: 138 loss: 7.4774 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.32 mfu: 46.26%
592
+ [titan] 2026-01-06 22:08:19,129 - root - INFO - lr: 5.4297e-05 gnorm: 22.62 [ 1:51:03<1 day, 15:21:01]
593
+ [titan] 2026-01-06 22:09:00,687 - root - INFO - step: 139 loss: 7.4281 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.28 mfu: 46.24%
594
+ [titan] 2026-01-06 22:09:00,688 - root - INFO - lr: 5.4688e-05 gnorm: 11.00 [ 1:51:44<1 day, 15:17:51]
595
+ [titan] 2026-01-06 22:09:42,228 - root - INFO - step: 140 loss: 7.5633 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.34 mfu: 46.26%
596
+ [titan] 2026-01-06 22:09:42,228 - root - INFO - lr: 5.5078e-05 gnorm: 19.75 [ 1:52:26<1 day, 15:14:42]
597
+ [titan] 2026-01-06 22:10:23,790 - root - INFO - step: 141 loss: 7.5423 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.27 mfu: 46.24%
598
+ [titan] 2026-01-06 22:10:23,790 - root - INFO - lr: 5.5469e-05 gnorm: 17.25 [ 1:53:07<1 day, 15:11:36]
599
+ [titan] 2026-01-06 22:11:05,349 - root - INFO - step: 142 loss: 7.4047 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.28 mfu: 46.24%
600
+ [titan] 2026-01-06 22:11:05,349 - root - INFO - lr: 5.5859e-05 gnorm: 9.94 [ 1:53:49<1 day, 15:08:33]
601
+ [titan] 2026-01-06 22:11:46,904 - root - INFO - step: 143 loss: 7.5261 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.29 mfu: 46.25%
602
+ [titan] 2026-01-06 22:11:46,905 - root - INFO - lr: 5.6250e-05 gnorm: 25.75 [ 1:54:30<1 day, 15:05:31]
603
+ [titan] 2026-01-06 22:12:28,461 - root - INFO - step: 144 loss: 7.4217 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.29 mfu: 46.25%
604
+ [titan] 2026-01-06 22:12:28,461 - root - INFO - lr: 5.6641e-05 gnorm: 18.00 [ 1:55:12<1 day, 15:02:30]
605
+ [titan] 2026-01-06 22:12:28,461 - root - INFO - Saving the checkpoint (or staging if async is enabled).
606
+ [titan] 2026-01-06 22:12:50,172 - root - INFO - [GC] GC collection invoked by checkpointer. 0.19 seconds.
607
+ [titan] 2026-01-06 22:12:50,172 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 21.71 seconds.
608
+ [titan] 2026-01-06 22:13:31,510 - root - INFO - step: 145 loss: 7.3958 memory: 71.94GiB(90.77%) tps: 1,039 tflops: 95.10 mfu: 30.48%
609
+ [titan] 2026-01-06 22:13:31,510 - root - INFO - lr: 5.7031e-05 gnorm: 11.69 [ 1:56:15<1 day, 15:06:46]
610
+ [titan] 2026-01-06 22:14:12,944 - root - INFO - step: 146 loss: 7.4073 memory: 71.94GiB(90.77%) tps: 1,582 tflops: 144.71 mfu: 46.38%
611
+ [titan] 2026-01-06 22:14:12,944 - root - INFO - lr: 5.7422e-05 gnorm: 11.25 [ 1:56:56<1 day, 15:03:44]
612
+ [titan] 2026-01-06 22:14:54,370 - root - INFO - step: 147 loss: 7.3301 memory: 71.94GiB(90.77%) tps: 1,582 tflops: 144.74 mfu: 46.39%
613
+ [titan] 2026-01-06 22:14:54,371 - root - INFO - lr: 5.7813e-05 gnorm: 7.34 [ 1:57:38<1 day, 15:00:44]
614
+ [titan] 2026-01-06 22:15:35,825 - root - INFO - step: 148 loss: 7.3624 memory: 71.94GiB(90.77%) tps: 1,581 tflops: 144.64 mfu: 46.36%
615
+ [titan] 2026-01-06 22:15:35,825 - root - INFO - lr: 5.8203e-05 gnorm: 17.38 [ 1:58:19<1 day, 14:57:47]
616
+ [titan] 2026-01-06 22:16:17,356 - root - INFO - step: 149 loss: 7.2913 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.37 mfu: 46.27%
617
+ [titan] 2026-01-06 22:16:17,357 - root - INFO - lr: 5.8594e-05 gnorm: 3.80 [ 1:59:01<1 day, 14:54:52]
618
+ [titan] 2026-01-06 22:16:17,393 - root - INFO - [GC] Peforming periodical GC collection. 0.04 seconds.
619
+ [titan] 2026-01-06 22:16:58,923 - root - INFO - step: 150 loss: 7.3146 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.25 mfu: 46.23%
620
+ [titan] 2026-01-06 22:16:58,923 - root - INFO - lr: 5.8984e-05 gnorm: 7.06 [ 1:59:42<1 day, 14:52:01]
logs/none_4cvjdbqa/attempt_0/2/stdout.log ADDED
File without changes
logs/none_4cvjdbqa/attempt_0/3/stderr.log ADDED
@@ -0,0 +1,620 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [titan] 2026-01-06 20:23:28,613 - root - INFO - Starting job: default job
2
+ [titan] 2026-01-06 20:23:28,613 - root - INFO - {
3
+ "activation_checkpoint": {
4
+ "mode": "none",
5
+ "selective_ac_option": "2"
6
+ },
7
+ "activation_offload": {
8
+ "mode": "none"
9
+ },
10
+ "checkpoint": {
11
+ "async_mode": "disabled",
12
+ "create_seed_checkpoint": false,
13
+ "enable_checkpoint": true,
14
+ "exclude_from_loading": [],
15
+ "export_dtype": "float32",
16
+ "folder": "checkpoint",
17
+ "initial_load_model_weights_only": true,
18
+ "initial_load_path": null,
19
+ "interval": 16,
20
+ "interval_type": "steps",
21
+ "keep_latest_k": 0,
22
+ "last_save_model_weights_only": false,
23
+ "load_step": -1,
24
+ "model_weights_only": false
25
+ },
26
+ "comm": {
27
+ "init_timeout_seconds": 300,
28
+ "trace_buf_size": 20000,
29
+ "train_timeout_seconds": 100
30
+ },
31
+ "experimental": {
32
+ "context_parallel_degree": 1,
33
+ "context_parallel_rotate_method": "allgather",
34
+ "custom_model_path": "",
35
+ "enable_async_tensor_parallel": false,
36
+ "enable_compiled_autograd": false,
37
+ "pipeline_parallel_degree": 1,
38
+ "pipeline_parallel_microbatches": null,
39
+ "pipeline_parallel_schedule": "1F1B",
40
+ "pipeline_parallel_schedule_csv": "",
41
+ "pipeline_parallel_split_points": []
42
+ },
43
+ "fault_tolerance": {
44
+ "enable": false,
45
+ "group_size": 0,
46
+ "min_replica_size": 1,
47
+ "replica_id": 0
48
+ },
49
+ "float8": {
50
+ "enable_fsdp_float8_all_gather": false,
51
+ "force_recompute_fp8_weight_in_bwd": false,
52
+ "precompute_float8_dynamic_scale_for_fsdp": false,
53
+ "recipe_name": null
54
+ },
55
+ "job": {
56
+ "config_file": "flame/models/fla.toml",
57
+ "description": "default job",
58
+ "dump_folder": "/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B",
59
+ "print_args": true,
60
+ "use_for_integration_test": false
61
+ },
62
+ "lr_scheduler": {
63
+ "decay_ratio": null,
64
+ "decay_type": "cosine",
65
+ "lr_min": 0.1,
66
+ "warmup_steps": 1024
67
+ },
68
+ "memory_estimation": {
69
+ "disable_fake_mode": false,
70
+ "enabled": false
71
+ },
72
+ "metrics": {
73
+ "disable_color_printing": false,
74
+ "enable_tensorboard": false,
75
+ "enable_wandb": true,
76
+ "log_freq": 1,
77
+ "save_for_all_ranks": false,
78
+ "save_tb_folder": "tb"
79
+ },
80
+ "model": {
81
+ "config": "/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/config.json",
82
+ "converters": [],
83
+ "name": "fla",
84
+ "print_after_conversion": false,
85
+ "tokenizer_path": "/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B"
86
+ },
87
+ "optimizer": {
88
+ "beta1": 0.9,
89
+ "beta2": 0.95,
90
+ "early_step_in_backward": false,
91
+ "eps": 1e-15,
92
+ "implementation": "fused",
93
+ "lr": 0.0004,
94
+ "name": "AdamW",
95
+ "weight_decay": 0.1
96
+ },
97
+ "profiling": {
98
+ "enable_memory_snapshot": false,
99
+ "enable_profiling": true,
100
+ "profile_freq": 512,
101
+ "save_memory_snapshot_folder": "memory_snapshot",
102
+ "save_traces_folder": "profile_trace"
103
+ },
104
+ "training": {
105
+ "batch_size": 2,
106
+ "compile": true,
107
+ "context_len": 2048,
108
+ "data_dir": null,
109
+ "data_files": null,
110
+ "data_parallel_replicate_degree": 1,
111
+ "data_parallel_shard_degree": 8,
112
+ "data_probs": null,
113
+ "dataset": "/mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu",
114
+ "dataset_name": "default",
115
+ "dataset_split": "train",
116
+ "deterministic": false,
117
+ "disable_loss_parallel": true,
118
+ "enable_cpu_offload": false,
119
+ "fsdp_reshard_after_forward": "default",
120
+ "gc_freq": 50,
121
+ "gradient_accumulation_steps": 16,
122
+ "max_norm": 1.0,
123
+ "mixed_precision_param": "bfloat16",
124
+ "mixed_precision_reduce": "float32",
125
+ "num_workers": 8,
126
+ "persistent_workers": false,
127
+ "pin_memory": false,
128
+ "prefetch_factor": 2,
129
+ "seed": 42,
130
+ "seq_len": 2048,
131
+ "skip_nan_inf": true,
132
+ "steps": 3072,
133
+ "streaming": true,
134
+ "tensor_parallel_degree": 1,
135
+ "varlen": false
136
+ }
137
+ }
138
+ [titan] 2026-01-06 20:23:28,614 - root - INFO - [GC] Initial GC collection. 0.00 seconds.
139
+ [titan] 2026-01-06 20:23:29,998 - root - WARNING - ENV[TORCH_NCCL_ASYNC_ERROR_HANDLING] = 1 will be overridden to 3 based on job config
140
+ [titan] 2026-01-06 20:23:30,002 - root - INFO - CUDA capacity: NVIDIA A100-SXM4-80GB with 79.25GiB memory
141
+ [titan] 2026-01-06 20:23:30,004 - root - WARNING - Error running lspci: [Errno 2] No such file or directory: 'lspci', fallback to use device_name
142
+ [titan] 2026-01-06 20:23:30,004 - root - INFO - Peak FLOPS used for computing MFU: 3.120e+14
143
+ [titan] 2026-01-06 20:23:30,004 - root - INFO - Building 1-D device mesh with ['dp_shard'], [8]
144
+ [titan] 2026-01-06 20:23:30,051 - root - INFO - Loading tokenizer...
145
+ The tokenizer you are loading from '/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.
146
+ [titan] 2026-01-06 20:23:30,412 - root - INFO - Qwen2TokenizerFast(name_or_path='/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B', vocab_size=151643, model_max_length=10000000000, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'eos_token': '<|im_end|>', 'pad_token': '<|endoftext|>', 'additional_special_tokens': ['<|im_start|>', '<|im_end|>', '<|object_ref_start|>', '<|object_ref_end|>', '<|box_start|>', '<|box_end|>', '<|quad_start|>', '<|quad_end|>', '<|vision_start|>', '<|vision_end|>', '<|vision_pad|>', '<|image_pad|>', '<|video_pad|>']}, clean_up_tokenization_spaces=False, added_tokens_decoder={
147
+ 151643: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
148
+ 151644: AddedToken("<|im_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
149
+ 151645: AddedToken("<|im_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
150
+ 151646: AddedToken("<|object_ref_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
151
+ 151647: AddedToken("<|object_ref_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
152
+ 151648: AddedToken("<|box_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
153
+ 151649: AddedToken("<|box_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
154
+ 151650: AddedToken("<|quad_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
155
+ 151651: AddedToken("<|quad_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
156
+ 151652: AddedToken("<|vision_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
157
+ 151653: AddedToken("<|vision_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
158
+ 151654: AddedToken("<|vision_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
159
+ 151655: AddedToken("<|image_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
160
+ 151656: AddedToken("<|video_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
161
+ 151657: AddedToken("<tool_call>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
162
+ 151658: AddedToken("</tool_call>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
163
+ 151659: AddedToken("<|fim_prefix|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
164
+ 151660: AddedToken("<|fim_middle|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
165
+ 151661: AddedToken("<|fim_suffix|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
166
+ 151662: AddedToken("<|fim_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
167
+ 151663: AddedToken("<|repo_name|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
168
+ 151664: AddedToken("<|file_sep|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
169
+ 151665: AddedToken("<tool_response>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
170
+ 151666: AddedToken("</tool_response>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
171
+ 151667: AddedToken("<think>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
172
+ 151668: AddedToken("</think>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
173
+ }
174
+ )
175
+ [titan] 2026-01-06 20:23:30,412 - root - INFO - Loading dataset /mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu:default
176
+ `trust_remote_code` is not supported anymore.
177
+ Please check that the Hugging Face dataset '/mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu' isn't based on a loading script and remove `trust_remote_code`.
178
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
179
+ [titan] 2026-01-06 20:23:30,412 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
180
+ Please check that the Hugging Face dataset '/mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu' isn't based on a loading script and remove `trust_remote_code`.
181
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
182
+ [titan] 2026-01-06 20:23:30,977 - root - INFO - Shuffling the dataset with seed 42
183
+ [titan] 2026-01-06 20:23:30,978 - root - INFO - IterableDataset({
184
+ features: ['text', 'id', 'dump', 'url', 'file_path', 'language', 'language_score', 'token_count', 'score', 'int_score'],
185
+ num_shards: 360
186
+ })
187
+ [titan] 2026-01-06 20:23:30,978 - root - INFO - Building dataloader...
188
+ [titan] 2026-01-06 20:23:30,980 - root - INFO - Loading model config from /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/config.json
189
+ [titan] 2026-01-06 20:23:30,981 - root - INFO - Building model from the config
190
+ GSAConfig {
191
+ "architectures": [
192
+ "GSAForCausalLM"
193
+ ],
194
+ "attn": null,
195
+ "bos_token_id": 151643,
196
+ "clamp_max": null,
197
+ "clamp_min": null,
198
+ "conv_size": 4,
199
+ "dtype": "bfloat16",
200
+ "elementwise_affine": false,
201
+ "eos_token_id": 151645,
202
+ "expand_k": 1,
203
+ "expand_v": 1,
204
+ "feature_map": "swish",
205
+ "fuse_cross_entropy": true,
206
+ "fuse_linear_cross_entropy": false,
207
+ "fuse_norm": true,
208
+ "fuse_swiglu": true,
209
+ "gate_logit_normalizer": 8,
210
+ "hidden_act": "swish",
211
+ "hidden_ratio": 4,
212
+ "hidden_size": 5120,
213
+ "initializer_range": 0.02,
214
+ "intermediate_size": 17408,
215
+ "max_position_embeddings": 40960,
216
+ "model_type": "gsa",
217
+ "norm_eps": 1e-06,
218
+ "num_heads": 40,
219
+ "num_hidden_layers": 40,
220
+ "num_kv_heads": 8,
221
+ "num_slots": 256,
222
+ "rope_theta": 1000000,
223
+ "share_conv_kernel": true,
224
+ "tie_word_embeddings": true,
225
+ "transformers_version": "4.57.3",
226
+ "use_cache": true,
227
+ "use_l2warp": false,
228
+ "use_norm": true,
229
+ "use_output_gate": true,
230
+ "use_rope": false,
231
+ "use_short_conv": false,
232
+ "vocab_size": 151936
233
+ }
234
+ 
235
+ [titan] 2026-01-06 20:23:31,129 - root - INFO - 
236
+ GSAForCausalLM(
237
+ (model): GSAModel(
238
+ (embeddings): Embedding(151936, 5120)
239
+ (layers): ModuleList(
240
+ (0-39): 40 x GSABlock(
241
+ (attn_norm): RMSNorm(5120, eps=1e-06)
242
+ (attn): GatedSlotAttention(
243
+ (feature_map): SwishFeatureMap()
244
+ (q_proj): Linear(in_features=5120, out_features=5120, bias=False)
245
+ (k_proj): Linear(in_features=5120, out_features=1024, bias=False)
246
+ (v_proj): Linear(in_features=5120, out_features=1024, bias=False)
247
+ (f_proj): Linear(in_features=5120, out_features=2048, bias=False)
248
+ (g_norm): RMSNorm(5120, elementwise_affine=False, eps=1e-06)
249
+ (o_proj): Linear(in_features=5120, out_features=5120, bias=False)
250
+ )
251
+ (mlp_norm): RMSNorm(5120, eps=1e-06)
252
+ (mlp): GatedMLP(
253
+ (gate_proj): Linear(in_features=5120, out_features=17408, bias=False)
254
+ (up_proj): Linear(in_features=5120, out_features=17408, bias=False)
255
+ (down_proj): Linear(in_features=17408, out_features=5120, bias=False)
256
+ (swiglu_linear): SwiGLULinear()
257
+ )
258
+ )
259
+ )
260
+ (norm): RMSNorm(5120, eps=1e-06)
261
+ )
262
+ (lm_head): Linear(in_features=5120, out_features=151936, bias=False)
263
+ )
264
+
265
+ [titan] 2026-01-06 20:23:31,186 - root - INFO - Compiling each block with torch.compile
266
+ [titan] 2026-01-06 20:23:31,186 - root - INFO - Compiling the embedding, norm, and lm_head layers with torch.compile
267
+ [titan] 2026-01-06 20:23:31,187 - root - INFO - Compiling the entire model with torch.compile
268
+ [titan] 2026-01-06 20:23:31,334 - root - INFO - Applied FSDP to the model
269
+ [titan] 2026-01-06 20:23:31,714 - root - INFO - CUDA memory usage for model: 3.56GiB(4.49%)
270
+ [titan] 2026-01-06 20:23:31,761 - root - INFO - Checkpointing active. Checkpoints will be loaded from and saved to /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/checkpoint
271
+ [titan] 2026-01-06 20:23:31,763 - root - INFO - Loading the checkpoint from /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/checkpoint/step-1.
272
+ [titan] 2026-01-06 20:24:20,456 - root - INFO - [GC] GC collection for checkpoint loading. 0.02 seconds.
273
+ [titan] 2026-01-06 20:24:20,457 - root - INFO - Finished loading the checkpoint in 48.69 seconds.
274
+ [titan] 2026-01-06 20:24:20,674 - root - INFO - CUDA capacity: NVIDIA A100-SXM4-80GB with 79.25GiB memory
275
+ [titan] 2026-01-06 20:24:20,676 - root - WARNING - Error running lspci: [Errno 2] No such file or directory: 'lspci', fallback to use device_name
276
+ [titan] 2026-01-06 20:24:23,243 - root - INFO - Mixed precision training is handled by fully_shard
277
+ [titan] 2026-01-06 20:24:23,243 - root - INFO - ***** Running training *****
278
+ [titan] 2026-01-06 20:24:23,243 - root - INFO -  Training starts at step 2
279
+ [titan] 2026-01-06 20:24:23,243 - root - INFO -  Number of tokens per sequence = 2,048
280
+ [titan] 2026-01-06 20:24:23,243 - root - INFO -  Gradient Accumulation steps = 16
281
+ [titan] 2026-01-06 20:24:23,243 - root - INFO -  Instantaneous batch size (per device) = 2
282
+ [titan] 2026-01-06 20:24:23,243 - root - INFO -  Global batch size (w. parallel, distributed & accumulation) = 256 (524,288 tokens)
283
+ [titan] 2026-01-06 20:24:23,243 - root - INFO -  Total optimization steps = 3,072 (1,610,612,736 tokens)
284
+ [titan] 2026-01-06 20:24:23,243 - root - INFO -  Warmup steps = 1,024 (536,870,912 tokens)
285
+ [titan] 2026-01-06 20:24:23,243 - root - INFO -  Number of parameters = 14,409,815,040 
286
+ [titan] 2026-01-06 20:24:23,243 - root - INFO - Profiling active. Traces will be saved at /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/profile_trace
287
+ /mnt/scratch/home/carlos/leman-labs/flame/.venv/lib/python3.13/site-packages/torch/_dynamo/variables/functions.py:1692: UserWarning: Dynamo detected a call to a `functools.lru_cache`-wrapped function. Dynamo ignores the cache wrapper and directly traces the wrapped function. Silent incorrectness is only a *potential* risk, not something we have observed. Enable TORCH_LOGS="+dynamo" for a DEBUG stack trace.
288
+ torch._dynamo.utils.warn_once(msg)
289
+ /mnt/scratch/home/carlos/leman-labs/flame/.venv/lib/python3.13/site-packages/torch/_dynamo/variables/functions.py:1598: UserWarning: Dynamo does not know how to trace the builtin `cuda_utils.get_device_properties.` This function is either a Python builtin (e.g. _warnings.warn) or a third-party C/C++ Python extension (perhaps created with pybind).
290
+ If it is a Python builtin, please file an issue on GitHub so the PyTorch team can add support for it and see the next case for a workaround.
291
+ If it is a third-party C/C++ Python extension, please either wrap it into a PyTorch-understood custom operator (see https://pytorch.org/tutorials/advanced/custom_ops_landing_page.html for more details) or, if it is traceable, use `torch.compiler.allow_in_graph`.
292
+ torch._dynamo.utils.warn_once(explanation + "\n" + "\n".join(hints))
293
+ [titan] 2026-01-06 20:31:17,558 - root - INFO - step: 2 loss: 14.3989 memory: 71.94GiB(90.77%) tps: 157 tflops: 14.38 mfu: 4.61%
294
+ [titan] 2026-01-06 20:31:17,558 - root - INFO - lr: 1.1719e-06 gnorm: 127.00 [ 0:14:01<14 days, 22:49:11]
295
+ [titan] 2026-01-06 20:31:58,854 - root - INFO - step: 3 loss: 14.3925 memory: 71.94GiB(90.77%) tps: 1,587 tflops: 145.20 mfu: 46.54%
296
+ [titan] 2026-01-06 20:31:58,854 - root - INFO - lr: 1.5625e-06 gnorm: 126.00 [ 0:14:42<10 days, 10:52:12]
297
+ [titan] 2026-01-06 20:32:40,204 - root - INFO - step: 4 loss: 14.2932 memory: 71.94GiB(90.77%) tps: 1,585 tflops: 145.01 mfu: 46.48%
298
+ [titan] 2026-01-06 20:32:40,205 - root - INFO - lr: 1.9531e-06 gnorm: 125.50 [ 0:15:24<8 days, 4:54:04]
299
+ [titan] 2026-01-06 20:33:21,589 - root - INFO - step: 5 loss: 14.2679 memory: 71.94GiB(90.77%) tps: 1,584 tflops: 144.89 mfu: 46.44%
300
+ [titan] 2026-01-06 20:33:21,590 - root - INFO - lr: 2.3438e-06 gnorm: 123.50 [ 0:16:05<6 days, 20:31:15]
301
+ [titan] 2026-01-06 20:34:03,035 - root - INFO - step: 6 loss: 13.9921 memory: 71.94GiB(90.77%) tps: 1,581 tflops: 144.67 mfu: 46.37%
302
+ [titan] 2026-01-06 20:34:03,035 - root - INFO - lr: 2.7344e-06 gnorm: 117.50 [ 0:16:47<5 days, 22:56:20]
303
+ [titan] 2026-01-06 20:34:44,524 - root - INFO - step: 7 loss: 13.8102 memory: 71.94GiB(90.77%) tps: 1,580 tflops: 144.52 mfu: 46.32%
304
+ [titan] 2026-01-06 20:34:44,524 - root - INFO - lr: 3.1250e-06 gnorm: 112.50 [ 0:17:28<5 days, 7:31:31]
305
+ [titan] 2026-01-06 20:35:25,989 - root - INFO - step: 8 loss: 13.5609 memory: 71.94GiB(90.77%) tps: 1,581 tflops: 144.60 mfu: 46.35%
306
+ [titan] 2026-01-06 20:35:25,990 - root - INFO - lr: 3.5156e-06 gnorm: 106.50 [ 0:18:09<4 days, 19:57:35]
307
+ [titan] 2026-01-06 20:36:07,480 - root - INFO - step: 9 loss: 13.3683 memory: 71.94GiB(90.77%) tps: 1,580 tflops: 144.52 mfu: 46.32%
308
+ [titan] 2026-01-06 20:36:07,480 - root - INFO - lr: 3.9063e-06 gnorm: 101.00 [ 0:18:51<4 days, 10:57:50]
309
+ [titan] 2026-01-06 20:36:48,975 - root - INFO - step: 10 loss: 13.1018 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.50 mfu: 46.32%
310
+ [titan] 2026-01-06 20:36:48,975 - root - INFO - lr: 4.2969e-06 gnorm: 94.00 [ 0:19:32<4 days, 3:45:55]
311
+ [titan] 2026-01-06 20:37:30,471 - root - INFO - step: 11 loss: 12.5407 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.50 mfu: 46.31%
312
+ [titan] 2026-01-06 20:37:30,471 - root - INFO - lr: 4.6875e-06 gnorm: 82.00 [ 0:20:14<3 days, 21:52:25]
313
+ [titan] 2026-01-06 20:38:11,960 - root - INFO - step: 12 loss: 12.0106 memory: 71.94GiB(90.77%) tps: 1,580 tflops: 144.52 mfu: 46.32%
314
+ [titan] 2026-01-06 20:38:11,961 - root - INFO - lr: 5.0781e-06 gnorm: 71.50 [ 0:20:55<3 days, 16:57:42]
315
+ [titan] 2026-01-06 20:38:53,462 - root - INFO - step: 13 loss: 11.5957 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.48 mfu: 46.31%
316
+ [titan] 2026-01-06 20:38:53,463 - root - INFO - lr: 5.4687e-06 gnorm: 68.00 [ 0:21:37<3 days, 12:48:15]
317
+ [titan] 2026-01-06 20:39:34,955 - root - INFO - step: 14 loss: 11.2380 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.51 mfu: 46.32%
318
+ [titan] 2026-01-06 20:39:34,955 - root - INFO - lr: 5.8594e-06 gnorm: 63.25 [ 0:22:18<3 days, 9:14:19]
319
+ [titan] 2026-01-06 20:40:16,456 - root - INFO - step: 15 loss: 10.9153 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.48 mfu: 46.31%
320
+ [titan] 2026-01-06 20:40:16,457 - root - INFO - lr: 6.2500e-06 gnorm: 55.50 [ 0:23:00<3 days, 6:08:50]
321
+ [titan] 2026-01-06 20:40:57,973 - root - INFO - step: 16 loss: 10.6864 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.42 mfu: 46.29%
322
+ [titan] 2026-01-06 20:40:57,974 - root - INFO - lr: 6.6406e-06 gnorm: 57.00 [ 0:23:41<3 days, 3:26:30]
323
+ [titan] 2026-01-06 20:40:57,974 - root - INFO - Saving the checkpoint (or staging if async is enabled).
324
+ [titan] 2026-01-06 20:41:27,071 - root - INFO - [GC] GC collection invoked by checkpointer. 0.63 seconds.
325
+ [titan] 2026-01-06 20:41:27,071 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 29.10 seconds.
326
+ [titan] 2026-01-06 20:42:08,985 - root - INFO - step: 17 loss: 10.3828 memory: 71.94GiB(90.77%) tps: 923 tflops: 84.44 mfu: 27.06%
327
+ [titan] 2026-01-06 20:42:08,986 - root - INFO - lr: 7.0313e-06 gnorm: 42.50 [ 0:24:52<3 days, 2:31:32]
328
+ [titan] 2026-01-06 20:42:50,422 - root - INFO - step: 18 loss: 10.1659 memory: 71.94GiB(90.77%) tps: 1,582 tflops: 144.71 mfu: 46.38%
329
+ [titan] 2026-01-06 20:42:50,422 - root - INFO - lr: 7.4219e-06 gnorm: 32.50 [ 0:25:34<3 days, 0:18:54]
330
+ [titan] 2026-01-06 20:43:31,924 - root - INFO - step: 19 loss: 9.9749 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.48 mfu: 46.31%
331
+ [titan] 2026-01-06 20:43:31,925 - root - INFO - lr: 7.8125e-06 gnorm: 26.88 [ 0:26:15<2 days, 22:20:20]
332
+ [titan] 2026-01-06 20:44:13,451 - root - INFO - step: 20 loss: 9.8084 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.39 mfu: 46.28%
333
+ [titan] 2026-01-06 20:44:13,451 - root - INFO - lr: 8.2031e-06 gnorm: 25.62 [ 0:26:57<2 days, 20:33:38]
334
+ [titan] 2026-01-06 20:44:54,968 - root - INFO - step: 21 loss: 9.6201 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.43 mfu: 46.29%
335
+ [titan] 2026-01-06 20:44:54,968 - root - INFO - lr: 8.5938e-06 gnorm: 26.88 [ 0:27:38<2 days, 18:56:59]
336
+ [titan] 2026-01-06 20:45:36,491 - root - INFO - step: 22 loss: 9.4905 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.40 mfu: 46.28%
337
+ [titan] 2026-01-06 20:45:36,491 - root - INFO - lr: 8.9844e-06 gnorm: 25.50 [ 0:28:20<2 days, 17:29:05]
338
+ [titan] 2026-01-06 20:46:18,035 - root - INFO - step: 23 loss: 9.2526 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.33 mfu: 46.26%
339
+ [titan] 2026-01-06 20:46:18,035 - root - INFO - lr: 9.3750e-06 gnorm: 19.12 [ 0:29:02<2 days, 16:08:48]
340
+ [titan] 2026-01-06 20:46:59,563 - root - INFO - step: 24 loss: 9.0528 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.39 mfu: 46.28%
341
+ [titan] 2026-01-06 20:46:59,563 - root - INFO - lr: 9.7656e-06 gnorm: 17.00 [ 0:29:43<2 days, 14:55:08]
342
+ [titan] 2026-01-06 20:47:41,099 - root - INFO - step: 25 loss: 8.8601 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.36 mfu: 46.27%
343
+ [titan] 2026-01-06 20:47:41,099 - root - INFO - lr: 1.0156e-05 gnorm: 14.06 [ 0:30:25<2 days, 13:47:18]
344
+ [titan] 2026-01-06 20:48:22,630 - root - INFO - step: 26 loss: 8.7360 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.38 mfu: 46.27%
345
+ [titan] 2026-01-06 20:48:22,630 - root - INFO - lr: 1.0547e-05 gnorm: 15.44 [ 0:31:06<2 days, 12:44:38]
346
+ [titan] 2026-01-06 20:49:04,178 - root - INFO - step: 27 loss: 8.6182 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.31 mfu: 46.25%
347
+ [titan] 2026-01-06 20:49:04,179 - root - INFO - lr: 1.0937e-05 gnorm: 10.25 [ 0:31:48<2 days, 11:46:36]
348
+ [titan] 2026-01-06 20:49:45,725 - root - INFO - step: 28 loss: 8.5142 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.32 mfu: 46.26%
349
+ [titan] 2026-01-06 20:49:45,725 - root - INFO - lr: 1.1328e-05 gnorm: 9.00 [ 0:32:29<2 days, 10:52:38]
350
+ [titan] 2026-01-06 20:50:27,274 - root - INFO - step: 29 loss: 8.4770 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.31 mfu: 46.25%
351
+ [titan] 2026-01-06 20:50:27,274 - root - INFO - lr: 1.1719e-05 gnorm: 9.44 [ 0:33:11<2 days, 10:02:22]
352
+ [titan] 2026-01-06 20:51:08,813 - root - INFO - step: 30 loss: 8.3888 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.35 mfu: 46.27%
353
+ [titan] 2026-01-06 20:51:08,813 - root - INFO - lr: 1.2109e-05 gnorm: 7.06 [ 0:33:52<2 days, 9:15:23]
354
+ [titan] 2026-01-06 20:51:50,370 - root - INFO - step: 31 loss: 8.3098 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.29 mfu: 46.25%
355
+ [titan] 2026-01-06 20:51:50,370 - root - INFO - lr: 1.2500e-05 gnorm: 5.38 [ 0:34:34<2 days, 8:31:25]
356
+ [titan] 2026-01-06 20:52:31,910 - root - INFO - step: 32 loss: 8.2507 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.35 mfu: 46.26%
357
+ [titan] 2026-01-06 20:52:31,910 - root - INFO - lr: 1.2891e-05 gnorm: 6.97 [ 0:35:15<2 days, 7:50:07]
358
+ [titan] 2026-01-06 20:52:31,910 - root - INFO - Saving the checkpoint (or staging if async is enabled).
359
+ [titan] 2026-01-06 20:52:52,190 - root - INFO - [GC] GC collection invoked by checkpointer. 0.20 seconds.
360
+ [titan] 2026-01-06 20:52:52,190 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 20.28 seconds.
361
+ [titan] 2026-01-06 20:53:33,590 - root - INFO - step: 33 loss: 8.1782 memory: 71.94GiB(90.77%) tps: 1,063 tflops: 97.21 mfu: 31.16%
362
+ [titan] 2026-01-06 20:53:33,591 - root - INFO - lr: 1.3281e-05 gnorm: 4.94 [ 0:36:17<2 days, 7:42:12]
363
+ [titan] 2026-01-06 20:54:15,059 - root - INFO - step: 34 loss: 8.1399 memory: 71.94GiB(90.77%) tps: 1,580 tflops: 144.59 mfu: 46.34%
364
+ [titan] 2026-01-06 20:54:15,059 - root - INFO - lr: 1.3672e-05 gnorm: 4.62 [ 0:36:59<2 days, 7:04:35]
365
+ [titan] 2026-01-06 20:54:56,546 - root - INFO - step: 35 loss: 8.1046 memory: 71.94GiB(90.77%) tps: 1,580 tflops: 144.53 mfu: 46.32%
366
+ [titan] 2026-01-06 20:54:56,546 - root - INFO - lr: 1.4063e-05 gnorm: 4.69 [ 0:37:40<2 days, 6:29:07]
367
+ [titan] 2026-01-06 20:55:38,070 - root - INFO - step: 36 loss: 8.0122 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.40 mfu: 46.28%
368
+ [titan] 2026-01-06 20:55:38,070 - root - INFO - lr: 1.4453e-05 gnorm: 2.75 [ 0:38:22<2 days, 5:55:37]
369
+ [titan] 2026-01-06 20:56:19,603 - root - INFO - step: 37 loss: 8.0874 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.37 mfu: 46.27%
370
+ [titan] 2026-01-06 20:56:19,603 - root - INFO - lr: 1.4844e-05 gnorm: 4.84 [ 0:39:03<2 days, 5:23:55]
371
+ [titan] 2026-01-06 20:57:01,138 - root - INFO - step: 38 loss: 8.0173 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.37 mfu: 46.27%
372
+ [titan] 2026-01-06 20:57:01,138 - root - INFO - lr: 1.5234e-05 gnorm: 3.98 [ 0:39:45<2 days, 4:53:51]
373
+ [titan] 2026-01-06 20:57:42,670 - root - INFO - step: 39 loss: 8.0002 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.37 mfu: 46.27%
374
+ [titan] 2026-01-06 20:57:42,671 - root - INFO - lr: 1.5625e-05 gnorm: 3.81 [ 0:40:26<2 days, 4:25:16]
375
+ [titan] 2026-01-06 20:58:24,204 - root - INFO - step: 40 loss: 7.9606 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.37 mfu: 46.27%
376
+ [titan] 2026-01-06 20:58:24,204 - root - INFO - lr: 1.6016e-05 gnorm: 2.86 [ 0:41:08<2 days, 3:58:06]
377
+ [titan] 2026-01-06 20:59:05,739 - root - INFO - step: 41 loss: 7.9773 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.36 mfu: 46.27%
378
+ [titan] 2026-01-06 20:59:05,739 - root - INFO - lr: 1.6406e-05 gnorm: 3.56 [ 0:41:49<2 days, 3:32:13]
379
+ [titan] 2026-01-06 20:59:47,255 - root - INFO - step: 42 loss: 7.9890 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.43 mfu: 46.29%
380
+ [titan] 2026-01-06 20:59:47,256 - root - INFO - lr: 1.6797e-05 gnorm: 4.75 [ 0:42:31<2 days, 3:07:31]
381
+ [titan] 2026-01-06 21:00:28,788 - root - INFO - step: 43 loss: 7.9018 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.37 mfu: 46.27%
382
+ [titan] 2026-01-06 21:00:28,789 - root - INFO - lr: 1.7188e-05 gnorm: 3.48 [ 0:43:12<2 days, 2:43:57]
383
+ [titan] 2026-01-06 21:01:10,328 - root - INFO - step: 44 loss: 7.8441 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.35 mfu: 46.26%
384
+ [titan] 2026-01-06 21:01:10,328 - root - INFO - lr: 1.7578e-05 gnorm: 3.89 [ 0:43:54<2 days, 2:21:26]
385
+ [titan] 2026-01-06 21:01:51,869 - root - INFO - step: 45 loss: 7.8679 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.34 mfu: 46.26%
386
+ [titan] 2026-01-06 21:01:51,869 - root - INFO - lr: 1.7969e-05 gnorm: 6.41 [ 0:44:35<2 days, 1:59:53]
387
+ [titan] 2026-01-06 21:02:33,408 - root - INFO - step: 46 loss: 7.7830 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.35 mfu: 46.26%
388
+ [titan] 2026-01-06 21:02:33,408 - root - INFO - lr: 1.8359e-05 gnorm: 3.52 [ 0:45:17<2 days, 1:39:15]
389
+ [titan] 2026-01-06 21:03:14,961 - root - INFO - step: 47 loss: 7.8372 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.30 mfu: 46.25%
390
+ [titan] 2026-01-06 21:03:14,961 - root - INFO - lr: 1.8750e-05 gnorm: 2.22 [ 0:45:58<2 days, 1:19:28]
391
+ [titan] 2026-01-06 21:03:56,497 - root - INFO - step: 48 loss: 7.8147 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.36 mfu: 46.27%
392
+ [titan] 2026-01-06 21:03:56,497 - root - INFO - lr: 1.9141e-05 gnorm: 3.70 [ 0:46:40<2 days, 1:00:28]
393
+ [titan] 2026-01-06 21:03:56,497 - root - INFO - Saving the checkpoint (or staging if async is enabled).
394
+ [titan] 2026-01-06 21:04:16,564 - root - INFO - [GC] GC collection invoked by checkpointer. 0.18 seconds.
395
+ [titan] 2026-01-06 21:04:16,564 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 20.07 seconds.
396
+ [titan] 2026-01-06 21:04:57,970 - root - INFO - step: 49 loss: 7.6970 memory: 71.94GiB(90.77%) tps: 1,066 tflops: 97.54 mfu: 31.26%
397
+ [titan] 2026-01-06 21:04:57,970 - root - INFO - lr: 1.9531e-05 gnorm: 5.28 [ 0:47:41<2 days, 1:02:43]
398
+ [titan] 2026-01-06 21:04:57,981 - root - INFO - [GC] Peforming periodical GC collection. 0.01 seconds.
399
+ [titan] 2026-01-06 21:05:39,421 - root - INFO - step: 50 loss: 7.7536 memory: 71.94GiB(90.77%) tps: 1,581 tflops: 144.65 mfu: 46.36%
400
+ [titan] 2026-01-06 21:05:39,421 - root - INFO - lr: 1.9922e-05 gnorm: 4.06 [ 0:48:23<2 days, 0:44:39]
401
+ [titan] 2026-01-06 21:06:20,891 - root - INFO - step: 51 loss: 7.7578 memory: 71.94GiB(90.77%) tps: 1,580 tflops: 144.59 mfu: 46.34%
402
+ [titan] 2026-01-06 21:06:20,891 - root - INFO - lr: 2.0313e-05 gnorm: 5.03 [ 0:49:04<2 days, 0:27:18]
403
+ [titan] 2026-01-06 21:07:02,402 - root - INFO - step: 52 loss: 7.7586 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.45 mfu: 46.30%
404
+ [titan] 2026-01-06 21:07:02,402 - root - INFO - lr: 2.0703e-05 gnorm: 2.52 [ 0:49:46<2 days, 0:10:38]
405
+ [titan] 2026-01-06 21:07:43,930 - root - INFO - step: 53 loss: 7.7823 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.38 mfu: 46.28%
406
+ [titan] 2026-01-06 21:07:43,930 - root - INFO - lr: 2.1094e-05 gnorm: 11.69 [ 0:50:27<1 day, 23:54:34]
407
+ [titan] 2026-01-06 21:08:25,460 - root - INFO - step: 54 loss: 7.7454 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.38 mfu: 46.28%
408
+ [titan] 2026-01-06 21:08:25,460 - root - INFO - lr: 2.1484e-05 gnorm: 10.25 [ 0:51:09<1 day, 23:39:05]
409
+ [titan] 2026-01-06 21:09:07,002 - root - INFO - step: 55 loss: 7.6959 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.34 mfu: 46.26%
410
+ [titan] 2026-01-06 21:09:07,002 - root - INFO - lr: 2.1875e-05 gnorm: 3.77 [ 0:51:50<1 day, 23:24:09]
411
+ [titan] 2026-01-06 21:09:48,535 - root - INFO - step: 56 loss: 7.7100 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.37 mfu: 46.27%
412
+ [titan] 2026-01-06 21:09:48,536 - root - INFO - lr: 2.2266e-05 gnorm: 5.50 [ 0:52:32<1 day, 23:09:43]
413
+ [titan] 2026-01-06 21:10:30,084 - root - INFO - step: 57 loss: 7.6427 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.32 mfu: 46.26%
414
+ [titan] 2026-01-06 21:10:30,084 - root - INFO - lr: 2.2656e-05 gnorm: 3.45 [ 0:53:14<1 day, 22:55:47]
415
+ [titan] 2026-01-06 21:11:11,627 - root - INFO - step: 58 loss: 7.7081 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.33 mfu: 46.26%
416
+ [titan] 2026-01-06 21:11:11,628 - root - INFO - lr: 2.3047e-05 gnorm: 7.88 [ 0:53:55<1 day, 22:42:18]
417
+ [titan] 2026-01-06 21:11:53,169 - root - INFO - step: 59 loss: 7.6955 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.34 mfu: 46.26%
418
+ [titan] 2026-01-06 21:11:53,170 - root - INFO - lr: 2.3438e-05 gnorm: 7.16 [ 0:54:37<1 day, 22:29:15]
419
+ [titan] 2026-01-06 21:12:34,708 - root - INFO - step: 60 loss: 7.6458 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.35 mfu: 46.27%
420
+ [titan] 2026-01-06 21:12:34,708 - root - INFO - lr: 2.3828e-05 gnorm: 3.22 [ 0:55:18<1 day, 22:16:36]
421
+ [titan] 2026-01-06 21:13:16,244 - root - INFO - step: 61 loss: 7.6709 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.36 mfu: 46.27%
422
+ [titan] 2026-01-06 21:13:16,244 - root - INFO - lr: 2.4219e-05 gnorm: 7.56 [ 0:56:00<1 day, 22:04:21]
423
+ [titan] 2026-01-06 21:13:57,793 - root - INFO - step: 62 loss: 7.6777 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.31 mfu: 46.25%
424
+ [titan] 2026-01-06 21:13:57,793 - root - INFO - lr: 2.4609e-05 gnorm: 5.00 [ 0:56:41<1 day, 21:52:28]
425
+ [titan] 2026-01-06 21:14:39,339 - root - INFO - step: 63 loss: 7.6421 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.32 mfu: 46.26%
426
+ [titan] 2026-01-06 21:14:39,340 - root - INFO - lr: 2.5000e-05 gnorm: 6.81 [ 0:57:23<1 day, 21:40:57]
427
+ [titan] 2026-01-06 21:15:20,872 - root - INFO - step: 64 loss: 7.6401 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.37 mfu: 46.27%
428
+ [titan] 2026-01-06 21:15:20,872 - root - INFO - lr: 2.5391e-05 gnorm: 6.72 [ 0:58:04<1 day, 21:29:46]
429
+ [titan] 2026-01-06 21:15:20,872 - root - INFO - Saving the checkpoint (or staging if async is enabled).
430
+ [titan] 2026-01-06 21:15:41,928 - root - INFO - [GC] GC collection invoked by checkpointer. 0.17 seconds.
431
+ [titan] 2026-01-06 21:15:41,928 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 21.06 seconds.
432
+ [titan] 2026-01-06 21:16:23,249 - root - INFO - step: 65 loss: 7.6475 memory: 71.94GiB(90.77%) tps: 1,051 tflops: 96.13 mfu: 30.81%
433
+ [titan] 2026-01-06 21:16:23,249 - root - INFO - lr: 2.5781e-05 gnorm: 5.00 [ 0:59:07<1 day, 21:34:58]
434
+ [titan] 2026-01-06 21:17:04,689 - root - INFO - step: 66 loss: 7.7008 memory: 71.94GiB(90.77%) tps: 1,581 tflops: 144.69 mfu: 46.38%
435
+ [titan] 2026-01-06 21:17:04,689 - root - INFO - lr: 2.6172e-05 gnorm: 9.69 [ 0:59:48<1 day, 21:24:05]
436
+ [titan] 2026-01-06 21:17:46,153 - root - INFO - step: 67 loss: 7.6772 memory: 71.94GiB(90.77%) tps: 1,581 tflops: 144.61 mfu: 46.35%
437
+ [titan] 2026-01-06 21:17:46,153 - root - INFO - lr: 2.6563e-05 gnorm: 8.06 [ 1:00:30<1 day, 21:13:32]
438
+ [titan] 2026-01-06 21:18:27,650 - root - INFO - step: 68 loss: 7.6251 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.49 mfu: 46.31%
439
+ [titan] 2026-01-06 21:18:27,651 - root - INFO - lr: 2.6953e-05 gnorm: 7.88 [ 1:01:11<1 day, 21:03:18]
440
+ [titan] 2026-01-06 21:19:09,166 - root - INFO - step: 69 loss: 7.6183 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.43 mfu: 46.29%
441
+ [titan] 2026-01-06 21:19:09,166 - root - INFO - lr: 2.7344e-05 gnorm: 4.00 [ 1:01:53<1 day, 20:53:20]
442
+ [titan] 2026-01-06 21:19:50,686 - root - INFO - step: 70 loss: 7.6535 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.41 mfu: 46.29%
443
+ [titan] 2026-01-06 21:19:50,686 - root - INFO - lr: 2.7734e-05 gnorm: 17.75 [ 1:02:34<1 day, 20:43:39]
444
+ [titan] 2026-01-06 21:20:32,220 - root - INFO - step: 71 loss: 7.6713 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.37 mfu: 46.27%
445
+ [titan] 2026-01-06 21:20:32,221 - root - INFO - lr: 2.8125e-05 gnorm: 15.69 [ 1:03:16<1 day, 20:34:14]
446
+ [titan] 2026-01-06 21:21:13,759 - root - INFO - step: 72 loss: 7.5969 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.35 mfu: 46.27%
447
+ [titan] 2026-01-06 21:21:13,759 - root - INFO - lr: 2.8516e-05 gnorm: 5.00 [ 1:03:57<1 day, 20:25:04]
448
+ [titan] 2026-01-06 21:21:55,296 - root - INFO - step: 73 loss: 7.6514 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.35 mfu: 46.27%
449
+ [titan] 2026-01-06 21:21:55,296 - root - INFO - lr: 2.8906e-05 gnorm: 7.84 [ 1:04:39<1 day, 20:16:07]
450
+ [titan] 2026-01-06 21:22:36,834 - root - INFO - step: 74 loss: 7.6118 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.35 mfu: 46.27%
451
+ [titan] 2026-01-06 21:22:36,834 - root - INFO - lr: 2.9297e-05 gnorm: 5.53 [ 1:05:20<1 day, 20:07:24]
452
+ [titan] 2026-01-06 21:23:18,373 - root - INFO - step: 75 loss: 7.6545 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.35 mfu: 46.26%
453
+ [titan] 2026-01-06 21:23:18,373 - root - INFO - lr: 2.9687e-05 gnorm: 14.88 [ 1:06:02<1 day, 19:58:54]
454
+ [titan] 2026-01-06 21:23:59,908 - root - INFO - step: 76 loss: 7.6091 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.36 mfu: 46.27%
455
+ [titan] 2026-01-06 21:23:59,909 - root - INFO - lr: 3.0078e-05 gnorm: 15.25 [ 1:06:43<1 day, 19:50:35]
456
+ [titan] 2026-01-06 21:24:41,441 - root - INFO - step: 77 loss: 7.5815 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.37 mfu: 46.27%
457
+ [titan] 2026-01-06 21:24:41,441 - root - INFO - lr: 3.0469e-05 gnorm: 4.84 [ 1:07:25<1 day, 19:42:29]
458
+ [titan] 2026-01-06 21:25:22,982 - root - INFO - step: 78 loss: 7.6119 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.34 mfu: 46.26%
459
+ [titan] 2026-01-06 21:25:22,983 - root - INFO - lr: 3.0859e-05 gnorm: 9.06 [ 1:08:06<1 day, 19:34:34]
460
+ [titan] 2026-01-06 21:26:04,516 - root - INFO - step: 79 loss: 7.6418 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.37 mfu: 46.27%
461
+ [titan] 2026-01-06 21:26:04,516 - root - INFO - lr: 3.1250e-05 gnorm: 8.25 [ 1:08:48<1 day, 19:26:50]
462
+ [titan] 2026-01-06 21:26:46,049 - root - INFO - step: 80 loss: 7.5575 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.37 mfu: 46.27%
463
+ [titan] 2026-01-06 21:26:46,050 - root - INFO - lr: 3.1641e-05 gnorm: 6.97 [ 1:09:29<1 day, 19:19:17]
464
+ [titan] 2026-01-06 21:26:46,050 - root - INFO - Saving the checkpoint (or staging if async is enabled).
465
+ [titan] 2026-01-06 21:27:08,316 - root - INFO - [GC] GC collection invoked by checkpointer. 0.19 seconds.
466
+ [titan] 2026-01-06 21:27:08,316 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 22.27 seconds.
467
+ [titan] 2026-01-06 21:27:49,686 - root - INFO - step: 81 loss: 7.6005 memory: 71.94GiB(90.77%) tps: 1,030 tflops: 94.22 mfu: 30.20%
468
+ [titan] 2026-01-06 21:27:49,686 - root - INFO - lr: 3.2031e-05 gnorm: 7.19 [ 1:10:33<1 day, 19:25:30]
469
+ [titan] 2026-01-06 21:28:31,108 - root - INFO - step: 82 loss: 7.5774 memory: 71.94GiB(90.77%) tps: 1,582 tflops: 144.76 mfu: 46.40%
470
+ [titan] 2026-01-06 21:28:31,108 - root - INFO - lr: 3.2422e-05 gnorm: 5.62 [ 1:11:15<1 day, 19:18:02]
471
+ [titan] 2026-01-06 21:29:12,555 - root - INFO - step: 83 loss: 7.6207 memory: 71.94GiB(90.77%) tps: 1,581 tflops: 144.67 mfu: 46.37%
472
+ [titan] 2026-01-06 21:29:12,555 - root - INFO - lr: 3.2813e-05 gnorm: 4.69 [ 1:11:56<1 day, 19:10:45]
473
+ [titan] 2026-01-06 21:29:54,023 - root - INFO - step: 84 loss: 7.5734 memory: 71.94GiB(90.77%) tps: 1,580 tflops: 144.59 mfu: 46.34%
474
+ [titan] 2026-01-06 21:29:54,024 - root - INFO - lr: 3.3203e-05 gnorm: 10.75 [ 1:12:37<1 day, 19:03:38]
475
+ [titan] 2026-01-06 21:30:35,519 - root - INFO - step: 85 loss: 7.5241 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.50 mfu: 46.31%
476
+ [titan] 2026-01-06 21:30:35,520 - root - INFO - lr: 3.3594e-05 gnorm: 8.69 [ 1:13:19<1 day, 18:56:42]
477
+ [titan] 2026-01-06 21:31:17,030 - root - INFO - step: 86 loss: 7.5827 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.45 mfu: 46.30%
478
+ [titan] 2026-01-06 21:31:17,030 - root - INFO - lr: 3.3984e-05 gnorm: 7.22 [ 1:14:00<1 day, 18:49:54]
479
+ [titan] 2026-01-06 21:31:58,543 - root - INFO - step: 87 loss: 7.5505 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.44 mfu: 46.29%
480
+ [titan] 2026-01-06 21:31:58,543 - root - INFO - lr: 3.4375e-05 gnorm: 7.91 [ 1:14:42<1 day, 18:43:15]
481
+ [titan] 2026-01-06 21:32:40,071 - root - INFO - step: 88 loss: 7.5143 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.39 mfu: 46.28%
482
+ [titan] 2026-01-06 21:32:40,071 - root - INFO - lr: 3.4766e-05 gnorm: 8.00 [ 1:15:24<1 day, 18:36:44]
483
+ [titan] 2026-01-06 21:33:21,599 - root - INFO - step: 89 loss: 7.5199 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.39 mfu: 46.28%
484
+ [titan] 2026-01-06 21:33:21,599 - root - INFO - lr: 3.5156e-05 gnorm: 8.62 [ 1:16:05<1 day, 18:30:22]
485
+ [titan] 2026-01-06 21:34:03,122 - root - INFO - step: 90 loss: 7.4785 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.40 mfu: 46.28%
486
+ [titan] 2026-01-06 21:34:03,122 - root - INFO - lr: 3.5547e-05 gnorm: 8.12 [ 1:16:47<1 day, 18:24:07]
487
+ [titan] 2026-01-06 21:34:44,655 - root - INFO - step: 91 loss: 7.5003 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.37 mfu: 46.27%
488
+ [titan] 2026-01-06 21:34:44,655 - root - INFO - lr: 3.5937e-05 gnorm: 6.97 [ 1:17:28<1 day, 18:17:59]
489
+ [titan] 2026-01-06 21:35:26,183 - root - INFO - step: 92 loss: 7.5113 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.39 mfu: 46.28%
490
+ [titan] 2026-01-06 21:35:26,183 - root - INFO - lr: 3.6328e-05 gnorm: 10.19 [ 1:18:10<1 day, 18:11:58]
491
+ [titan] 2026-01-06 21:36:07,712 - root - INFO - step: 93 loss: 7.4875 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.38 mfu: 46.28%
492
+ [titan] 2026-01-06 21:36:07,712 - root - INFO - lr: 3.6719e-05 gnorm: 4.59 [ 1:18:51<1 day, 18:06:05]
493
+ [titan] 2026-01-06 21:36:49,202 - root - INFO - step: 94 loss: 7.8691 memory: 71.94GiB(90.77%) tps: 1,580 tflops: 144.52 mfu: 46.32%
494
+ [titan] 2026-01-06 21:36:49,202 - root - INFO - lr: 3.7109e-05 gnorm: 86.50 [ 1:19:33<1 day, 18:00:16]
495
+ [titan] 2026-01-06 21:37:30,710 - root - INFO - step: 95 loss: 7.7993 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.45 mfu: 46.30%
496
+ [titan] 2026-01-06 21:37:30,710 - root - INFO - lr: 3.7500e-05 gnorm: 62.50 [ 1:20:14<1 day, 17:54:35]
497
+ [titan] 2026-01-06 21:38:12,247 - root - INFO - step: 96 loss: 7.6230 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.35 mfu: 46.27%
498
+ [titan] 2026-01-06 21:38:12,248 - root - INFO - lr: 3.7891e-05 gnorm: 17.38 [ 1:20:56<1 day, 17:49:01]
499
+ [titan] 2026-01-06 21:38:12,248 - root - INFO - Saving the checkpoint (or staging if async is enabled).
500
+ [titan] 2026-01-06 21:38:32,933 - root - INFO - [GC] GC collection invoked by checkpointer. 0.19 seconds.
501
+ [titan] 2026-01-06 21:38:32,933 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 20.69 seconds.
502
+ [titan] 2026-01-06 21:39:14,269 - root - INFO - step: 97 loss: 7.5778 memory: 71.94GiB(90.77%) tps: 1,057 tflops: 96.68 mfu: 30.99%
503
+ [titan] 2026-01-06 21:39:14,269 - root - INFO - lr: 3.8281e-05 gnorm: 17.75 [ 1:21:58<1 day, 17:54:01]
504
+ [titan] 2026-01-06 21:39:55,690 - root - INFO - step: 98 loss: 7.5438 memory: 71.94GiB(90.77%) tps: 1,582 tflops: 144.76 mfu: 46.40%
505
+ [titan] 2026-01-06 21:39:55,690 - root - INFO - lr: 3.8672e-05 gnorm: 11.75 [ 1:22:39<1 day, 17:48:29]
506
+ [titan] 2026-01-06 21:40:37,179 - root - INFO - step: 99 loss: 7.5091 memory: 71.94GiB(90.77%) tps: 1,580 tflops: 144.52 mfu: 46.32%
507
+ [titan] 2026-01-06 21:40:37,180 - root - INFO - lr: 3.9063e-05 gnorm: 7.81 [ 1:23:21<1 day, 17:43:04]
508
+ [titan] 2026-01-06 21:40:37,205 - root - INFO - [GC] Peforming periodical GC collection. 0.02 seconds.
509
+ [titan] 2026-01-06 21:41:18,706 - root - INFO - step: 100 loss: 7.4961 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.39 mfu: 46.28%
510
+ [titan] 2026-01-06 21:41:18,706 - root - INFO - lr: 3.9453e-05 gnorm: 7.59 [ 1:24:02<1 day, 17:37:47]
511
+ [titan] 2026-01-06 21:42:00,228 - root - INFO - step: 101 loss: 7.4848 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.41 mfu: 46.28%
512
+ [titan] 2026-01-06 21:42:00,228 - root - INFO - lr: 3.9844e-05 gnorm: 5.97 [ 1:24:44<1 day, 17:32:34]
513
+ [titan] 2026-01-06 21:42:41,739 - root - INFO - step: 102 loss: 7.5118 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.45 mfu: 46.30%
514
+ [titan] 2026-01-06 21:42:41,739 - root - INFO - lr: 4.0234e-05 gnorm: 8.06 [ 1:25:25<1 day, 17:27:27]
515
+ [titan] 2026-01-06 21:43:23,265 - root - INFO - step: 103 loss: 7.4788 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.39 mfu: 46.28%
516
+ [titan] 2026-01-06 21:43:23,265 - root - INFO - lr: 4.0625e-05 gnorm: 10.06 [ 1:26:07<1 day, 17:22:25]
517
+ [titan] 2026-01-06 21:44:04,786 - root - INFO - step: 104 loss: 7.4560 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.41 mfu: 46.29%
518
+ [titan] 2026-01-06 21:44:04,786 - root - INFO - lr: 4.1016e-05 gnorm: 9.50 [ 1:26:48<1 day, 17:17:28]
519
+ [titan] 2026-01-06 21:44:46,319 - root - INFO - step: 105 loss: 7.4534 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.37 mfu: 46.27%
520
+ [titan] 2026-01-06 21:44:46,319 - root - INFO - lr: 4.1406e-05 gnorm: 8.44 [ 1:27:30<1 day, 17:12:36]
521
+ [titan] 2026-01-06 21:45:27,838 - root - INFO - step: 106 loss: 7.4770 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.42 mfu: 46.29%
522
+ [titan] 2026-01-06 21:45:27,838 - root - INFO - lr: 4.1797e-05 gnorm: 10.56 [ 1:28:11<1 day, 17:07:49]
523
+ [titan] 2026-01-06 21:46:09,374 - root - INFO - step: 107 loss: 7.4382 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.36 mfu: 46.27%
524
+ [titan] 2026-01-06 21:46:09,374 - root - INFO - lr: 4.2188e-05 gnorm: 13.69 [ 1:28:53<1 day, 17:03:07]
525
+ [titan] 2026-01-06 21:46:50,902 - root - INFO - step: 108 loss: 7.4561 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.39 mfu: 46.28%
526
+ [titan] 2026-01-06 21:46:50,902 - root - INFO - lr: 4.2578e-05 gnorm: 8.69 [ 1:29:34<1 day, 16:58:29]
527
+ [titan] 2026-01-06 21:47:32,443 - root - INFO - step: 109 loss: 7.3967 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.34 mfu: 46.26%
528
+ [titan] 2026-01-06 21:47:32,443 - root - INFO - lr: 4.2969e-05 gnorm: 7.31 [ 1:30:16<1 day, 16:53:55]
529
+ [titan] 2026-01-06 21:48:13,976 - root - INFO - step: 110 loss: 7.4334 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.37 mfu: 46.27%
530
+ [titan] 2026-01-06 21:48:13,976 - root - INFO - lr: 4.3359e-05 gnorm: 25.38 [ 1:30:57<1 day, 16:49:26]
531
+ [titan] 2026-01-06 21:48:55,511 - root - INFO - step: 111 loss: 7.4360 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.36 mfu: 46.27%
532
+ [titan] 2026-01-06 21:48:55,511 - root - INFO - lr: 4.3750e-05 gnorm: 10.44 [ 1:31:39<1 day, 16:45:01]
533
+ [titan] 2026-01-06 21:49:37,059 - root - INFO - step: 112 loss: 7.5123 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.32 mfu: 46.26%
534
+ [titan] 2026-01-06 21:49:37,059 - root - INFO - lr: 4.4141e-05 gnorm: 16.88 [ 1:32:20<1 day, 16:40:40]
535
+ [titan] 2026-01-06 21:49:37,059 - root - INFO - Saving the checkpoint (or staging if async is enabled).
536
+ [titan] 2026-01-06 21:49:59,574 - root - INFO - [GC] GC collection invoked by checkpointer. 0.14 seconds.
537
+ [titan] 2026-01-06 21:49:59,574 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 22.51 seconds.
538
+ [titan] 2026-01-06 21:50:40,891 - root - INFO - step: 113 loss: 7.4803 memory: 71.94GiB(90.77%) tps: 1,027 tflops: 93.93 mfu: 30.11%
539
+ [titan] 2026-01-06 21:50:40,892 - root - INFO - lr: 4.4531e-05 gnorm: 13.06 [ 1:33:24<1 day, 16:46:06]
540
+ [titan] 2026-01-06 21:51:22,305 - root - INFO - step: 114 loss: 7.4859 memory: 71.94GiB(90.77%) tps: 1,582 tflops: 144.79 mfu: 46.41%
541
+ [titan] 2026-01-06 21:51:22,305 - root - INFO - lr: 4.4922e-05 gnorm: 16.50 [ 1:34:06<1 day, 16:41:44]
542
+ [titan] 2026-01-06 21:52:03,747 - root - INFO - step: 115 loss: 7.4151 memory: 71.94GiB(90.77%) tps: 1,581 tflops: 144.68 mfu: 46.37%
543
+ [titan] 2026-01-06 21:52:03,748 - root - INFO - lr: 4.5313e-05 gnorm: 13.94 [ 1:34:47<1 day, 16:37:27]
544
+ [titan] 2026-01-06 21:52:45,252 - root - INFO - step: 116 loss: 7.3814 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.47 mfu: 46.30%
545
+ [titan] 2026-01-06 21:52:45,252 - root - INFO - lr: 4.5703e-05 gnorm: 11.69 [ 1:35:29<1 day, 16:33:15]
546
+ [titan] 2026-01-06 21:53:26,760 - root - INFO - step: 117 loss: 7.4033 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.46 mfu: 46.30%
547
+ [titan] 2026-01-06 21:53:26,760 - root - INFO - lr: 4.6094e-05 gnorm: 9.31 [ 1:36:10<1 day, 16:29:06]
548
+ [titan] 2026-01-06 21:54:08,279 - root - INFO - step: 118 loss: 7.4721 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.42 mfu: 46.29%
549
+ [titan] 2026-01-06 21:54:08,279 - root - INFO - lr: 4.6484e-05 gnorm: 20.88 [ 1:36:52<1 day, 16:25:02]
550
+ [titan] 2026-01-06 21:54:49,813 - root - INFO - step: 119 loss: 7.4258 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.37 mfu: 46.27%
551
+ [titan] 2026-01-06 21:54:49,813 - root - INFO - lr: 4.6875e-05 gnorm: 16.62 [ 1:37:33<1 day, 16:21:01]
552
+ [titan] 2026-01-06 21:55:31,360 - root - INFO - step: 120 loss: 7.3951 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.32 mfu: 46.26%
553
+ [titan] 2026-01-06 21:55:31,360 - root - INFO - lr: 4.7266e-05 gnorm: 11.38 [ 1:38:15<1 day, 16:17:03]
554
+ [titan] 2026-01-06 21:56:12,904 - root - INFO - step: 121 loss: 7.3984 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.33 mfu: 46.26%
555
+ [titan] 2026-01-06 21:56:12,904 - root - INFO - lr: 4.7656e-05 gnorm: 10.19 [ 1:38:56<1 day, 16:13:09]
556
+ [titan] 2026-01-06 21:56:54,444 - root - INFO - step: 122 loss: 7.5098 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.34 mfu: 46.26%
557
+ [titan] 2026-01-06 21:56:54,444 - root - INFO - lr: 4.8047e-05 gnorm: 19.38 [ 1:39:38<1 day, 16:09:18]
558
+ [titan] 2026-01-06 21:57:35,983 - root - INFO - step: 123 loss: 7.4071 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.35 mfu: 46.27%
559
+ [titan] 2026-01-06 21:57:35,983 - root - INFO - lr: 4.8438e-05 gnorm: 13.25 [ 1:40:19<1 day, 16:05:30]
560
+ [titan] 2026-01-06 21:58:17,525 - root - INFO - step: 124 loss: 7.4271 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.34 mfu: 46.26%
561
+ [titan] 2026-01-06 21:58:17,525 - root - INFO - lr: 4.8828e-05 gnorm: 11.88 [ 1:41:01<1 day, 16:01:45]
562
+ [titan] 2026-01-06 21:58:59,075 - root - INFO - step: 125 loss: 7.3603 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.31 mfu: 46.25%
563
+ [titan] 2026-01-06 21:58:59,075 - root - INFO - lr: 4.9219e-05 gnorm: 11.50 [ 1:41:42<1 day, 15:58:04]
564
+ [titan] 2026-01-06 21:59:40,618 - root - INFO - step: 126 loss: 7.3625 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.33 mfu: 46.26%
565
+ [titan] 2026-01-06 21:59:40,619 - root - INFO - lr: 4.9609e-05 gnorm: 9.88 [ 1:42:24<1 day, 15:54:25]
566
+ [titan] 2026-01-06 22:00:22,155 - root - INFO - step: 127 loss: 7.3691 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.36 mfu: 46.27%
567
+ [titan] 2026-01-06 22:00:22,156 - root - INFO - lr: 5.0000e-05 gnorm: 11.88 [ 1:43:06<1 day, 15:50:48]
568
+ [titan] 2026-01-06 22:01:03,694 - root - INFO - step: 128 loss: 7.3331 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.35 mfu: 46.27%
569
+ [titan] 2026-01-06 22:01:03,694 - root - INFO - lr: 5.0391e-05 gnorm: 11.56 [ 1:43:47<1 day, 15:47:15]
570
+ [titan] 2026-01-06 22:01:03,694 - root - INFO - Saving the checkpoint (or staging if async is enabled).
571
+ [titan] 2026-01-06 22:01:24,075 - root - INFO - [GC] GC collection invoked by checkpointer. 0.21 seconds.
572
+ [titan] 2026-01-06 22:01:24,076 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 20.38 seconds.
573
+ [titan] 2026-01-06 22:02:05,453 - root - INFO - step: 129 loss: 7.2878 memory: 71.94GiB(90.77%) tps: 1,061 tflops: 97.09 mfu: 31.12%
574
+ [titan] 2026-01-06 22:02:05,454 - root - INFO - lr: 5.0781e-05 gnorm: 6.16 [ 1:44:49<1 day, 15:51:25]
575
+ [titan] 2026-01-06 22:02:46,875 - root - INFO - step: 130 loss: 7.7017 memory: 71.94GiB(90.77%) tps: 1,582 tflops: 144.76 mfu: 46.40%
576
+ [titan] 2026-01-06 22:02:46,876 - root - INFO - lr: 5.1172e-05 gnorm: 70.00 [ 1:45:30<1 day, 15:47:50]
577
+ [titan] 2026-01-06 22:03:28,339 - root - INFO - step: 131 loss: 7.5220 memory: 71.94GiB(90.77%) tps: 1,581 tflops: 144.61 mfu: 46.35%
578
+ [titan] 2026-01-06 22:03:28,339 - root - INFO - lr: 5.1562e-05 gnorm: 44.75 [ 1:46:12<1 day, 15:44:19]
579
+ [titan] 2026-01-06 22:04:09,859 - root - INFO - step: 132 loss: 7.4566 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.42 mfu: 46.29%
580
+ [titan] 2026-01-06 22:04:09,859 - root - INFO - lr: 5.1953e-05 gnorm: 13.50 [ 1:46:53<1 day, 15:40:52]
581
+ [titan] 2026-01-06 22:04:51,387 - root - INFO - step: 133 loss: 7.4026 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.39 mfu: 46.28%
582
+ [titan] 2026-01-06 22:04:51,387 - root - INFO - lr: 5.2344e-05 gnorm: 10.12 [ 1:47:35<1 day, 15:37:27]
583
+ [titan] 2026-01-06 22:05:32,919 - root - INFO - step: 134 loss: 7.4092 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.37 mfu: 46.27%
584
+ [titan] 2026-01-06 22:05:32,920 - root - INFO - lr: 5.2734e-05 gnorm: 14.88 [ 1:48:16<1 day, 15:34:05]
585
+ [titan] 2026-01-06 22:06:14,471 - root - INFO - step: 135 loss: 7.3827 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.30 mfu: 46.25%
586
+ [titan] 2026-01-06 22:06:14,471 - root - INFO - lr: 5.3125e-05 gnorm: 18.88 [ 1:48:58<1 day, 15:30:46]
587
+ [titan] 2026-01-06 22:06:56,027 - root - INFO - step: 136 loss: 7.4021 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.29 mfu: 46.25%
588
+ [titan] 2026-01-06 22:06:56,027 - root - INFO - lr: 5.3516e-05 gnorm: 12.81 [ 1:49:39<1 day, 15:27:29]
589
+ [titan] 2026-01-06 22:07:37,581 - root - INFO - step: 137 loss: 7.4064 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.30 mfu: 46.25%
590
+ [titan] 2026-01-06 22:07:37,581 - root - INFO - lr: 5.3906e-05 gnorm: 7.19 [ 1:50:21<1 day, 15:24:14]
591
+ [titan] 2026-01-06 22:08:19,129 - root - INFO - step: 138 loss: 7.4774 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.32 mfu: 46.26%
592
+ [titan] 2026-01-06 22:08:19,129 - root - INFO - lr: 5.4297e-05 gnorm: 22.62 [ 1:51:03<1 day, 15:21:02]
593
+ [titan] 2026-01-06 22:09:00,688 - root - INFO - step: 139 loss: 7.4281 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.28 mfu: 46.24%
594
+ [titan] 2026-01-06 22:09:00,688 - root - INFO - lr: 5.4688e-05 gnorm: 11.00 [ 1:51:44<1 day, 15:17:51]
595
+ [titan] 2026-01-06 22:09:42,228 - root - INFO - step: 140 loss: 7.5633 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.34 mfu: 46.26%
596
+ [titan] 2026-01-06 22:09:42,228 - root - INFO - lr: 5.5078e-05 gnorm: 19.75 [ 1:52:26<1 day, 15:14:43]
597
+ [titan] 2026-01-06 22:10:23,790 - root - INFO - step: 141 loss: 7.5423 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.27 mfu: 46.24%
598
+ [titan] 2026-01-06 22:10:23,790 - root - INFO - lr: 5.5469e-05 gnorm: 17.25 [ 1:53:07<1 day, 15:11:37]
599
+ [titan] 2026-01-06 22:11:05,349 - root - INFO - step: 142 loss: 7.4047 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.28 mfu: 46.24%
600
+ [titan] 2026-01-06 22:11:05,350 - root - INFO - lr: 5.5859e-05 gnorm: 9.94 [ 1:53:49<1 day, 15:08:33]
601
+ [titan] 2026-01-06 22:11:46,904 - root - INFO - step: 143 loss: 7.5261 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.29 mfu: 46.25%
602
+ [titan] 2026-01-06 22:11:46,905 - root - INFO - lr: 5.6250e-05 gnorm: 25.75 [ 1:54:30<1 day, 15:05:31]
603
+ [titan] 2026-01-06 22:12:28,460 - root - INFO - step: 144 loss: 7.4217 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.29 mfu: 46.25%
604
+ [titan] 2026-01-06 22:12:28,461 - root - INFO - lr: 5.6641e-05 gnorm: 18.00 [ 1:55:12<1 day, 15:02:31]
605
+ [titan] 2026-01-06 22:12:28,461 - root - INFO - Saving the checkpoint (or staging if async is enabled).
606
+ [titan] 2026-01-06 22:12:50,173 - root - INFO - [GC] GC collection invoked by checkpointer. 0.19 seconds.
607
+ [titan] 2026-01-06 22:12:50,174 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 21.71 seconds.
608
+ [titan] 2026-01-06 22:13:31,510 - root - INFO - step: 145 loss: 7.3958 memory: 71.94GiB(90.77%) tps: 1,039 tflops: 95.10 mfu: 30.48%
609
+ [titan] 2026-01-06 22:13:31,510 - root - INFO - lr: 5.7031e-05 gnorm: 11.69 [ 1:56:15<1 day, 15:06:47]
610
+ [titan] 2026-01-06 22:14:12,944 - root - INFO - step: 146 loss: 7.4073 memory: 71.94GiB(90.77%) tps: 1,582 tflops: 144.71 mfu: 46.38%
611
+ [titan] 2026-01-06 22:14:12,944 - root - INFO - lr: 5.7422e-05 gnorm: 11.25 [ 1:56:56<1 day, 15:03:45]
612
+ [titan] 2026-01-06 22:14:54,370 - root - INFO - step: 147 loss: 7.3301 memory: 71.94GiB(90.77%) tps: 1,582 tflops: 144.74 mfu: 46.39%
613
+ [titan] 2026-01-06 22:14:54,371 - root - INFO - lr: 5.7813e-05 gnorm: 7.34 [ 1:57:38<1 day, 15:00:45]
614
+ [titan] 2026-01-06 22:15:35,825 - root - INFO - step: 148 loss: 7.3624 memory: 71.94GiB(90.77%) tps: 1,581 tflops: 144.64 mfu: 46.36%
615
+ [titan] 2026-01-06 22:15:35,825 - root - INFO - lr: 5.8203e-05 gnorm: 17.38 [ 1:58:19<1 day, 14:57:47]
616
+ [titan] 2026-01-06 22:16:17,356 - root - INFO - step: 149 loss: 7.2913 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.37 mfu: 46.27%
617
+ [titan] 2026-01-06 22:16:17,357 - root - INFO - lr: 5.8594e-05 gnorm: 3.80 [ 1:59:01<1 day, 14:54:53]
618
+ [titan] 2026-01-06 22:16:17,394 - root - INFO - [GC] Peforming periodical GC collection. 0.04 seconds.
619
+ [titan] 2026-01-06 22:16:58,923 - root - INFO - step: 150 loss: 7.3146 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.25 mfu: 46.23%
620
+ [titan] 2026-01-06 22:16:58,923 - root - INFO - lr: 5.8984e-05 gnorm: 7.06 [ 1:59:42<1 day, 14:52:01]
logs/none_4cvjdbqa/attempt_0/3/stdout.log ADDED
File without changes
logs/none_4cvjdbqa/attempt_0/4/stderr.log ADDED
@@ -0,0 +1,620 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [titan] 2026-01-06 20:23:28,613 - root - INFO - Starting job: default job
2
+ [titan] 2026-01-06 20:23:28,613 - root - INFO - {
3
+ "activation_checkpoint": {
4
+ "mode": "none",
5
+ "selective_ac_option": "2"
6
+ },
7
+ "activation_offload": {
8
+ "mode": "none"
9
+ },
10
+ "checkpoint": {
11
+ "async_mode": "disabled",
12
+ "create_seed_checkpoint": false,
13
+ "enable_checkpoint": true,
14
+ "exclude_from_loading": [],
15
+ "export_dtype": "float32",
16
+ "folder": "checkpoint",
17
+ "initial_load_model_weights_only": true,
18
+ "initial_load_path": null,
19
+ "interval": 16,
20
+ "interval_type": "steps",
21
+ "keep_latest_k": 0,
22
+ "last_save_model_weights_only": false,
23
+ "load_step": -1,
24
+ "model_weights_only": false
25
+ },
26
+ "comm": {
27
+ "init_timeout_seconds": 300,
28
+ "trace_buf_size": 20000,
29
+ "train_timeout_seconds": 100
30
+ },
31
+ "experimental": {
32
+ "context_parallel_degree": 1,
33
+ "context_parallel_rotate_method": "allgather",
34
+ "custom_model_path": "",
35
+ "enable_async_tensor_parallel": false,
36
+ "enable_compiled_autograd": false,
37
+ "pipeline_parallel_degree": 1,
38
+ "pipeline_parallel_microbatches": null,
39
+ "pipeline_parallel_schedule": "1F1B",
40
+ "pipeline_parallel_schedule_csv": "",
41
+ "pipeline_parallel_split_points": []
42
+ },
43
+ "fault_tolerance": {
44
+ "enable": false,
45
+ "group_size": 0,
46
+ "min_replica_size": 1,
47
+ "replica_id": 0
48
+ },
49
+ "float8": {
50
+ "enable_fsdp_float8_all_gather": false,
51
+ "force_recompute_fp8_weight_in_bwd": false,
52
+ "precompute_float8_dynamic_scale_for_fsdp": false,
53
+ "recipe_name": null
54
+ },
55
+ "job": {
56
+ "config_file": "flame/models/fla.toml",
57
+ "description": "default job",
58
+ "dump_folder": "/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B",
59
+ "print_args": true,
60
+ "use_for_integration_test": false
61
+ },
62
+ "lr_scheduler": {
63
+ "decay_ratio": null,
64
+ "decay_type": "cosine",
65
+ "lr_min": 0.1,
66
+ "warmup_steps": 1024
67
+ },
68
+ "memory_estimation": {
69
+ "disable_fake_mode": false,
70
+ "enabled": false
71
+ },
72
+ "metrics": {
73
+ "disable_color_printing": false,
74
+ "enable_tensorboard": false,
75
+ "enable_wandb": true,
76
+ "log_freq": 1,
77
+ "save_for_all_ranks": false,
78
+ "save_tb_folder": "tb"
79
+ },
80
+ "model": {
81
+ "config": "/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/config.json",
82
+ "converters": [],
83
+ "name": "fla",
84
+ "print_after_conversion": false,
85
+ "tokenizer_path": "/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B"
86
+ },
87
+ "optimizer": {
88
+ "beta1": 0.9,
89
+ "beta2": 0.95,
90
+ "early_step_in_backward": false,
91
+ "eps": 1e-15,
92
+ "implementation": "fused",
93
+ "lr": 0.0004,
94
+ "name": "AdamW",
95
+ "weight_decay": 0.1
96
+ },
97
+ "profiling": {
98
+ "enable_memory_snapshot": false,
99
+ "enable_profiling": true,
100
+ "profile_freq": 512,
101
+ "save_memory_snapshot_folder": "memory_snapshot",
102
+ "save_traces_folder": "profile_trace"
103
+ },
104
+ "training": {
105
+ "batch_size": 2,
106
+ "compile": true,
107
+ "context_len": 2048,
108
+ "data_dir": null,
109
+ "data_files": null,
110
+ "data_parallel_replicate_degree": 1,
111
+ "data_parallel_shard_degree": 8,
112
+ "data_probs": null,
113
+ "dataset": "/mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu",
114
+ "dataset_name": "default",
115
+ "dataset_split": "train",
116
+ "deterministic": false,
117
+ "disable_loss_parallel": true,
118
+ "enable_cpu_offload": false,
119
+ "fsdp_reshard_after_forward": "default",
120
+ "gc_freq": 50,
121
+ "gradient_accumulation_steps": 16,
122
+ "max_norm": 1.0,
123
+ "mixed_precision_param": "bfloat16",
124
+ "mixed_precision_reduce": "float32",
125
+ "num_workers": 8,
126
+ "persistent_workers": false,
127
+ "pin_memory": false,
128
+ "prefetch_factor": 2,
129
+ "seed": 42,
130
+ "seq_len": 2048,
131
+ "skip_nan_inf": true,
132
+ "steps": 3072,
133
+ "streaming": true,
134
+ "tensor_parallel_degree": 1,
135
+ "varlen": false
136
+ }
137
+ }
138
+ [titan] 2026-01-06 20:23:28,614 - root - INFO - [GC] Initial GC collection. 0.00 seconds.
139
+ [titan] 2026-01-06 20:23:29,971 - root - WARNING - ENV[TORCH_NCCL_ASYNC_ERROR_HANDLING] = 1 will be overridden to 3 based on job config
140
+ [titan] 2026-01-06 20:23:29,975 - root - INFO - CUDA capacity: NVIDIA A100-SXM4-80GB with 79.25GiB memory
141
+ [titan] 2026-01-06 20:23:29,977 - root - WARNING - Error running lspci: [Errno 2] No such file or directory: 'lspci', fallback to use device_name
142
+ [titan] 2026-01-06 20:23:29,977 - root - INFO - Peak FLOPS used for computing MFU: 3.120e+14
143
+ [titan] 2026-01-06 20:23:29,977 - root - INFO - Building 1-D device mesh with ['dp_shard'], [8]
144
+ [titan] 2026-01-06 20:23:30,052 - root - INFO - Loading tokenizer...
145
+ The tokenizer you are loading from '/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.
146
+ [titan] 2026-01-06 20:23:30,412 - root - INFO - Qwen2TokenizerFast(name_or_path='/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B', vocab_size=151643, model_max_length=10000000000, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'eos_token': '<|im_end|>', 'pad_token': '<|endoftext|>', 'additional_special_tokens': ['<|im_start|>', '<|im_end|>', '<|object_ref_start|>', '<|object_ref_end|>', '<|box_start|>', '<|box_end|>', '<|quad_start|>', '<|quad_end|>', '<|vision_start|>', '<|vision_end|>', '<|vision_pad|>', '<|image_pad|>', '<|video_pad|>']}, clean_up_tokenization_spaces=False, added_tokens_decoder={
147
+ 151643: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
148
+ 151644: AddedToken("<|im_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
149
+ 151645: AddedToken("<|im_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
150
+ 151646: AddedToken("<|object_ref_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
151
+ 151647: AddedToken("<|object_ref_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
152
+ 151648: AddedToken("<|box_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
153
+ 151649: AddedToken("<|box_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
154
+ 151650: AddedToken("<|quad_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
155
+ 151651: AddedToken("<|quad_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
156
+ 151652: AddedToken("<|vision_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
157
+ 151653: AddedToken("<|vision_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
158
+ 151654: AddedToken("<|vision_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
159
+ 151655: AddedToken("<|image_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
160
+ 151656: AddedToken("<|video_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
161
+ 151657: AddedToken("<tool_call>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
162
+ 151658: AddedToken("</tool_call>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
163
+ 151659: AddedToken("<|fim_prefix|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
164
+ 151660: AddedToken("<|fim_middle|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
165
+ 151661: AddedToken("<|fim_suffix|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
166
+ 151662: AddedToken("<|fim_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
167
+ 151663: AddedToken("<|repo_name|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
168
+ 151664: AddedToken("<|file_sep|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
169
+ 151665: AddedToken("<tool_response>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
170
+ 151666: AddedToken("</tool_response>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
171
+ 151667: AddedToken("<think>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
172
+ 151668: AddedToken("</think>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
173
+ }
174
+ )
175
+ [titan] 2026-01-06 20:23:30,412 - root - INFO - Loading dataset /mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu:default
176
+ `trust_remote_code` is not supported anymore.
177
+ Please check that the Hugging Face dataset '/mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu' isn't based on a loading script and remove `trust_remote_code`.
178
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
179
+ [titan] 2026-01-06 20:23:30,412 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
180
+ Please check that the Hugging Face dataset '/mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu' isn't based on a loading script and remove `trust_remote_code`.
181
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
182
+ [titan] 2026-01-06 20:23:30,977 - root - INFO - Shuffling the dataset with seed 42
183
+ [titan] 2026-01-06 20:23:30,978 - root - INFO - IterableDataset({
184
+ features: ['text', 'id', 'dump', 'url', 'file_path', 'language', 'language_score', 'token_count', 'score', 'int_score'],
185
+ num_shards: 360
186
+ })
187
+ [titan] 2026-01-06 20:23:30,978 - root - INFO - Building dataloader...
188
+ [titan] 2026-01-06 20:23:30,980 - root - INFO - Loading model config from /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/config.json
189
+ [titan] 2026-01-06 20:23:30,981 - root - INFO - Building model from the config
190
+ GSAConfig {
191
+ "architectures": [
192
+ "GSAForCausalLM"
193
+ ],
194
+ "attn": null,
195
+ "bos_token_id": 151643,
196
+ "clamp_max": null,
197
+ "clamp_min": null,
198
+ "conv_size": 4,
199
+ "dtype": "bfloat16",
200
+ "elementwise_affine": false,
201
+ "eos_token_id": 151645,
202
+ "expand_k": 1,
203
+ "expand_v": 1,
204
+ "feature_map": "swish",
205
+ "fuse_cross_entropy": true,
206
+ "fuse_linear_cross_entropy": false,
207
+ "fuse_norm": true,
208
+ "fuse_swiglu": true,
209
+ "gate_logit_normalizer": 8,
210
+ "hidden_act": "swish",
211
+ "hidden_ratio": 4,
212
+ "hidden_size": 5120,
213
+ "initializer_range": 0.02,
214
+ "intermediate_size": 17408,
215
+ "max_position_embeddings": 40960,
216
+ "model_type": "gsa",
217
+ "norm_eps": 1e-06,
218
+ "num_heads": 40,
219
+ "num_hidden_layers": 40,
220
+ "num_kv_heads": 8,
221
+ "num_slots": 256,
222
+ "rope_theta": 1000000,
223
+ "share_conv_kernel": true,
224
+ "tie_word_embeddings": true,
225
+ "transformers_version": "4.57.3",
226
+ "use_cache": true,
227
+ "use_l2warp": false,
228
+ "use_norm": true,
229
+ "use_output_gate": true,
230
+ "use_rope": false,
231
+ "use_short_conv": false,
232
+ "vocab_size": 151936
233
+ }
234
+ 
235
+ [titan] 2026-01-06 20:23:31,129 - root - INFO - 
236
+ GSAForCausalLM(
237
+ (model): GSAModel(
238
+ (embeddings): Embedding(151936, 5120)
239
+ (layers): ModuleList(
240
+ (0-39): 40 x GSABlock(
241
+ (attn_norm): RMSNorm(5120, eps=1e-06)
242
+ (attn): GatedSlotAttention(
243
+ (feature_map): SwishFeatureMap()
244
+ (q_proj): Linear(in_features=5120, out_features=5120, bias=False)
245
+ (k_proj): Linear(in_features=5120, out_features=1024, bias=False)
246
+ (v_proj): Linear(in_features=5120, out_features=1024, bias=False)
247
+ (f_proj): Linear(in_features=5120, out_features=2048, bias=False)
248
+ (g_norm): RMSNorm(5120, elementwise_affine=False, eps=1e-06)
249
+ (o_proj): Linear(in_features=5120, out_features=5120, bias=False)
250
+ )
251
+ (mlp_norm): RMSNorm(5120, eps=1e-06)
252
+ (mlp): GatedMLP(
253
+ (gate_proj): Linear(in_features=5120, out_features=17408, bias=False)
254
+ (up_proj): Linear(in_features=5120, out_features=17408, bias=False)
255
+ (down_proj): Linear(in_features=17408, out_features=5120, bias=False)
256
+ (swiglu_linear): SwiGLULinear()
257
+ )
258
+ )
259
+ )
260
+ (norm): RMSNorm(5120, eps=1e-06)
261
+ )
262
+ (lm_head): Linear(in_features=5120, out_features=151936, bias=False)
263
+ )
264
+
265
+ [titan] 2026-01-06 20:23:31,184 - root - INFO - Compiling each block with torch.compile
266
+ [titan] 2026-01-06 20:23:31,184 - root - INFO - Compiling the embedding, norm, and lm_head layers with torch.compile
267
+ [titan] 2026-01-06 20:23:31,185 - root - INFO - Compiling the entire model with torch.compile
268
+ [titan] 2026-01-06 20:23:31,335 - root - INFO - Applied FSDP to the model
269
+ [titan] 2026-01-06 20:23:31,717 - root - INFO - CUDA memory usage for model: 3.56GiB(4.49%)
270
+ [titan] 2026-01-06 20:23:31,764 - root - INFO - Checkpointing active. Checkpoints will be loaded from and saved to /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/checkpoint
271
+ [titan] 2026-01-06 20:23:31,764 - root - INFO - Loading the checkpoint from /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/checkpoint/step-1.
272
+ [titan] 2026-01-06 20:24:20,451 - root - INFO - [GC] GC collection for checkpoint loading. 0.03 seconds.
273
+ [titan] 2026-01-06 20:24:20,451 - root - INFO - Finished loading the checkpoint in 48.69 seconds.
274
+ [titan] 2026-01-06 20:24:20,662 - root - INFO - CUDA capacity: NVIDIA A100-SXM4-80GB with 79.25GiB memory
275
+ [titan] 2026-01-06 20:24:20,664 - root - WARNING - Error running lspci: [Errno 2] No such file or directory: 'lspci', fallback to use device_name
276
+ [titan] 2026-01-06 20:24:23,325 - root - INFO - Mixed precision training is handled by fully_shard
277
+ [titan] 2026-01-06 20:24:23,325 - root - INFO - ***** Running training *****
278
+ [titan] 2026-01-06 20:24:23,325 - root - INFO -  Training starts at step 2
279
+ [titan] 2026-01-06 20:24:23,325 - root - INFO -  Number of tokens per sequence = 2,048
280
+ [titan] 2026-01-06 20:24:23,325 - root - INFO -  Gradient Accumulation steps = 16
281
+ [titan] 2026-01-06 20:24:23,325 - root - INFO -  Instantaneous batch size (per device) = 2
282
+ [titan] 2026-01-06 20:24:23,325 - root - INFO -  Global batch size (w. parallel, distributed & accumulation) = 256 (524,288 tokens)
283
+ [titan] 2026-01-06 20:24:23,325 - root - INFO -  Total optimization steps = 3,072 (1,610,612,736 tokens)
284
+ [titan] 2026-01-06 20:24:23,325 - root - INFO -  Warmup steps = 1,024 (536,870,912 tokens)
285
+ [titan] 2026-01-06 20:24:23,326 - root - INFO -  Number of parameters = 14,409,815,040 
286
+ [titan] 2026-01-06 20:24:23,326 - root - INFO - Profiling active. Traces will be saved at /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/profile_trace
287
+ /mnt/scratch/home/carlos/leman-labs/flame/.venv/lib/python3.13/site-packages/torch/_dynamo/variables/functions.py:1692: UserWarning: Dynamo detected a call to a `functools.lru_cache`-wrapped function. Dynamo ignores the cache wrapper and directly traces the wrapped function. Silent incorrectness is only a *potential* risk, not something we have observed. Enable TORCH_LOGS="+dynamo" for a DEBUG stack trace.
288
+ torch._dynamo.utils.warn_once(msg)
289
+ /mnt/scratch/home/carlos/leman-labs/flame/.venv/lib/python3.13/site-packages/torch/_dynamo/variables/functions.py:1598: UserWarning: Dynamo does not know how to trace the builtin `cuda_utils.get_device_properties.` This function is either a Python builtin (e.g. _warnings.warn) or a third-party C/C++ Python extension (perhaps created with pybind).
290
+ If it is a Python builtin, please file an issue on GitHub so the PyTorch team can add support for it and see the next case for a workaround.
291
+ If it is a third-party C/C++ Python extension, please either wrap it into a PyTorch-understood custom operator (see https://pytorch.org/tutorials/advanced/custom_ops_landing_page.html for more details) or, if it is traceable, use `torch.compiler.allow_in_graph`.
292
+ torch._dynamo.utils.warn_once(explanation + "\n" + "\n".join(hints))
293
+ [titan] 2026-01-06 20:31:17,558 - root - INFO - step: 2 loss: 14.3989 memory: 71.94GiB(90.77%) tps: 157 tflops: 14.38 mfu: 4.61%
294
+ [titan] 2026-01-06 20:31:17,558 - root - INFO - lr: 1.1719e-06 gnorm: 127.00 [ 0:14:01<14 days, 22:49:30]
295
+ [titan] 2026-01-06 20:31:58,854 - root - INFO - step: 3 loss: 14.3925 memory: 71.94GiB(90.77%) tps: 1,587 tflops: 145.20 mfu: 46.54%
296
+ [titan] 2026-01-06 20:31:58,854 - root - INFO - lr: 1.5625e-06 gnorm: 126.00 [ 0:14:42<10 days, 10:52:24]
297
+ [titan] 2026-01-06 20:32:40,204 - root - INFO - step: 4 loss: 14.2932 memory: 71.94GiB(90.77%) tps: 1,585 tflops: 145.01 mfu: 46.48%
298
+ [titan] 2026-01-06 20:32:40,205 - root - INFO - lr: 1.9531e-06 gnorm: 125.50 [ 0:15:24<8 days, 4:54:13]
299
+ [titan] 2026-01-06 20:33:21,589 - root - INFO - step: 5 loss: 14.2679 memory: 71.94GiB(90.77%) tps: 1,584 tflops: 144.89 mfu: 46.44%
300
+ [titan] 2026-01-06 20:33:21,589 - root - INFO - lr: 2.3438e-06 gnorm: 123.50 [ 0:16:05<6 days, 20:31:23]
301
+ [titan] 2026-01-06 20:34:03,035 - root - INFO - step: 6 loss: 13.9921 memory: 71.94GiB(90.77%) tps: 1,581 tflops: 144.67 mfu: 46.37%
302
+ [titan] 2026-01-06 20:34:03,035 - root - INFO - lr: 2.7344e-06 gnorm: 117.50 [ 0:16:47<5 days, 22:56:26]
303
+ [titan] 2026-01-06 20:34:44,524 - root - INFO - step: 7 loss: 13.8102 memory: 71.94GiB(90.77%) tps: 1,580 tflops: 144.52 mfu: 46.32%
304
+ [titan] 2026-01-06 20:34:44,524 - root - INFO - lr: 3.1250e-06 gnorm: 112.50 [ 0:17:28<5 days, 7:31:36]
305
+ [titan] 2026-01-06 20:35:25,989 - root - INFO - step: 8 loss: 13.5609 memory: 71.94GiB(90.77%) tps: 1,581 tflops: 144.60 mfu: 46.35%
306
+ [titan] 2026-01-06 20:35:25,990 - root - INFO - lr: 3.5156e-06 gnorm: 106.50 [ 0:18:09<4 days, 19:57:39]
307
+ [titan] 2026-01-06 20:36:07,480 - root - INFO - step: 9 loss: 13.3683 memory: 71.94GiB(90.77%) tps: 1,580 tflops: 144.52 mfu: 46.32%
308
+ [titan] 2026-01-06 20:36:07,480 - root - INFO - lr: 3.9063e-06 gnorm: 101.00 [ 0:18:51<4 days, 10:57:54]
309
+ [titan] 2026-01-06 20:36:48,975 - root - INFO - step: 10 loss: 13.1018 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.50 mfu: 46.32%
310
+ [titan] 2026-01-06 20:36:48,975 - root - INFO - lr: 4.2969e-06 gnorm: 94.00 [ 0:19:32<4 days, 3:45:59]
311
+ [titan] 2026-01-06 20:37:30,471 - root - INFO - step: 11 loss: 12.5407 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.50 mfu: 46.31%
312
+ [titan] 2026-01-06 20:37:30,471 - root - INFO - lr: 4.6875e-06 gnorm: 82.00 [ 0:20:14<3 days, 21:52:29]
313
+ [titan] 2026-01-06 20:38:11,960 - root - INFO - step: 12 loss: 12.0106 memory: 71.94GiB(90.77%) tps: 1,580 tflops: 144.52 mfu: 46.32%
314
+ [titan] 2026-01-06 20:38:11,961 - root - INFO - lr: 5.0781e-06 gnorm: 71.50 [ 0:20:55<3 days, 16:57:45]
315
+ [titan] 2026-01-06 20:38:53,462 - root - INFO - step: 13 loss: 11.5957 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.48 mfu: 46.31%
316
+ [titan] 2026-01-06 20:38:53,463 - root - INFO - lr: 5.4687e-06 gnorm: 68.00 [ 0:21:37<3 days, 12:48:18]
317
+ [titan] 2026-01-06 20:39:34,955 - root - INFO - step: 14 loss: 11.2380 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.51 mfu: 46.32%
318
+ [titan] 2026-01-06 20:39:34,955 - root - INFO - lr: 5.8594e-06 gnorm: 63.25 [ 0:22:18<3 days, 9:14:21]
319
+ [titan] 2026-01-06 20:40:16,456 - root - INFO - step: 15 loss: 10.9153 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.48 mfu: 46.31%
320
+ [titan] 2026-01-06 20:40:16,457 - root - INFO - lr: 6.2500e-06 gnorm: 55.50 [ 0:23:00<3 days, 6:08:53]
321
+ [titan] 2026-01-06 20:40:57,973 - root - INFO - step: 16 loss: 10.6864 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.42 mfu: 46.29%
322
+ [titan] 2026-01-06 20:40:57,974 - root - INFO - lr: 6.6406e-06 gnorm: 57.00 [ 0:23:41<3 days, 3:26:33]
323
+ [titan] 2026-01-06 20:40:57,974 - root - INFO - Saving the checkpoint (or staging if async is enabled).
324
+ [titan] 2026-01-06 20:41:27,081 - root - INFO - [GC] GC collection invoked by checkpointer. 0.64 seconds.
325
+ [titan] 2026-01-06 20:41:27,081 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 29.11 seconds.
326
+ [titan] 2026-01-06 20:42:08,985 - root - INFO - step: 17 loss: 10.3828 memory: 71.94GiB(90.77%) tps: 923 tflops: 84.44 mfu: 27.06%
327
+ [titan] 2026-01-06 20:42:08,986 - root - INFO - lr: 7.0313e-06 gnorm: 42.50 [ 0:24:52<3 days, 2:31:34]
328
+ [titan] 2026-01-06 20:42:50,422 - root - INFO - step: 18 loss: 10.1659 memory: 71.94GiB(90.77%) tps: 1,582 tflops: 144.71 mfu: 46.38%
329
+ [titan] 2026-01-06 20:42:50,422 - root - INFO - lr: 7.4219e-06 gnorm: 32.50 [ 0:25:34<3 days, 0:18:56]
330
+ [titan] 2026-01-06 20:43:31,924 - root - INFO - step: 19 loss: 9.9749 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.48 mfu: 46.31%
331
+ [titan] 2026-01-06 20:43:31,925 - root - INFO - lr: 7.8125e-06 gnorm: 26.88 [ 0:26:15<2 days, 22:20:22]
332
+ [titan] 2026-01-06 20:44:13,451 - root - INFO - step: 20 loss: 9.8084 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.39 mfu: 46.28%
333
+ [titan] 2026-01-06 20:44:13,451 - root - INFO - lr: 8.2031e-06 gnorm: 25.62 [ 0:26:57<2 days, 20:33:39]
334
+ [titan] 2026-01-06 20:44:54,968 - root - INFO - step: 21 loss: 9.6201 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.43 mfu: 46.29%
335
+ [titan] 2026-01-06 20:44:54,968 - root - INFO - lr: 8.5938e-06 gnorm: 26.88 [ 0:27:38<2 days, 18:57:01]
336
+ [titan] 2026-01-06 20:45:36,491 - root - INFO - step: 22 loss: 9.4905 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.40 mfu: 46.28%
337
+ [titan] 2026-01-06 20:45:36,491 - root - INFO - lr: 8.9844e-06 gnorm: 25.50 [ 0:28:20<2 days, 17:29:06]
338
+ [titan] 2026-01-06 20:46:18,035 - root - INFO - step: 23 loss: 9.2526 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.33 mfu: 46.26%
339
+ [titan] 2026-01-06 20:46:18,035 - root - INFO - lr: 9.3750e-06 gnorm: 19.12 [ 0:29:02<2 days, 16:08:50]
340
+ [titan] 2026-01-06 20:46:59,563 - root - INFO - step: 24 loss: 9.0528 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.39 mfu: 46.28%
341
+ [titan] 2026-01-06 20:46:59,563 - root - INFO - lr: 9.7656e-06 gnorm: 17.00 [ 0:29:43<2 days, 14:55:09]
342
+ [titan] 2026-01-06 20:47:41,099 - root - INFO - step: 25 loss: 8.8601 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.36 mfu: 46.27%
343
+ [titan] 2026-01-06 20:47:41,099 - root - INFO - lr: 1.0156e-05 gnorm: 14.06 [ 0:30:25<2 days, 13:47:20]
344
+ [titan] 2026-01-06 20:48:22,630 - root - INFO - step: 26 loss: 8.7360 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.38 mfu: 46.27%
345
+ [titan] 2026-01-06 20:48:22,630 - root - INFO - lr: 1.0547e-05 gnorm: 15.44 [ 0:31:06<2 days, 12:44:40]
346
+ [titan] 2026-01-06 20:49:04,178 - root - INFO - step: 27 loss: 8.6182 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.31 mfu: 46.25%
347
+ [titan] 2026-01-06 20:49:04,179 - root - INFO - lr: 1.0937e-05 gnorm: 10.25 [ 0:31:48<2 days, 11:46:37]
348
+ [titan] 2026-01-06 20:49:45,725 - root - INFO - step: 28 loss: 8.5142 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.32 mfu: 46.26%
349
+ [titan] 2026-01-06 20:49:45,725 - root - INFO - lr: 1.1328e-05 gnorm: 9.00 [ 0:32:29<2 days, 10:52:40]
350
+ [titan] 2026-01-06 20:50:27,274 - root - INFO - step: 29 loss: 8.4770 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.31 mfu: 46.25%
351
+ [titan] 2026-01-06 20:50:27,274 - root - INFO - lr: 1.1719e-05 gnorm: 9.44 [ 0:33:11<2 days, 10:02:23]
352
+ [titan] 2026-01-06 20:51:08,813 - root - INFO - step: 30 loss: 8.3888 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.35 mfu: 46.27%
353
+ [titan] 2026-01-06 20:51:08,813 - root - INFO - lr: 1.2109e-05 gnorm: 7.06 [ 0:33:52<2 days, 9:15:24]
354
+ [titan] 2026-01-06 20:51:50,370 - root - INFO - step: 31 loss: 8.3098 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.29 mfu: 46.25%
355
+ [titan] 2026-01-06 20:51:50,370 - root - INFO - lr: 1.2500e-05 gnorm: 5.38 [ 0:34:34<2 days, 8:31:26]
356
+ [titan] 2026-01-06 20:52:31,910 - root - INFO - step: 32 loss: 8.2507 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.35 mfu: 46.26%
357
+ [titan] 2026-01-06 20:52:31,910 - root - INFO - lr: 1.2891e-05 gnorm: 6.97 [ 0:35:15<2 days, 7:50:08]
358
+ [titan] 2026-01-06 20:52:31,910 - root - INFO - Saving the checkpoint (or staging if async is enabled).
359
+ [titan] 2026-01-06 20:52:52,195 - root - INFO - [GC] GC collection invoked by checkpointer. 0.20 seconds.
360
+ [titan] 2026-01-06 20:52:52,196 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 20.29 seconds.
361
+ [titan] 2026-01-06 20:53:33,590 - root - INFO - step: 33 loss: 8.1782 memory: 71.94GiB(90.77%) tps: 1,063 tflops: 97.21 mfu: 31.16%
362
+ [titan] 2026-01-06 20:53:33,590 - root - INFO - lr: 1.3281e-05 gnorm: 4.94 [ 0:36:17<2 days, 7:42:13]
363
+ [titan] 2026-01-06 20:54:15,059 - root - INFO - step: 34 loss: 8.1399 memory: 71.94GiB(90.77%) tps: 1,580 tflops: 144.59 mfu: 46.34%
364
+ [titan] 2026-01-06 20:54:15,059 - root - INFO - lr: 1.3672e-05 gnorm: 4.62 [ 0:36:59<2 days, 7:04:37]
365
+ [titan] 2026-01-06 20:54:56,546 - root - INFO - step: 35 loss: 8.1046 memory: 71.94GiB(90.77%) tps: 1,580 tflops: 144.53 mfu: 46.32%
366
+ [titan] 2026-01-06 20:54:56,546 - root - INFO - lr: 1.4063e-05 gnorm: 4.69 [ 0:37:40<2 days, 6:29:08]
367
+ [titan] 2026-01-06 20:55:38,070 - root - INFO - step: 36 loss: 8.0122 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.40 mfu: 46.28%
368
+ [titan] 2026-01-06 20:55:38,070 - root - INFO - lr: 1.4453e-05 gnorm: 2.75 [ 0:38:22<2 days, 5:55:38]
369
+ [titan] 2026-01-06 20:56:19,603 - root - INFO - step: 37 loss: 8.0874 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.37 mfu: 46.27%
370
+ [titan] 2026-01-06 20:56:19,603 - root - INFO - lr: 1.4844e-05 gnorm: 4.84 [ 0:39:03<2 days, 5:23:56]
371
+ [titan] 2026-01-06 20:57:01,137 - root - INFO - step: 38 loss: 8.0173 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.37 mfu: 46.27%
372
+ [titan] 2026-01-06 20:57:01,137 - root - INFO - lr: 1.5234e-05 gnorm: 3.98 [ 0:39:45<2 days, 4:53:52]
373
+ [titan] 2026-01-06 20:57:42,670 - root - INFO - step: 39 loss: 8.0002 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.37 mfu: 46.27%
374
+ [titan] 2026-01-06 20:57:42,671 - root - INFO - lr: 1.5625e-05 gnorm: 3.81 [ 0:40:26<2 days, 4:25:18]
375
+ [titan] 2026-01-06 20:58:24,204 - root - INFO - step: 40 loss: 7.9606 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.37 mfu: 46.27%
376
+ [titan] 2026-01-06 20:58:24,204 - root - INFO - lr: 1.6016e-05 gnorm: 2.86 [ 0:41:08<2 days, 3:58:07]
377
+ [titan] 2026-01-06 20:59:05,739 - root - INFO - step: 41 loss: 7.9773 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.36 mfu: 46.27%
378
+ [titan] 2026-01-06 20:59:05,739 - root - INFO - lr: 1.6406e-05 gnorm: 3.56 [ 0:41:49<2 days, 3:32:14]
379
+ [titan] 2026-01-06 20:59:47,255 - root - INFO - step: 42 loss: 7.9890 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.43 mfu: 46.29%
380
+ [titan] 2026-01-06 20:59:47,256 - root - INFO - lr: 1.6797e-05 gnorm: 4.75 [ 0:42:31<2 days, 3:07:32]
381
+ [titan] 2026-01-06 21:00:28,788 - root - INFO - step: 43 loss: 7.9018 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.37 mfu: 46.27%
382
+ [titan] 2026-01-06 21:00:28,789 - root - INFO - lr: 1.7188e-05 gnorm: 3.48 [ 0:43:12<2 days, 2:43:58]
383
+ [titan] 2026-01-06 21:01:10,328 - root - INFO - step: 44 loss: 7.8441 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.35 mfu: 46.26%
384
+ [titan] 2026-01-06 21:01:10,328 - root - INFO - lr: 1.7578e-05 gnorm: 3.89 [ 0:43:54<2 days, 2:21:27]
385
+ [titan] 2026-01-06 21:01:51,869 - root - INFO - step: 45 loss: 7.8679 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.34 mfu: 46.26%
386
+ [titan] 2026-01-06 21:01:51,869 - root - INFO - lr: 1.7969e-05 gnorm: 6.41 [ 0:44:35<2 days, 1:59:54]
387
+ [titan] 2026-01-06 21:02:33,408 - root - INFO - step: 46 loss: 7.7830 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.35 mfu: 46.26%
388
+ [titan] 2026-01-06 21:02:33,408 - root - INFO - lr: 1.8359e-05 gnorm: 3.52 [ 0:45:17<2 days, 1:39:16]
389
+ [titan] 2026-01-06 21:03:14,961 - root - INFO - step: 47 loss: 7.8372 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.30 mfu: 46.25%
390
+ [titan] 2026-01-06 21:03:14,961 - root - INFO - lr: 1.8750e-05 gnorm: 2.22 [ 0:45:58<2 days, 1:19:29]
391
+ [titan] 2026-01-06 21:03:56,497 - root - INFO - step: 48 loss: 7.8147 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.36 mfu: 46.27%
392
+ [titan] 2026-01-06 21:03:56,497 - root - INFO - lr: 1.9141e-05 gnorm: 3.70 [ 0:46:40<2 days, 1:00:29]
393
+ [titan] 2026-01-06 21:03:56,497 - root - INFO - Saving the checkpoint (or staging if async is enabled).
394
+ [titan] 2026-01-06 21:04:16,566 - root - INFO - [GC] GC collection invoked by checkpointer. 0.18 seconds.
395
+ [titan] 2026-01-06 21:04:16,566 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 20.07 seconds.
396
+ [titan] 2026-01-06 21:04:57,970 - root - INFO - step: 49 loss: 7.6970 memory: 71.94GiB(90.77%) tps: 1,066 tflops: 97.54 mfu: 31.26%
397
+ [titan] 2026-01-06 21:04:57,970 - root - INFO - lr: 1.9531e-05 gnorm: 5.28 [ 0:47:41<2 days, 1:02:43]
398
+ [titan] 2026-01-06 21:04:57,981 - root - INFO - [GC] Peforming periodical GC collection. 0.01 seconds.
399
+ [titan] 2026-01-06 21:05:39,421 - root - INFO - step: 50 loss: 7.7536 memory: 71.94GiB(90.77%) tps: 1,581 tflops: 144.65 mfu: 46.36%
400
+ [titan] 2026-01-06 21:05:39,422 - root - INFO - lr: 1.9922e-05 gnorm: 4.06 [ 0:48:23<2 days, 0:44:40]
401
+ [titan] 2026-01-06 21:06:20,891 - root - INFO - step: 51 loss: 7.7578 memory: 71.94GiB(90.77%) tps: 1,580 tflops: 144.59 mfu: 46.34%
402
+ [titan] 2026-01-06 21:06:20,891 - root - INFO - lr: 2.0313e-05 gnorm: 5.03 [ 0:49:04<2 days, 0:27:19]
403
+ [titan] 2026-01-06 21:07:02,402 - root - INFO - step: 52 loss: 7.7586 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.45 mfu: 46.30%
404
+ [titan] 2026-01-06 21:07:02,402 - root - INFO - lr: 2.0703e-05 gnorm: 2.52 [ 0:49:46<2 days, 0:10:39]
405
+ [titan] 2026-01-06 21:07:43,930 - root - INFO - step: 53 loss: 7.7823 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.38 mfu: 46.28%
406
+ [titan] 2026-01-06 21:07:43,930 - root - INFO - lr: 2.1094e-05 gnorm: 11.69 [ 0:50:27<1 day, 23:54:35]
407
+ [titan] 2026-01-06 21:08:25,460 - root - INFO - step: 54 loss: 7.7454 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.38 mfu: 46.28%
408
+ [titan] 2026-01-06 21:08:25,460 - root - INFO - lr: 2.1484e-05 gnorm: 10.25 [ 0:51:09<1 day, 23:39:06]
409
+ [titan] 2026-01-06 21:09:07,002 - root - INFO - step: 55 loss: 7.6959 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.34 mfu: 46.26%
410
+ [titan] 2026-01-06 21:09:07,002 - root - INFO - lr: 2.1875e-05 gnorm: 3.77 [ 0:51:50<1 day, 23:24:10]
411
+ [titan] 2026-01-06 21:09:48,535 - root - INFO - step: 56 loss: 7.7100 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.37 mfu: 46.27%
412
+ [titan] 2026-01-06 21:09:48,536 - root - INFO - lr: 2.2266e-05 gnorm: 5.50 [ 0:52:32<1 day, 23:09:44]
413
+ [titan] 2026-01-06 21:10:30,084 - root - INFO - step: 57 loss: 7.6427 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.32 mfu: 46.25%
414
+ [titan] 2026-01-06 21:10:30,084 - root - INFO - lr: 2.2656e-05 gnorm: 3.45 [ 0:53:14<1 day, 22:55:48]
415
+ [titan] 2026-01-06 21:11:11,627 - root - INFO - step: 58 loss: 7.7081 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.33 mfu: 46.26%
416
+ [titan] 2026-01-06 21:11:11,628 - root - INFO - lr: 2.3047e-05 gnorm: 7.88 [ 0:53:55<1 day, 22:42:19]
417
+ [titan] 2026-01-06 21:11:53,169 - root - INFO - step: 59 loss: 7.6955 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.34 mfu: 46.26%
418
+ [titan] 2026-01-06 21:11:53,170 - root - INFO - lr: 2.3438e-05 gnorm: 7.16 [ 0:54:37<1 day, 22:29:15]
419
+ [titan] 2026-01-06 21:12:34,708 - root - INFO - step: 60 loss: 7.6458 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.35 mfu: 46.27%
420
+ [titan] 2026-01-06 21:12:34,709 - root - INFO - lr: 2.3828e-05 gnorm: 3.22 [ 0:55:18<1 day, 22:16:37]
421
+ [titan] 2026-01-06 21:13:16,244 - root - INFO - step: 61 loss: 7.6709 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.36 mfu: 46.27%
422
+ [titan] 2026-01-06 21:13:16,244 - root - INFO - lr: 2.4219e-05 gnorm: 7.56 [ 0:56:00<1 day, 22:04:21]
423
+ [titan] 2026-01-06 21:13:57,793 - root - INFO - step: 62 loss: 7.6777 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.31 mfu: 46.25%
424
+ [titan] 2026-01-06 21:13:57,793 - root - INFO - lr: 2.4609e-05 gnorm: 5.00 [ 0:56:41<1 day, 21:52:29]
425
+ [titan] 2026-01-06 21:14:39,339 - root - INFO - step: 63 loss: 7.6421 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.32 mfu: 46.26%
426
+ [titan] 2026-01-06 21:14:39,340 - root - INFO - lr: 2.5000e-05 gnorm: 6.81 [ 0:57:23<1 day, 21:40:58]
427
+ [titan] 2026-01-06 21:15:20,872 - root - INFO - step: 64 loss: 7.6401 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.37 mfu: 46.27%
428
+ [titan] 2026-01-06 21:15:20,872 - root - INFO - lr: 2.5391e-05 gnorm: 6.72 [ 0:58:04<1 day, 21:29:47]
429
+ [titan] 2026-01-06 21:15:20,872 - root - INFO - Saving the checkpoint (or staging if async is enabled).
430
+ [titan] 2026-01-06 21:15:41,925 - root - INFO - [GC] GC collection invoked by checkpointer. 0.16 seconds.
431
+ [titan] 2026-01-06 21:15:41,925 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 21.05 seconds.
432
+ [titan] 2026-01-06 21:16:23,249 - root - INFO - step: 65 loss: 7.6475 memory: 71.94GiB(90.77%) tps: 1,051 tflops: 96.13 mfu: 30.81%
433
+ [titan] 2026-01-06 21:16:23,249 - root - INFO - lr: 2.5781e-05 gnorm: 5.00 [ 0:59:07<1 day, 21:34:59]
434
+ [titan] 2026-01-06 21:17:04,689 - root - INFO - step: 66 loss: 7.7008 memory: 71.94GiB(90.77%) tps: 1,581 tflops: 144.69 mfu: 46.38%
435
+ [titan] 2026-01-06 21:17:04,689 - root - INFO - lr: 2.6172e-05 gnorm: 9.69 [ 0:59:48<1 day, 21:24:06]
436
+ [titan] 2026-01-06 21:17:46,152 - root - INFO - step: 67 loss: 7.6772 memory: 71.94GiB(90.77%) tps: 1,581 tflops: 144.61 mfu: 46.35%
437
+ [titan] 2026-01-06 21:17:46,153 - root - INFO - lr: 2.6563e-05 gnorm: 8.06 [ 1:00:30<1 day, 21:13:33]
438
+ [titan] 2026-01-06 21:18:27,650 - root - INFO - step: 68 loss: 7.6251 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.49 mfu: 46.31%
439
+ [titan] 2026-01-06 21:18:27,650 - root - INFO - lr: 2.6953e-05 gnorm: 7.88 [ 1:01:11<1 day, 21:03:18]
440
+ [titan] 2026-01-06 21:19:09,166 - root - INFO - step: 69 loss: 7.6183 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.43 mfu: 46.29%
441
+ [titan] 2026-01-06 21:19:09,166 - root - INFO - lr: 2.7344e-05 gnorm: 4.00 [ 1:01:53<1 day, 20:53:21]
442
+ [titan] 2026-01-06 21:19:50,686 - root - INFO - step: 70 loss: 7.6535 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.41 mfu: 46.29%
443
+ [titan] 2026-01-06 21:19:50,687 - root - INFO - lr: 2.7734e-05 gnorm: 17.75 [ 1:02:34<1 day, 20:43:40]
444
+ [titan] 2026-01-06 21:20:32,220 - root - INFO - step: 71 loss: 7.6713 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.37 mfu: 46.27%
445
+ [titan] 2026-01-06 21:20:32,221 - root - INFO - lr: 2.8125e-05 gnorm: 15.69 [ 1:03:16<1 day, 20:34:15]
446
+ [titan] 2026-01-06 21:21:13,759 - root - INFO - step: 72 loss: 7.5969 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.35 mfu: 46.27%
447
+ [titan] 2026-01-06 21:21:13,759 - root - INFO - lr: 2.8516e-05 gnorm: 5.00 [ 1:03:57<1 day, 20:25:04]
448
+ [titan] 2026-01-06 21:21:55,296 - root - INFO - step: 73 loss: 7.6514 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.35 mfu: 46.27%
449
+ [titan] 2026-01-06 21:21:55,296 - root - INFO - lr: 2.8906e-05 gnorm: 7.84 [ 1:04:39<1 day, 20:16:08]
450
+ [titan] 2026-01-06 21:22:36,834 - root - INFO - step: 74 loss: 7.6118 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.35 mfu: 46.27%
451
+ [titan] 2026-01-06 21:22:36,834 - root - INFO - lr: 2.9297e-05 gnorm: 5.53 [ 1:05:20<1 day, 20:07:24]
452
+ [titan] 2026-01-06 21:23:18,373 - root - INFO - step: 75 loss: 7.6545 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.35 mfu: 46.26%
453
+ [titan] 2026-01-06 21:23:18,373 - root - INFO - lr: 2.9687e-05 gnorm: 14.88 [ 1:06:02<1 day, 19:58:54]
454
+ [titan] 2026-01-06 21:23:59,908 - root - INFO - step: 76 loss: 7.6091 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.36 mfu: 46.27%
455
+ [titan] 2026-01-06 21:23:59,909 - root - INFO - lr: 3.0078e-05 gnorm: 15.25 [ 1:06:43<1 day, 19:50:36]
456
+ [titan] 2026-01-06 21:24:41,441 - root - INFO - step: 77 loss: 7.5815 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.37 mfu: 46.27%
457
+ [titan] 2026-01-06 21:24:41,441 - root - INFO - lr: 3.0469e-05 gnorm: 4.84 [ 1:07:25<1 day, 19:42:30]
458
+ [titan] 2026-01-06 21:25:22,983 - root - INFO - step: 78 loss: 7.6119 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.34 mfu: 46.26%
459
+ [titan] 2026-01-06 21:25:22,983 - root - INFO - lr: 3.0859e-05 gnorm: 9.06 [ 1:08:06<1 day, 19:34:35]
460
+ [titan] 2026-01-06 21:26:04,516 - root - INFO - step: 79 loss: 7.6418 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.37 mfu: 46.27%
461
+ [titan] 2026-01-06 21:26:04,516 - root - INFO - lr: 3.1250e-05 gnorm: 8.25 [ 1:08:48<1 day, 19:26:51]
462
+ [titan] 2026-01-06 21:26:46,049 - root - INFO - step: 80 loss: 7.5575 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.37 mfu: 46.27%
463
+ [titan] 2026-01-06 21:26:46,049 - root - INFO - lr: 3.1641e-05 gnorm: 6.97 [ 1:09:30<1 day, 19:19:18]
464
+ [titan] 2026-01-06 21:26:46,050 - root - INFO - Saving the checkpoint (or staging if async is enabled).
465
+ [titan] 2026-01-06 21:27:08,314 - root - INFO - [GC] GC collection invoked by checkpointer. 0.18 seconds.
466
+ [titan] 2026-01-06 21:27:08,314 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 22.26 seconds.
467
+ [titan] 2026-01-06 21:27:49,686 - root - INFO - step: 81 loss: 7.6005 memory: 71.94GiB(90.77%) tps: 1,030 tflops: 94.22 mfu: 30.20%
468
+ [titan] 2026-01-06 21:27:49,686 - root - INFO - lr: 3.2031e-05 gnorm: 7.19 [ 1:10:33<1 day, 19:25:31]
469
+ [titan] 2026-01-06 21:28:31,108 - root - INFO - step: 82 loss: 7.5774 memory: 71.94GiB(90.77%) tps: 1,582 tflops: 144.76 mfu: 46.40%
470
+ [titan] 2026-01-06 21:28:31,108 - root - INFO - lr: 3.2422e-05 gnorm: 5.62 [ 1:11:15<1 day, 19:18:03]
471
+ [titan] 2026-01-06 21:29:12,555 - root - INFO - step: 83 loss: 7.6207 memory: 71.94GiB(90.77%) tps: 1,581 tflops: 144.67 mfu: 46.37%
472
+ [titan] 2026-01-06 21:29:12,555 - root - INFO - lr: 3.2813e-05 gnorm: 4.69 [ 1:11:56<1 day, 19:10:46]
473
+ [titan] 2026-01-06 21:29:54,023 - root - INFO - step: 84 loss: 7.5734 memory: 71.94GiB(90.77%) tps: 1,580 tflops: 144.59 mfu: 46.34%
474
+ [titan] 2026-01-06 21:29:54,024 - root - INFO - lr: 3.3203e-05 gnorm: 10.75 [ 1:12:37<1 day, 19:03:39]
475
+ [titan] 2026-01-06 21:30:35,519 - root - INFO - step: 85 loss: 7.5241 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.50 mfu: 46.31%
476
+ [titan] 2026-01-06 21:30:35,520 - root - INFO - lr: 3.3594e-05 gnorm: 8.69 [ 1:13:19<1 day, 18:56:42]
477
+ [titan] 2026-01-06 21:31:17,030 - root - INFO - step: 86 loss: 7.5827 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.45 mfu: 46.30%
478
+ [titan] 2026-01-06 21:31:17,030 - root - INFO - lr: 3.3984e-05 gnorm: 7.22 [ 1:14:00<1 day, 18:49:55]
479
+ [titan] 2026-01-06 21:31:58,543 - root - INFO - step: 87 loss: 7.5505 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.44 mfu: 46.29%
480
+ [titan] 2026-01-06 21:31:58,543 - root - INFO - lr: 3.4375e-05 gnorm: 7.91 [ 1:14:42<1 day, 18:43:15]
481
+ [titan] 2026-01-06 21:32:40,071 - root - INFO - step: 88 loss: 7.5143 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.39 mfu: 46.28%
482
+ [titan] 2026-01-06 21:32:40,071 - root - INFO - lr: 3.4766e-05 gnorm: 8.00 [ 1:15:24<1 day, 18:36:45]
483
+ [titan] 2026-01-06 21:33:21,599 - root - INFO - step: 89 loss: 7.5199 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.39 mfu: 46.28%
484
+ [titan] 2026-01-06 21:33:21,599 - root - INFO - lr: 3.5156e-05 gnorm: 8.62 [ 1:16:05<1 day, 18:30:22]
485
+ [titan] 2026-01-06 21:34:03,122 - root - INFO - step: 90 loss: 7.4785 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.40 mfu: 46.28%
486
+ [titan] 2026-01-06 21:34:03,122 - root - INFO - lr: 3.5547e-05 gnorm: 8.12 [ 1:16:47<1 day, 18:24:07]
487
+ [titan] 2026-01-06 21:34:44,655 - root - INFO - step: 91 loss: 7.5003 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.37 mfu: 46.27%
488
+ [titan] 2026-01-06 21:34:44,655 - root - INFO - lr: 3.5937e-05 gnorm: 6.97 [ 1:17:28<1 day, 18:18:00]
489
+ [titan] 2026-01-06 21:35:26,183 - root - INFO - step: 92 loss: 7.5113 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.39 mfu: 46.28%
490
+ [titan] 2026-01-06 21:35:26,183 - root - INFO - lr: 3.6328e-05 gnorm: 10.19 [ 1:18:10<1 day, 18:11:59]
491
+ [titan] 2026-01-06 21:36:07,712 - root - INFO - step: 93 loss: 7.4875 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.38 mfu: 46.28%
492
+ [titan] 2026-01-06 21:36:07,712 - root - INFO - lr: 3.6719e-05 gnorm: 4.59 [ 1:18:51<1 day, 18:06:05]
493
+ [titan] 2026-01-06 21:36:49,202 - root - INFO - step: 94 loss: 7.8691 memory: 71.94GiB(90.77%) tps: 1,580 tflops: 144.52 mfu: 46.32%
494
+ [titan] 2026-01-06 21:36:49,202 - root - INFO - lr: 3.7109e-05 gnorm: 86.50 [ 1:19:33<1 day, 18:00:17]
495
+ [titan] 2026-01-06 21:37:30,710 - root - INFO - step: 95 loss: 7.7993 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.45 mfu: 46.30%
496
+ [titan] 2026-01-06 21:37:30,710 - root - INFO - lr: 3.7500e-05 gnorm: 62.50 [ 1:20:14<1 day, 17:54:36]
497
+ [titan] 2026-01-06 21:38:12,247 - root - INFO - step: 96 loss: 7.6230 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.35 mfu: 46.27%
498
+ [titan] 2026-01-06 21:38:12,247 - root - INFO - lr: 3.7891e-05 gnorm: 17.38 [ 1:20:56<1 day, 17:49:02]
499
+ [titan] 2026-01-06 21:38:12,248 - root - INFO - Saving the checkpoint (or staging if async is enabled).
500
+ [titan] 2026-01-06 21:38:32,925 - root - INFO - [GC] GC collection invoked by checkpointer. 0.18 seconds.
501
+ [titan] 2026-01-06 21:38:32,925 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 20.68 seconds.
502
+ [titan] 2026-01-06 21:39:14,269 - root - INFO - step: 97 loss: 7.5778 memory: 71.94GiB(90.77%) tps: 1,057 tflops: 96.68 mfu: 30.99%
503
+ [titan] 2026-01-06 21:39:14,269 - root - INFO - lr: 3.8281e-05 gnorm: 17.75 [ 1:21:58<1 day, 17:54:02]
504
+ [titan] 2026-01-06 21:39:55,690 - root - INFO - step: 98 loss: 7.5438 memory: 71.94GiB(90.77%) tps: 1,582 tflops: 144.76 mfu: 46.40%
505
+ [titan] 2026-01-06 21:39:55,690 - root - INFO - lr: 3.8672e-05 gnorm: 11.75 [ 1:22:39<1 day, 17:48:29]
506
+ [titan] 2026-01-06 21:40:37,180 - root - INFO - step: 99 loss: 7.5091 memory: 71.94GiB(90.77%) tps: 1,580 tflops: 144.52 mfu: 46.32%
507
+ [titan] 2026-01-06 21:40:37,180 - root - INFO - lr: 3.9063e-05 gnorm: 7.81 [ 1:23:21<1 day, 17:43:05]
508
+ [titan] 2026-01-06 21:40:37,207 - root - INFO - [GC] Peforming periodical GC collection. 0.03 seconds.
509
+ [titan] 2026-01-06 21:41:18,706 - root - INFO - step: 100 loss: 7.4961 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.39 mfu: 46.28%
510
+ [titan] 2026-01-06 21:41:18,706 - root - INFO - lr: 3.9453e-05 gnorm: 7.59 [ 1:24:02<1 day, 17:37:47]
511
+ [titan] 2026-01-06 21:42:00,228 - root - INFO - step: 101 loss: 7.4848 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.41 mfu: 46.28%
512
+ [titan] 2026-01-06 21:42:00,228 - root - INFO - lr: 3.9844e-05 gnorm: 5.97 [ 1:24:44<1 day, 17:32:35]
513
+ [titan] 2026-01-06 21:42:41,739 - root - INFO - step: 102 loss: 7.5118 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.45 mfu: 46.30%
514
+ [titan] 2026-01-06 21:42:41,739 - root - INFO - lr: 4.0234e-05 gnorm: 8.06 [ 1:25:25<1 day, 17:27:27]
515
+ [titan] 2026-01-06 21:43:23,265 - root - INFO - step: 103 loss: 7.4788 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.39 mfu: 46.28%
516
+ [titan] 2026-01-06 21:43:23,265 - root - INFO - lr: 4.0625e-05 gnorm: 10.06 [ 1:26:07<1 day, 17:22:26]
517
+ [titan] 2026-01-06 21:44:04,785 - root - INFO - step: 104 loss: 7.4560 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.41 mfu: 46.29%
518
+ [titan] 2026-01-06 21:44:04,786 - root - INFO - lr: 4.1016e-05 gnorm: 9.50 [ 1:26:48<1 day, 17:17:29]
519
+ [titan] 2026-01-06 21:44:46,319 - root - INFO - step: 105 loss: 7.4534 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.37 mfu: 46.27%
520
+ [titan] 2026-01-06 21:44:46,319 - root - INFO - lr: 4.1406e-05 gnorm: 8.44 [ 1:27:30<1 day, 17:12:37]
521
+ [titan] 2026-01-06 21:45:27,838 - root - INFO - step: 106 loss: 7.4770 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.42 mfu: 46.29%
522
+ [titan] 2026-01-06 21:45:27,838 - root - INFO - lr: 4.1797e-05 gnorm: 10.56 [ 1:28:11<1 day, 17:07:50]
523
+ [titan] 2026-01-06 21:46:09,374 - root - INFO - step: 107 loss: 7.4382 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.36 mfu: 46.27%
524
+ [titan] 2026-01-06 21:46:09,374 - root - INFO - lr: 4.2188e-05 gnorm: 13.69 [ 1:28:53<1 day, 17:03:07]
525
+ [titan] 2026-01-06 21:46:50,902 - root - INFO - step: 108 loss: 7.4561 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.39 mfu: 46.28%
526
+ [titan] 2026-01-06 21:46:50,902 - root - INFO - lr: 4.2578e-05 gnorm: 8.69 [ 1:29:34<1 day, 16:58:29]
527
+ [titan] 2026-01-06 21:47:32,443 - root - INFO - step: 109 loss: 7.3967 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.34 mfu: 46.26%
528
+ [titan] 2026-01-06 21:47:32,443 - root - INFO - lr: 4.2969e-05 gnorm: 7.31 [ 1:30:16<1 day, 16:53:56]
529
+ [titan] 2026-01-06 21:48:13,976 - root - INFO - step: 110 loss: 7.4334 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.37 mfu: 46.27%
530
+ [titan] 2026-01-06 21:48:13,976 - root - INFO - lr: 4.3359e-05 gnorm: 25.38 [ 1:30:57<1 day, 16:49:26]
531
+ [titan] 2026-01-06 21:48:55,511 - root - INFO - step: 111 loss: 7.4360 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.36 mfu: 46.27%
532
+ [titan] 2026-01-06 21:48:55,511 - root - INFO - lr: 4.3750e-05 gnorm: 10.44 [ 1:31:39<1 day, 16:45:01]
533
+ [titan] 2026-01-06 21:49:37,059 - root - INFO - step: 112 loss: 7.5123 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.32 mfu: 46.26%
534
+ [titan] 2026-01-06 21:49:37,059 - root - INFO - lr: 4.4141e-05 gnorm: 16.88 [ 1:32:20<1 day, 16:40:40]
535
+ [titan] 2026-01-06 21:49:37,059 - root - INFO - Saving the checkpoint (or staging if async is enabled).
536
+ [titan] 2026-01-06 21:49:59,564 - root - INFO - [GC] GC collection invoked by checkpointer. 0.13 seconds.
537
+ [titan] 2026-01-06 21:49:59,565 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 22.51 seconds.
538
+ [titan] 2026-01-06 21:50:40,891 - root - INFO - step: 113 loss: 7.4803 memory: 71.94GiB(90.77%) tps: 1,027 tflops: 93.93 mfu: 30.11%
539
+ [titan] 2026-01-06 21:50:40,892 - root - INFO - lr: 4.4531e-05 gnorm: 13.06 [ 1:33:24<1 day, 16:46:07]
540
+ [titan] 2026-01-06 21:51:22,305 - root - INFO - step: 114 loss: 7.4859 memory: 71.94GiB(90.77%) tps: 1,582 tflops: 144.79 mfu: 46.41%
541
+ [titan] 2026-01-06 21:51:22,305 - root - INFO - lr: 4.4922e-05 gnorm: 16.50 [ 1:34:06<1 day, 16:41:45]
542
+ [titan] 2026-01-06 21:52:03,747 - root - INFO - step: 115 loss: 7.4151 memory: 71.94GiB(90.77%) tps: 1,581 tflops: 144.68 mfu: 46.37%
543
+ [titan] 2026-01-06 21:52:03,748 - root - INFO - lr: 4.5313e-05 gnorm: 13.94 [ 1:34:47<1 day, 16:37:27]
544
+ [titan] 2026-01-06 21:52:45,252 - root - INFO - step: 116 loss: 7.3814 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.47 mfu: 46.30%
545
+ [titan] 2026-01-06 21:52:45,252 - root - INFO - lr: 4.5703e-05 gnorm: 11.69 [ 1:35:29<1 day, 16:33:15]
546
+ [titan] 2026-01-06 21:53:26,760 - root - INFO - step: 117 loss: 7.4033 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.46 mfu: 46.30%
547
+ [titan] 2026-01-06 21:53:26,760 - root - INFO - lr: 4.6094e-05 gnorm: 9.31 [ 1:36:10<1 day, 16:29:07]
548
+ [titan] 2026-01-06 21:54:08,279 - root - INFO - step: 118 loss: 7.4721 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.42 mfu: 46.29%
549
+ [titan] 2026-01-06 21:54:08,279 - root - INFO - lr: 4.6484e-05 gnorm: 20.88 [ 1:36:52<1 day, 16:25:02]
550
+ [titan] 2026-01-06 21:54:49,813 - root - INFO - step: 119 loss: 7.4258 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.36 mfu: 46.27%
551
+ [titan] 2026-01-06 21:54:49,813 - root - INFO - lr: 4.6875e-05 gnorm: 16.62 [ 1:37:33<1 day, 16:21:01]
552
+ [titan] 2026-01-06 21:55:31,360 - root - INFO - step: 120 loss: 7.3951 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.32 mfu: 46.26%
553
+ [titan] 2026-01-06 21:55:31,361 - root - INFO - lr: 4.7266e-05 gnorm: 11.38 [ 1:38:15<1 day, 16:17:04]
554
+ [titan] 2026-01-06 21:56:12,904 - root - INFO - step: 121 loss: 7.3984 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.33 mfu: 46.26%
555
+ [titan] 2026-01-06 21:56:12,904 - root - INFO - lr: 4.7656e-05 gnorm: 10.19 [ 1:38:56<1 day, 16:13:10]
556
+ [titan] 2026-01-06 21:56:54,444 - root - INFO - step: 122 loss: 7.5098 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.34 mfu: 46.26%
557
+ [titan] 2026-01-06 21:56:54,444 - root - INFO - lr: 4.8047e-05 gnorm: 19.38 [ 1:39:38<1 day, 16:09:19]
558
+ [titan] 2026-01-06 21:57:35,983 - root - INFO - step: 123 loss: 7.4071 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.35 mfu: 46.27%
559
+ [titan] 2026-01-06 21:57:35,983 - root - INFO - lr: 4.8438e-05 gnorm: 13.25 [ 1:40:19<1 day, 16:05:31]
560
+ [titan] 2026-01-06 21:58:17,525 - root - INFO - step: 124 loss: 7.4271 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.34 mfu: 46.26%
561
+ [titan] 2026-01-06 21:58:17,525 - root - INFO - lr: 4.8828e-05 gnorm: 11.88 [ 1:41:01<1 day, 16:01:46]
562
+ [titan] 2026-01-06 21:58:59,075 - root - INFO - step: 125 loss: 7.3603 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.31 mfu: 46.25%
563
+ [titan] 2026-01-06 21:58:59,075 - root - INFO - lr: 4.9219e-05 gnorm: 11.50 [ 1:41:43<1 day, 15:58:04]
564
+ [titan] 2026-01-06 21:59:40,618 - root - INFO - step: 126 loss: 7.3625 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.33 mfu: 46.26%
565
+ [titan] 2026-01-06 21:59:40,619 - root - INFO - lr: 4.9609e-05 gnorm: 9.88 [ 1:42:24<1 day, 15:54:25]
566
+ [titan] 2026-01-06 22:00:22,155 - root - INFO - step: 127 loss: 7.3691 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.36 mfu: 46.27%
567
+ [titan] 2026-01-06 22:00:22,156 - root - INFO - lr: 5.0000e-05 gnorm: 11.88 [ 1:43:06<1 day, 15:50:49]
568
+ [titan] 2026-01-06 22:01:03,694 - root - INFO - step: 128 loss: 7.3331 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.35 mfu: 46.27%
569
+ [titan] 2026-01-06 22:01:03,694 - root - INFO - lr: 5.0391e-05 gnorm: 11.56 [ 1:43:47<1 day, 15:47:15]
570
+ [titan] 2026-01-06 22:01:03,694 - root - INFO - Saving the checkpoint (or staging if async is enabled).
571
+ [titan] 2026-01-06 22:01:24,083 - root - INFO - [GC] GC collection invoked by checkpointer. 0.21 seconds.
572
+ [titan] 2026-01-06 22:01:24,083 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 20.39 seconds.
573
+ [titan] 2026-01-06 22:02:05,453 - root - INFO - step: 129 loss: 7.2878 memory: 71.94GiB(90.77%) tps: 1,061 tflops: 97.09 mfu: 31.12%
574
+ [titan] 2026-01-06 22:02:05,454 - root - INFO - lr: 5.0781e-05 gnorm: 6.16 [ 1:44:49<1 day, 15:51:25]
575
+ [titan] 2026-01-06 22:02:46,875 - root - INFO - step: 130 loss: 7.7017 memory: 71.94GiB(90.77%) tps: 1,582 tflops: 144.76 mfu: 46.40%
576
+ [titan] 2026-01-06 22:02:46,876 - root - INFO - lr: 5.1172e-05 gnorm: 70.00 [ 1:45:30<1 day, 15:47:51]
577
+ [titan] 2026-01-06 22:03:28,339 - root - INFO - step: 131 loss: 7.5220 memory: 71.94GiB(90.77%) tps: 1,581 tflops: 144.61 mfu: 46.35%
578
+ [titan] 2026-01-06 22:03:28,339 - root - INFO - lr: 5.1562e-05 gnorm: 44.75 [ 1:46:12<1 day, 15:44:19]
579
+ [titan] 2026-01-06 22:04:09,859 - root - INFO - step: 132 loss: 7.4566 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.41 mfu: 46.29%
580
+ [titan] 2026-01-06 22:04:09,859 - root - INFO - lr: 5.1953e-05 gnorm: 13.50 [ 1:46:53<1 day, 15:40:52]
581
+ [titan] 2026-01-06 22:04:51,387 - root - INFO - step: 133 loss: 7.4026 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.39 mfu: 46.28%
582
+ [titan] 2026-01-06 22:04:51,387 - root - INFO - lr: 5.2344e-05 gnorm: 10.12 [ 1:47:35<1 day, 15:37:27]
583
+ [titan] 2026-01-06 22:05:32,919 - root - INFO - step: 134 loss: 7.4092 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.37 mfu: 46.27%
584
+ [titan] 2026-01-06 22:05:32,920 - root - INFO - lr: 5.2734e-05 gnorm: 14.88 [ 1:48:16<1 day, 15:34:05]
585
+ [titan] 2026-01-06 22:06:14,471 - root - INFO - step: 135 loss: 7.3827 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.30 mfu: 46.25%
586
+ [titan] 2026-01-06 22:06:14,471 - root - INFO - lr: 5.3125e-05 gnorm: 18.88 [ 1:48:58<1 day, 15:30:46]
587
+ [titan] 2026-01-06 22:06:56,027 - root - INFO - step: 136 loss: 7.4021 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.29 mfu: 46.25%
588
+ [titan] 2026-01-06 22:06:56,027 - root - INFO - lr: 5.3516e-05 gnorm: 12.81 [ 1:49:39<1 day, 15:27:29]
589
+ [titan] 2026-01-06 22:07:37,581 - root - INFO - step: 137 loss: 7.4064 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.30 mfu: 46.25%
590
+ [titan] 2026-01-06 22:07:37,581 - root - INFO - lr: 5.3906e-05 gnorm: 7.19 [ 1:50:21<1 day, 15:24:14]
591
+ [titan] 2026-01-06 22:08:19,129 - root - INFO - step: 138 loss: 7.4774 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.32 mfu: 46.26%
592
+ [titan] 2026-01-06 22:08:19,129 - root - INFO - lr: 5.4297e-05 gnorm: 22.62 [ 1:51:03<1 day, 15:21:02]
593
+ [titan] 2026-01-06 22:09:00,687 - root - INFO - step: 139 loss: 7.4281 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.28 mfu: 46.24%
594
+ [titan] 2026-01-06 22:09:00,688 - root - INFO - lr: 5.4688e-05 gnorm: 11.00 [ 1:51:44<1 day, 15:17:52]
595
+ [titan] 2026-01-06 22:09:42,228 - root - INFO - step: 140 loss: 7.5633 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.34 mfu: 46.26%
596
+ [titan] 2026-01-06 22:09:42,228 - root - INFO - lr: 5.5078e-05 gnorm: 19.75 [ 1:52:26<1 day, 15:14:43]
597
+ [titan] 2026-01-06 22:10:23,790 - root - INFO - step: 141 loss: 7.5423 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.27 mfu: 46.24%
598
+ [titan] 2026-01-06 22:10:23,790 - root - INFO - lr: 5.5469e-05 gnorm: 17.25 [ 1:53:07<1 day, 15:11:37]
599
+ [titan] 2026-01-06 22:11:05,349 - root - INFO - step: 142 loss: 7.4047 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.28 mfu: 46.24%
600
+ [titan] 2026-01-06 22:11:05,349 - root - INFO - lr: 5.5859e-05 gnorm: 9.94 [ 1:53:49<1 day, 15:08:33]
601
+ [titan] 2026-01-06 22:11:46,905 - root - INFO - step: 143 loss: 7.5261 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.29 mfu: 46.25%
602
+ [titan] 2026-01-06 22:11:46,905 - root - INFO - lr: 5.6250e-05 gnorm: 25.75 [ 1:54:30<1 day, 15:05:31]
603
+ [titan] 2026-01-06 22:12:28,460 - root - INFO - step: 144 loss: 7.4217 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.29 mfu: 46.25%
604
+ [titan] 2026-01-06 22:12:28,461 - root - INFO - lr: 5.6641e-05 gnorm: 18.00 [ 1:55:12<1 day, 15:02:31]
605
+ [titan] 2026-01-06 22:12:28,461 - root - INFO - Saving the checkpoint (or staging if async is enabled).
606
+ [titan] 2026-01-06 22:12:50,153 - root - INFO - [GC] GC collection invoked by checkpointer. 0.17 seconds.
607
+ [titan] 2026-01-06 22:12:50,153 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 21.69 seconds.
608
+ [titan] 2026-01-06 22:13:31,510 - root - INFO - step: 145 loss: 7.3958 memory: 71.94GiB(90.77%) tps: 1,039 tflops: 95.10 mfu: 30.48%
609
+ [titan] 2026-01-06 22:13:31,510 - root - INFO - lr: 5.7031e-05 gnorm: 11.69 [ 1:56:15<1 day, 15:06:47]
610
+ [titan] 2026-01-06 22:14:12,944 - root - INFO - step: 146 loss: 7.4073 memory: 71.94GiB(90.77%) tps: 1,582 tflops: 144.71 mfu: 46.38%
611
+ [titan] 2026-01-06 22:14:12,944 - root - INFO - lr: 5.7422e-05 gnorm: 11.25 [ 1:56:56<1 day, 15:03:45]
612
+ [titan] 2026-01-06 22:14:54,370 - root - INFO - step: 147 loss: 7.3301 memory: 71.94GiB(90.77%) tps: 1,582 tflops: 144.74 mfu: 46.39%
613
+ [titan] 2026-01-06 22:14:54,371 - root - INFO - lr: 5.7813e-05 gnorm: 7.34 [ 1:57:38<1 day, 15:00:45]
614
+ [titan] 2026-01-06 22:15:35,825 - root - INFO - step: 148 loss: 7.3624 memory: 71.94GiB(90.77%) tps: 1,581 tflops: 144.64 mfu: 46.36%
615
+ [titan] 2026-01-06 22:15:35,825 - root - INFO - lr: 5.8203e-05 gnorm: 17.38 [ 1:58:19<1 day, 14:57:48]
616
+ [titan] 2026-01-06 22:16:17,356 - root - INFO - step: 149 loss: 7.2913 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.37 mfu: 46.27%
617
+ [titan] 2026-01-06 22:16:17,357 - root - INFO - lr: 5.8594e-05 gnorm: 3.80 [ 1:59:01<1 day, 14:54:53]
618
+ [titan] 2026-01-06 22:16:17,392 - root - INFO - [GC] Peforming periodical GC collection. 0.04 seconds.
619
+ [titan] 2026-01-06 22:16:58,923 - root - INFO - step: 150 loss: 7.3146 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.25 mfu: 46.23%
620
+ [titan] 2026-01-06 22:16:58,923 - root - INFO - lr: 5.8984e-05 gnorm: 7.06 [ 1:59:42<1 day, 14:52:01]
logs/none_4cvjdbqa/attempt_0/4/stdout.log ADDED
File without changes
logs/none_4cvjdbqa/attempt_0/5/stderr.log ADDED
@@ -0,0 +1,620 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [titan] 2026-01-06 20:23:28,614 - root - INFO - Starting job: default job
2
+ [titan] 2026-01-06 20:23:28,614 - root - INFO - {
3
+ "activation_checkpoint": {
4
+ "mode": "none",
5
+ "selective_ac_option": "2"
6
+ },
7
+ "activation_offload": {
8
+ "mode": "none"
9
+ },
10
+ "checkpoint": {
11
+ "async_mode": "disabled",
12
+ "create_seed_checkpoint": false,
13
+ "enable_checkpoint": true,
14
+ "exclude_from_loading": [],
15
+ "export_dtype": "float32",
16
+ "folder": "checkpoint",
17
+ "initial_load_model_weights_only": true,
18
+ "initial_load_path": null,
19
+ "interval": 16,
20
+ "interval_type": "steps",
21
+ "keep_latest_k": 0,
22
+ "last_save_model_weights_only": false,
23
+ "load_step": -1,
24
+ "model_weights_only": false
25
+ },
26
+ "comm": {
27
+ "init_timeout_seconds": 300,
28
+ "trace_buf_size": 20000,
29
+ "train_timeout_seconds": 100
30
+ },
31
+ "experimental": {
32
+ "context_parallel_degree": 1,
33
+ "context_parallel_rotate_method": "allgather",
34
+ "custom_model_path": "",
35
+ "enable_async_tensor_parallel": false,
36
+ "enable_compiled_autograd": false,
37
+ "pipeline_parallel_degree": 1,
38
+ "pipeline_parallel_microbatches": null,
39
+ "pipeline_parallel_schedule": "1F1B",
40
+ "pipeline_parallel_schedule_csv": "",
41
+ "pipeline_parallel_split_points": []
42
+ },
43
+ "fault_tolerance": {
44
+ "enable": false,
45
+ "group_size": 0,
46
+ "min_replica_size": 1,
47
+ "replica_id": 0
48
+ },
49
+ "float8": {
50
+ "enable_fsdp_float8_all_gather": false,
51
+ "force_recompute_fp8_weight_in_bwd": false,
52
+ "precompute_float8_dynamic_scale_for_fsdp": false,
53
+ "recipe_name": null
54
+ },
55
+ "job": {
56
+ "config_file": "flame/models/fla.toml",
57
+ "description": "default job",
58
+ "dump_folder": "/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B",
59
+ "print_args": true,
60
+ "use_for_integration_test": false
61
+ },
62
+ "lr_scheduler": {
63
+ "decay_ratio": null,
64
+ "decay_type": "cosine",
65
+ "lr_min": 0.1,
66
+ "warmup_steps": 1024
67
+ },
68
+ "memory_estimation": {
69
+ "disable_fake_mode": false,
70
+ "enabled": false
71
+ },
72
+ "metrics": {
73
+ "disable_color_printing": false,
74
+ "enable_tensorboard": false,
75
+ "enable_wandb": true,
76
+ "log_freq": 1,
77
+ "save_for_all_ranks": false,
78
+ "save_tb_folder": "tb"
79
+ },
80
+ "model": {
81
+ "config": "/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/config.json",
82
+ "converters": [],
83
+ "name": "fla",
84
+ "print_after_conversion": false,
85
+ "tokenizer_path": "/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B"
86
+ },
87
+ "optimizer": {
88
+ "beta1": 0.9,
89
+ "beta2": 0.95,
90
+ "early_step_in_backward": false,
91
+ "eps": 1e-15,
92
+ "implementation": "fused",
93
+ "lr": 0.0004,
94
+ "name": "AdamW",
95
+ "weight_decay": 0.1
96
+ },
97
+ "profiling": {
98
+ "enable_memory_snapshot": false,
99
+ "enable_profiling": true,
100
+ "profile_freq": 512,
101
+ "save_memory_snapshot_folder": "memory_snapshot",
102
+ "save_traces_folder": "profile_trace"
103
+ },
104
+ "training": {
105
+ "batch_size": 2,
106
+ "compile": true,
107
+ "context_len": 2048,
108
+ "data_dir": null,
109
+ "data_files": null,
110
+ "data_parallel_replicate_degree": 1,
111
+ "data_parallel_shard_degree": 8,
112
+ "data_probs": null,
113
+ "dataset": "/mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu",
114
+ "dataset_name": "default",
115
+ "dataset_split": "train",
116
+ "deterministic": false,
117
+ "disable_loss_parallel": true,
118
+ "enable_cpu_offload": false,
119
+ "fsdp_reshard_after_forward": "default",
120
+ "gc_freq": 50,
121
+ "gradient_accumulation_steps": 16,
122
+ "max_norm": 1.0,
123
+ "mixed_precision_param": "bfloat16",
124
+ "mixed_precision_reduce": "float32",
125
+ "num_workers": 8,
126
+ "persistent_workers": false,
127
+ "pin_memory": false,
128
+ "prefetch_factor": 2,
129
+ "seed": 42,
130
+ "seq_len": 2048,
131
+ "skip_nan_inf": true,
132
+ "steps": 3072,
133
+ "streaming": true,
134
+ "tensor_parallel_degree": 1,
135
+ "varlen": false
136
+ }
137
+ }
138
+ [titan] 2026-01-06 20:23:28,615 - root - INFO - [GC] Initial GC collection. 0.00 seconds.
139
+ [titan] 2026-01-06 20:23:29,968 - root - WARNING - ENV[TORCH_NCCL_ASYNC_ERROR_HANDLING] = 1 will be overridden to 3 based on job config
140
+ [titan] 2026-01-06 20:23:29,972 - root - INFO - CUDA capacity: NVIDIA A100-SXM4-80GB with 79.25GiB memory
141
+ [titan] 2026-01-06 20:23:29,974 - root - WARNING - Error running lspci: [Errno 2] No such file or directory: 'lspci', fallback to use device_name
142
+ [titan] 2026-01-06 20:23:29,974 - root - INFO - Peak FLOPS used for computing MFU: 3.120e+14
143
+ [titan] 2026-01-06 20:23:29,974 - root - INFO - Building 1-D device mesh with ['dp_shard'], [8]
144
+ [titan] 2026-01-06 20:23:30,052 - root - INFO - Loading tokenizer...
145
+ The tokenizer you are loading from '/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.
146
+ [titan] 2026-01-06 20:23:30,412 - root - INFO - Qwen2TokenizerFast(name_or_path='/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B', vocab_size=151643, model_max_length=10000000000, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'eos_token': '<|im_end|>', 'pad_token': '<|endoftext|>', 'additional_special_tokens': ['<|im_start|>', '<|im_end|>', '<|object_ref_start|>', '<|object_ref_end|>', '<|box_start|>', '<|box_end|>', '<|quad_start|>', '<|quad_end|>', '<|vision_start|>', '<|vision_end|>', '<|vision_pad|>', '<|image_pad|>', '<|video_pad|>']}, clean_up_tokenization_spaces=False, added_tokens_decoder={
147
+ 151643: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
148
+ 151644: AddedToken("<|im_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
149
+ 151645: AddedToken("<|im_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
150
+ 151646: AddedToken("<|object_ref_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
151
+ 151647: AddedToken("<|object_ref_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
152
+ 151648: AddedToken("<|box_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
153
+ 151649: AddedToken("<|box_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
154
+ 151650: AddedToken("<|quad_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
155
+ 151651: AddedToken("<|quad_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
156
+ 151652: AddedToken("<|vision_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
157
+ 151653: AddedToken("<|vision_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
158
+ 151654: AddedToken("<|vision_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
159
+ 151655: AddedToken("<|image_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
160
+ 151656: AddedToken("<|video_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
161
+ 151657: AddedToken("<tool_call>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
162
+ 151658: AddedToken("</tool_call>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
163
+ 151659: AddedToken("<|fim_prefix|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
164
+ 151660: AddedToken("<|fim_middle|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
165
+ 151661: AddedToken("<|fim_suffix|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
166
+ 151662: AddedToken("<|fim_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
167
+ 151663: AddedToken("<|repo_name|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
168
+ 151664: AddedToken("<|file_sep|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
169
+ 151665: AddedToken("<tool_response>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
170
+ 151666: AddedToken("</tool_response>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
171
+ 151667: AddedToken("<think>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
172
+ 151668: AddedToken("</think>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
173
+ }
174
+ )
175
+ [titan] 2026-01-06 20:23:30,412 - root - INFO - Loading dataset /mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu:default
176
+ `trust_remote_code` is not supported anymore.
177
+ Please check that the Hugging Face dataset '/mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu' isn't based on a loading script and remove `trust_remote_code`.
178
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
179
+ [titan] 2026-01-06 20:23:30,412 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
180
+ Please check that the Hugging Face dataset '/mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu' isn't based on a loading script and remove `trust_remote_code`.
181
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
182
+ [titan] 2026-01-06 20:23:30,977 - root - INFO - Shuffling the dataset with seed 42
183
+ [titan] 2026-01-06 20:23:30,978 - root - INFO - IterableDataset({
184
+ features: ['text', 'id', 'dump', 'url', 'file_path', 'language', 'language_score', 'token_count', 'score', 'int_score'],
185
+ num_shards: 360
186
+ })
187
+ [titan] 2026-01-06 20:23:30,978 - root - INFO - Building dataloader...
188
+ [titan] 2026-01-06 20:23:30,980 - root - INFO - Loading model config from /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/config.json
189
+ [titan] 2026-01-06 20:23:30,981 - root - INFO - Building model from the config
190
+ GSAConfig {
191
+ "architectures": [
192
+ "GSAForCausalLM"
193
+ ],
194
+ "attn": null,
195
+ "bos_token_id": 151643,
196
+ "clamp_max": null,
197
+ "clamp_min": null,
198
+ "conv_size": 4,
199
+ "dtype": "bfloat16",
200
+ "elementwise_affine": false,
201
+ "eos_token_id": 151645,
202
+ "expand_k": 1,
203
+ "expand_v": 1,
204
+ "feature_map": "swish",
205
+ "fuse_cross_entropy": true,
206
+ "fuse_linear_cross_entropy": false,
207
+ "fuse_norm": true,
208
+ "fuse_swiglu": true,
209
+ "gate_logit_normalizer": 8,
210
+ "hidden_act": "swish",
211
+ "hidden_ratio": 4,
212
+ "hidden_size": 5120,
213
+ "initializer_range": 0.02,
214
+ "intermediate_size": 17408,
215
+ "max_position_embeddings": 40960,
216
+ "model_type": "gsa",
217
+ "norm_eps": 1e-06,
218
+ "num_heads": 40,
219
+ "num_hidden_layers": 40,
220
+ "num_kv_heads": 8,
221
+ "num_slots": 256,
222
+ "rope_theta": 1000000,
223
+ "share_conv_kernel": true,
224
+ "tie_word_embeddings": true,
225
+ "transformers_version": "4.57.3",
226
+ "use_cache": true,
227
+ "use_l2warp": false,
228
+ "use_norm": true,
229
+ "use_output_gate": true,
230
+ "use_rope": false,
231
+ "use_short_conv": false,
232
+ "vocab_size": 151936
233
+ }
234
+ 
235
+ [titan] 2026-01-06 20:23:31,129 - root - INFO - 
236
+ GSAForCausalLM(
237
+ (model): GSAModel(
238
+ (embeddings): Embedding(151936, 5120)
239
+ (layers): ModuleList(
240
+ (0-39): 40 x GSABlock(
241
+ (attn_norm): RMSNorm(5120, eps=1e-06)
242
+ (attn): GatedSlotAttention(
243
+ (feature_map): SwishFeatureMap()
244
+ (q_proj): Linear(in_features=5120, out_features=5120, bias=False)
245
+ (k_proj): Linear(in_features=5120, out_features=1024, bias=False)
246
+ (v_proj): Linear(in_features=5120, out_features=1024, bias=False)
247
+ (f_proj): Linear(in_features=5120, out_features=2048, bias=False)
248
+ (g_norm): RMSNorm(5120, elementwise_affine=False, eps=1e-06)
249
+ (o_proj): Linear(in_features=5120, out_features=5120, bias=False)
250
+ )
251
+ (mlp_norm): RMSNorm(5120, eps=1e-06)
252
+ (mlp): GatedMLP(
253
+ (gate_proj): Linear(in_features=5120, out_features=17408, bias=False)
254
+ (up_proj): Linear(in_features=5120, out_features=17408, bias=False)
255
+ (down_proj): Linear(in_features=17408, out_features=5120, bias=False)
256
+ (swiglu_linear): SwiGLULinear()
257
+ )
258
+ )
259
+ )
260
+ (norm): RMSNorm(5120, eps=1e-06)
261
+ )
262
+ (lm_head): Linear(in_features=5120, out_features=151936, bias=False)
263
+ )
264
+
265
+ [titan] 2026-01-06 20:23:31,186 - root - INFO - Compiling each block with torch.compile
266
+ [titan] 2026-01-06 20:23:31,186 - root - INFO - Compiling the embedding, norm, and lm_head layers with torch.compile
267
+ [titan] 2026-01-06 20:23:31,187 - root - INFO - Compiling the entire model with torch.compile
268
+ [titan] 2026-01-06 20:23:31,333 - root - INFO - Applied FSDP to the model
269
+ [titan] 2026-01-06 20:23:31,716 - root - INFO - CUDA memory usage for model: 3.56GiB(4.49%)
270
+ [titan] 2026-01-06 20:23:31,763 - root - INFO - Checkpointing active. Checkpoints will be loaded from and saved to /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/checkpoint
271
+ [titan] 2026-01-06 20:23:31,763 - root - INFO - Loading the checkpoint from /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/checkpoint/step-1.
272
+ [titan] 2026-01-06 20:24:20,481 - root - INFO - [GC] GC collection for checkpoint loading. 0.02 seconds.
273
+ [titan] 2026-01-06 20:24:20,481 - root - INFO - Finished loading the checkpoint in 48.72 seconds.
274
+ [titan] 2026-01-06 20:24:20,697 - root - INFO - CUDA capacity: NVIDIA A100-SXM4-80GB with 79.25GiB memory
275
+ [titan] 2026-01-06 20:24:20,699 - root - WARNING - Error running lspci: [Errno 2] No such file or directory: 'lspci', fallback to use device_name
276
+ [titan] 2026-01-06 20:24:23,481 - root - INFO - Mixed precision training is handled by fully_shard
277
+ [titan] 2026-01-06 20:24:23,481 - root - INFO - ***** Running training *****
278
+ [titan] 2026-01-06 20:24:23,481 - root - INFO -  Training starts at step 2
279
+ [titan] 2026-01-06 20:24:23,481 - root - INFO -  Number of tokens per sequence = 2,048
280
+ [titan] 2026-01-06 20:24:23,481 - root - INFO -  Gradient Accumulation steps = 16
281
+ [titan] 2026-01-06 20:24:23,481 - root - INFO -  Instantaneous batch size (per device) = 2
282
+ [titan] 2026-01-06 20:24:23,482 - root - INFO -  Global batch size (w. parallel, distributed & accumulation) = 256 (524,288 tokens)
283
+ [titan] 2026-01-06 20:24:23,482 - root - INFO -  Total optimization steps = 3,072 (1,610,612,736 tokens)
284
+ [titan] 2026-01-06 20:24:23,482 - root - INFO -  Warmup steps = 1,024 (536,870,912 tokens)
285
+ [titan] 2026-01-06 20:24:23,482 - root - INFO -  Number of parameters = 14,409,815,040 
286
+ [titan] 2026-01-06 20:24:23,482 - root - INFO - Profiling active. Traces will be saved at /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/profile_trace
287
+ /mnt/scratch/home/carlos/leman-labs/flame/.venv/lib/python3.13/site-packages/torch/_dynamo/variables/functions.py:1692: UserWarning: Dynamo detected a call to a `functools.lru_cache`-wrapped function. Dynamo ignores the cache wrapper and directly traces the wrapped function. Silent incorrectness is only a *potential* risk, not something we have observed. Enable TORCH_LOGS="+dynamo" for a DEBUG stack trace.
288
+ torch._dynamo.utils.warn_once(msg)
289
+ /mnt/scratch/home/carlos/leman-labs/flame/.venv/lib/python3.13/site-packages/torch/_dynamo/variables/functions.py:1598: UserWarning: Dynamo does not know how to trace the builtin `cuda_utils.get_device_properties.` This function is either a Python builtin (e.g. _warnings.warn) or a third-party C/C++ Python extension (perhaps created with pybind).
290
+ If it is a Python builtin, please file an issue on GitHub so the PyTorch team can add support for it and see the next case for a workaround.
291
+ If it is a third-party C/C++ Python extension, please either wrap it into a PyTorch-understood custom operator (see https://pytorch.org/tutorials/advanced/custom_ops_landing_page.html for more details) or, if it is traceable, use `torch.compiler.allow_in_graph`.
292
+ torch._dynamo.utils.warn_once(explanation + "\n" + "\n".join(hints))
293
+ [titan] 2026-01-06 20:31:17,558 - root - INFO - step: 2 loss: 14.3989 memory: 71.94GiB(90.77%) tps: 157 tflops: 14.38 mfu: 4.61%
294
+ [titan] 2026-01-06 20:31:17,558 - root - INFO - lr: 1.1719e-06 gnorm: 127.00 [ 0:14:01<14 days, 22:48:36]
295
+ [titan] 2026-01-06 20:31:58,854 - root - INFO - step: 3 loss: 14.3925 memory: 71.94GiB(90.77%) tps: 1,587 tflops: 145.20 mfu: 46.54%
296
+ [titan] 2026-01-06 20:31:58,854 - root - INFO - lr: 1.5625e-06 gnorm: 126.00 [ 0:14:42<10 days, 10:51:49]
297
+ [titan] 2026-01-06 20:32:40,204 - root - INFO - step: 4 loss: 14.2932 memory: 71.94GiB(90.77%) tps: 1,585 tflops: 145.01 mfu: 46.48%
298
+ [titan] 2026-01-06 20:32:40,205 - root - INFO - lr: 1.9531e-06 gnorm: 125.50 [ 0:15:24<8 days, 4:53:46]
299
+ [titan] 2026-01-06 20:33:21,589 - root - INFO - step: 5 loss: 14.2679 memory: 71.94GiB(90.77%) tps: 1,584 tflops: 144.89 mfu: 46.44%
300
+ [titan] 2026-01-06 20:33:21,589 - root - INFO - lr: 2.3438e-06 gnorm: 123.50 [ 0:16:05<6 days, 20:31:01]
301
+ [titan] 2026-01-06 20:34:03,035 - root - INFO - step: 6 loss: 13.9921 memory: 71.94GiB(90.77%) tps: 1,581 tflops: 144.67 mfu: 46.37%
302
+ [titan] 2026-01-06 20:34:03,035 - root - INFO - lr: 2.7344e-06 gnorm: 117.50 [ 0:16:46<5 days, 22:56:08]
303
+ [titan] 2026-01-06 20:34:44,524 - root - INFO - step: 7 loss: 13.8102 memory: 71.94GiB(90.77%) tps: 1,580 tflops: 144.52 mfu: 46.32%
304
+ [titan] 2026-01-06 20:34:44,524 - root - INFO - lr: 3.1250e-06 gnorm: 112.50 [ 0:17:28<5 days, 7:31:21]
305
+ [titan] 2026-01-06 20:35:25,989 - root - INFO - step: 8 loss: 13.5609 memory: 71.94GiB(90.77%) tps: 1,581 tflops: 144.60 mfu: 46.35%
306
+ [titan] 2026-01-06 20:35:25,990 - root - INFO - lr: 3.5156e-06 gnorm: 106.50 [ 0:18:09<4 days, 19:57:26]
307
+ [titan] 2026-01-06 20:36:07,480 - root - INFO - step: 9 loss: 13.3683 memory: 71.94GiB(90.77%) tps: 1,580 tflops: 144.52 mfu: 46.32%
308
+ [titan] 2026-01-06 20:36:07,480 - root - INFO - lr: 3.9063e-06 gnorm: 101.00 [ 0:18:51<4 days, 10:57:42]
309
+ [titan] 2026-01-06 20:36:48,975 - root - INFO - step: 10 loss: 13.1018 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.50 mfu: 46.32%
310
+ [titan] 2026-01-06 20:36:48,975 - root - INFO - lr: 4.2969e-06 gnorm: 94.00 [ 0:19:32<4 days, 3:45:48]
311
+ [titan] 2026-01-06 20:37:30,471 - root - INFO - step: 11 loss: 12.5407 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.50 mfu: 46.31%
312
+ [titan] 2026-01-06 20:37:30,471 - root - INFO - lr: 4.6875e-06 gnorm: 82.00 [ 0:20:14<3 days, 21:52:19]
313
+ [titan] 2026-01-06 20:38:11,960 - root - INFO - step: 12 loss: 12.0106 memory: 71.94GiB(90.77%) tps: 1,580 tflops: 144.52 mfu: 46.32%
314
+ [titan] 2026-01-06 20:38:11,960 - root - INFO - lr: 5.0781e-06 gnorm: 71.50 [ 0:20:55<3 days, 16:57:36]
315
+ [titan] 2026-01-06 20:38:53,462 - root - INFO - step: 13 loss: 11.5957 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.48 mfu: 46.31%
316
+ [titan] 2026-01-06 20:38:53,463 - root - INFO - lr: 5.4687e-06 gnorm: 68.00 [ 0:21:37<3 days, 12:48:10]
317
+ [titan] 2026-01-06 20:39:34,955 - root - INFO - step: 14 loss: 11.2380 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.51 mfu: 46.32%
318
+ [titan] 2026-01-06 20:39:34,955 - root - INFO - lr: 5.8594e-06 gnorm: 63.25 [ 0:22:18<3 days, 9:14:14]
319
+ [titan] 2026-01-06 20:40:16,456 - root - INFO - step: 15 loss: 10.9153 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.48 mfu: 46.31%
320
+ [titan] 2026-01-06 20:40:16,457 - root - INFO - lr: 6.2500e-06 gnorm: 55.50 [ 0:23:00<3 days, 6:08:46]
321
+ [titan] 2026-01-06 20:40:57,973 - root - INFO - step: 16 loss: 10.6864 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.42 mfu: 46.29%
322
+ [titan] 2026-01-06 20:40:57,974 - root - INFO - lr: 6.6406e-06 gnorm: 57.00 [ 0:23:41<3 days, 3:26:26]
323
+ [titan] 2026-01-06 20:40:57,974 - root - INFO - Saving the checkpoint (or staging if async is enabled).
324
+ [titan] 2026-01-06 20:41:27,037 - root - INFO - [GC] GC collection invoked by checkpointer. 0.60 seconds.
325
+ [titan] 2026-01-06 20:41:27,037 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 29.06 seconds.
326
+ [titan] 2026-01-06 20:42:08,985 - root - INFO - step: 17 loss: 10.3828 memory: 71.94GiB(90.77%) tps: 923 tflops: 84.44 mfu: 27.06%
327
+ [titan] 2026-01-06 20:42:08,986 - root - INFO - lr: 7.0313e-06 gnorm: 42.50 [ 0:24:52<3 days, 2:31:28]
328
+ [titan] 2026-01-06 20:42:50,422 - root - INFO - step: 18 loss: 10.1659 memory: 71.94GiB(90.77%) tps: 1,582 tflops: 144.71 mfu: 46.38%
329
+ [titan] 2026-01-06 20:42:50,422 - root - INFO - lr: 7.4219e-06 gnorm: 32.50 [ 0:25:34<3 days, 0:18:50]
330
+ [titan] 2026-01-06 20:43:31,924 - root - INFO - step: 19 loss: 9.9749 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.47 mfu: 46.31%
331
+ [titan] 2026-01-06 20:43:31,924 - root - INFO - lr: 7.8125e-06 gnorm: 26.88 [ 0:26:15<2 days, 22:20:17]
332
+ [titan] 2026-01-06 20:44:13,451 - root - INFO - step: 20 loss: 9.8084 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.39 mfu: 46.28%
333
+ [titan] 2026-01-06 20:44:13,451 - root - INFO - lr: 8.2031e-06 gnorm: 25.62 [ 0:26:57<2 days, 20:33:34]
334
+ [titan] 2026-01-06 20:44:54,967 - root - INFO - step: 21 loss: 9.6201 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.43 mfu: 46.29%
335
+ [titan] 2026-01-06 20:44:54,968 - root - INFO - lr: 8.5938e-06 gnorm: 26.88 [ 0:27:38<2 days, 18:56:56]
336
+ [titan] 2026-01-06 20:45:36,491 - root - INFO - step: 22 loss: 9.4905 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.40 mfu: 46.28%
337
+ [titan] 2026-01-06 20:45:36,491 - root - INFO - lr: 8.9844e-06 gnorm: 25.50 [ 0:28:20<2 days, 17:29:02]
338
+ [titan] 2026-01-06 20:46:18,035 - root - INFO - step: 23 loss: 9.2526 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.33 mfu: 46.26%
339
+ [titan] 2026-01-06 20:46:18,035 - root - INFO - lr: 9.3750e-06 gnorm: 19.12 [ 0:29:01<2 days, 16:08:45]
340
+ [titan] 2026-01-06 20:46:59,563 - root - INFO - step: 24 loss: 9.0528 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.39 mfu: 46.28%
341
+ [titan] 2026-01-06 20:46:59,563 - root - INFO - lr: 9.7656e-06 gnorm: 17.00 [ 0:29:43<2 days, 14:55:05]
342
+ [titan] 2026-01-06 20:47:41,099 - root - INFO - step: 25 loss: 8.8601 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.36 mfu: 46.27%
343
+ [titan] 2026-01-06 20:47:41,099 - root - INFO - lr: 1.0156e-05 gnorm: 14.06 [ 0:30:25<2 days, 13:47:16]
344
+ [titan] 2026-01-06 20:48:22,630 - root - INFO - step: 26 loss: 8.7360 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.38 mfu: 46.27%
345
+ [titan] 2026-01-06 20:48:22,630 - root - INFO - lr: 1.0547e-05 gnorm: 15.44 [ 0:31:06<2 days, 12:44:36]
346
+ [titan] 2026-01-06 20:49:04,178 - root - INFO - step: 27 loss: 8.6182 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.31 mfu: 46.25%
347
+ [titan] 2026-01-06 20:49:04,178 - root - INFO - lr: 1.0937e-05 gnorm: 10.25 [ 0:31:48<2 days, 11:46:33]
348
+ [titan] 2026-01-06 20:49:45,725 - root - INFO - step: 28 loss: 8.5142 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.32 mfu: 46.26%
349
+ [titan] 2026-01-06 20:49:45,725 - root - INFO - lr: 1.1328e-05 gnorm: 9.00 [ 0:32:29<2 days, 10:52:36]
350
+ [titan] 2026-01-06 20:50:27,274 - root - INFO - step: 29 loss: 8.4770 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.31 mfu: 46.25%
351
+ [titan] 2026-01-06 20:50:27,274 - root - INFO - lr: 1.1719e-05 gnorm: 9.44 [ 0:33:11<2 days, 10:02:20]
352
+ [titan] 2026-01-06 20:51:08,813 - root - INFO - step: 30 loss: 8.3888 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.35 mfu: 46.27%
353
+ [titan] 2026-01-06 20:51:08,813 - root - INFO - lr: 1.2109e-05 gnorm: 7.06 [ 0:33:52<2 days, 9:15:21]
354
+ [titan] 2026-01-06 20:51:50,370 - root - INFO - step: 31 loss: 8.3098 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.29 mfu: 46.25%
355
+ [titan] 2026-01-06 20:51:50,370 - root - INFO - lr: 1.2500e-05 gnorm: 5.38 [ 0:34:34<2 days, 8:31:23]
356
+ [titan] 2026-01-06 20:52:31,910 - root - INFO - step: 32 loss: 8.2507 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.35 mfu: 46.26%
357
+ [titan] 2026-01-06 20:52:31,910 - root - INFO - lr: 1.2891e-05 gnorm: 6.97 [ 0:35:15<2 days, 7:50:05]
358
+ [titan] 2026-01-06 20:52:31,910 - root - INFO - Saving the checkpoint (or staging if async is enabled).
359
+ [titan] 2026-01-06 20:52:52,195 - root - INFO - [GC] GC collection invoked by checkpointer. 0.20 seconds.
360
+ [titan] 2026-01-06 20:52:52,195 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 20.29 seconds.
361
+ [titan] 2026-01-06 20:53:33,590 - root - INFO - step: 33 loss: 8.1782 memory: 71.94GiB(90.77%) tps: 1,063 tflops: 97.21 mfu: 31.16%
362
+ [titan] 2026-01-06 20:53:33,591 - root - INFO - lr: 1.3281e-05 gnorm: 4.94 [ 0:36:17<2 days, 7:42:10]
363
+ [titan] 2026-01-06 20:54:15,059 - root - INFO - step: 34 loss: 8.1399 memory: 71.94GiB(90.77%) tps: 1,580 tflops: 144.59 mfu: 46.34%
364
+ [titan] 2026-01-06 20:54:15,059 - root - INFO - lr: 1.3672e-05 gnorm: 4.62 [ 0:36:58<2 days, 7:04:33]
365
+ [titan] 2026-01-06 20:54:56,546 - root - INFO - step: 35 loss: 8.1046 memory: 71.94GiB(90.77%) tps: 1,580 tflops: 144.53 mfu: 46.32%
366
+ [titan] 2026-01-06 20:54:56,546 - root - INFO - lr: 1.4063e-05 gnorm: 4.69 [ 0:37:40<2 days, 6:29:05]
367
+ [titan] 2026-01-06 20:55:38,070 - root - INFO - step: 36 loss: 8.0122 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.40 mfu: 46.28%
368
+ [titan] 2026-01-06 20:55:38,070 - root - INFO - lr: 1.4453e-05 gnorm: 2.75 [ 0:38:22<2 days, 5:55:35]
369
+ [titan] 2026-01-06 20:56:19,603 - root - INFO - step: 37 loss: 8.0874 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.37 mfu: 46.27%
370
+ [titan] 2026-01-06 20:56:19,603 - root - INFO - lr: 1.4844e-05 gnorm: 4.84 [ 0:39:03<2 days, 5:23:53]
371
+ [titan] 2026-01-06 20:57:01,137 - root - INFO - step: 38 loss: 8.0173 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.37 mfu: 46.27%
372
+ [titan] 2026-01-06 20:57:01,137 - root - INFO - lr: 1.5234e-05 gnorm: 3.98 [ 0:39:45<2 days, 4:53:49]
373
+ [titan] 2026-01-06 20:57:42,670 - root - INFO - step: 39 loss: 8.0002 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.37 mfu: 46.27%
374
+ [titan] 2026-01-06 20:57:42,671 - root - INFO - lr: 1.5625e-05 gnorm: 3.81 [ 0:40:26<2 days, 4:25:15]
375
+ [titan] 2026-01-06 20:58:24,204 - root - INFO - step: 40 loss: 7.9606 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.37 mfu: 46.27%
376
+ [titan] 2026-01-06 20:58:24,204 - root - INFO - lr: 1.6016e-05 gnorm: 2.86 [ 0:41:08<2 days, 3:58:05]
377
+ [titan] 2026-01-06 20:59:05,739 - root - INFO - step: 41 loss: 7.9773 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.36 mfu: 46.27%
378
+ [titan] 2026-01-06 20:59:05,739 - root - INFO - lr: 1.6406e-05 gnorm: 3.56 [ 0:41:49<2 days, 3:32:12]
379
+ [titan] 2026-01-06 20:59:47,255 - root - INFO - step: 42 loss: 7.9890 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.43 mfu: 46.29%
380
+ [titan] 2026-01-06 20:59:47,256 - root - INFO - lr: 1.6797e-05 gnorm: 4.75 [ 0:42:31<2 days, 3:07:30]
381
+ [titan] 2026-01-06 21:00:28,788 - root - INFO - step: 43 loss: 7.9018 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.37 mfu: 46.27%
382
+ [titan] 2026-01-06 21:00:28,789 - root - INFO - lr: 1.7188e-05 gnorm: 3.48 [ 0:43:12<2 days, 2:43:56]
383
+ [titan] 2026-01-06 21:01:10,328 - root - INFO - step: 44 loss: 7.8441 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.35 mfu: 46.26%
384
+ [titan] 2026-01-06 21:01:10,328 - root - INFO - lr: 1.7578e-05 gnorm: 3.89 [ 0:43:54<2 days, 2:21:25]
385
+ [titan] 2026-01-06 21:01:51,869 - root - INFO - step: 45 loss: 7.8679 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.34 mfu: 46.26%
386
+ [titan] 2026-01-06 21:01:51,869 - root - INFO - lr: 1.7969e-05 gnorm: 6.41 [ 0:44:35<2 days, 1:59:52]
387
+ [titan] 2026-01-06 21:02:33,408 - root - INFO - step: 46 loss: 7.7830 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.35 mfu: 46.26%
388
+ [titan] 2026-01-06 21:02:33,408 - root - INFO - lr: 1.8359e-05 gnorm: 3.52 [ 0:45:17<2 days, 1:39:13]
389
+ [titan] 2026-01-06 21:03:14,961 - root - INFO - step: 47 loss: 7.8372 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.30 mfu: 46.25%
390
+ [titan] 2026-01-06 21:03:14,961 - root - INFO - lr: 1.8750e-05 gnorm: 2.22 [ 0:45:58<2 days, 1:19:27]
391
+ [titan] 2026-01-06 21:03:56,497 - root - INFO - step: 48 loss: 7.8147 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.36 mfu: 46.27%
392
+ [titan] 2026-01-06 21:03:56,497 - root - INFO - lr: 1.9141e-05 gnorm: 3.70 [ 0:46:40<2 days, 1:00:27]
393
+ [titan] 2026-01-06 21:03:56,497 - root - INFO - Saving the checkpoint (or staging if async is enabled).
394
+ [titan] 2026-01-06 21:04:16,632 - root - INFO - [GC] GC collection invoked by checkpointer. 0.24 seconds.
395
+ [titan] 2026-01-06 21:04:16,632 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 20.14 seconds.
396
+ [titan] 2026-01-06 21:04:57,970 - root - INFO - step: 49 loss: 7.6970 memory: 71.94GiB(90.77%) tps: 1,066 tflops: 97.54 mfu: 31.26%
397
+ [titan] 2026-01-06 21:04:57,970 - root - INFO - lr: 1.9531e-05 gnorm: 5.28 [ 0:47:41<2 days, 1:02:41]
398
+ [titan] 2026-01-06 21:04:57,979 - root - INFO - [GC] Peforming periodical GC collection. 0.01 seconds.
399
+ [titan] 2026-01-06 21:05:39,421 - root - INFO - step: 50 loss: 7.7536 memory: 71.94GiB(90.77%) tps: 1,581 tflops: 144.65 mfu: 46.36%
400
+ [titan] 2026-01-06 21:05:39,421 - root - INFO - lr: 1.9922e-05 gnorm: 4.06 [ 0:48:23<2 days, 0:44:38]
401
+ [titan] 2026-01-06 21:06:20,891 - root - INFO - step: 51 loss: 7.7578 memory: 71.94GiB(90.77%) tps: 1,580 tflops: 144.59 mfu: 46.34%
402
+ [titan] 2026-01-06 21:06:20,891 - root - INFO - lr: 2.0313e-05 gnorm: 5.03 [ 0:49:04<2 days, 0:27:17]
403
+ [titan] 2026-01-06 21:07:02,402 - root - INFO - step: 52 loss: 7.7586 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.45 mfu: 46.30%
404
+ [titan] 2026-01-06 21:07:02,402 - root - INFO - lr: 2.0703e-05 gnorm: 2.52 [ 0:49:46<2 days, 0:10:37]
405
+ [titan] 2026-01-06 21:07:43,930 - root - INFO - step: 53 loss: 7.7823 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.38 mfu: 46.28%
406
+ [titan] 2026-01-06 21:07:43,930 - root - INFO - lr: 2.1094e-05 gnorm: 11.69 [ 0:50:27<1 day, 23:54:33]
407
+ [titan] 2026-01-06 21:08:25,460 - root - INFO - step: 54 loss: 7.7454 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.38 mfu: 46.28%
408
+ [titan] 2026-01-06 21:08:25,460 - root - INFO - lr: 2.1484e-05 gnorm: 10.25 [ 0:51:09<1 day, 23:39:04]
409
+ [titan] 2026-01-06 21:09:07,002 - root - INFO - step: 55 loss: 7.6959 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.34 mfu: 46.26%
410
+ [titan] 2026-01-06 21:09:07,002 - root - INFO - lr: 2.1875e-05 gnorm: 3.77 [ 0:51:50<1 day, 23:24:08]
411
+ [titan] 2026-01-06 21:09:48,536 - root - INFO - step: 56 loss: 7.7100 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.37 mfu: 46.27%
412
+ [titan] 2026-01-06 21:09:48,536 - root - INFO - lr: 2.2266e-05 gnorm: 5.50 [ 0:52:32<1 day, 23:09:42]
413
+ [titan] 2026-01-06 21:10:30,084 - root - INFO - step: 57 loss: 7.6427 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.32 mfu: 46.26%
414
+ [titan] 2026-01-06 21:10:30,084 - root - INFO - lr: 2.2656e-05 gnorm: 3.45 [ 0:53:14<1 day, 22:55:46]
415
+ [titan] 2026-01-06 21:11:11,627 - root - INFO - step: 58 loss: 7.7081 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.33 mfu: 46.26%
416
+ [titan] 2026-01-06 21:11:11,628 - root - INFO - lr: 2.3047e-05 gnorm: 7.88 [ 0:53:55<1 day, 22:42:17]
417
+ [titan] 2026-01-06 21:11:53,169 - root - INFO - step: 59 loss: 7.6955 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.34 mfu: 46.26%
418
+ [titan] 2026-01-06 21:11:53,169 - root - INFO - lr: 2.3438e-05 gnorm: 7.16 [ 0:54:37<1 day, 22:29:14]
419
+ [titan] 2026-01-06 21:12:34,708 - root - INFO - step: 60 loss: 7.6458 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.35 mfu: 46.27%
420
+ [titan] 2026-01-06 21:12:34,708 - root - INFO - lr: 2.3828e-05 gnorm: 3.22 [ 0:55:18<1 day, 22:16:35]
421
+ [titan] 2026-01-06 21:13:16,244 - root - INFO - step: 61 loss: 7.6709 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.36 mfu: 46.27%
422
+ [titan] 2026-01-06 21:13:16,244 - root - INFO - lr: 2.4219e-05 gnorm: 7.56 [ 0:56:00<1 day, 22:04:20]
423
+ [titan] 2026-01-06 21:13:57,793 - root - INFO - step: 62 loss: 7.6777 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.31 mfu: 46.25%
424
+ [titan] 2026-01-06 21:13:57,793 - root - INFO - lr: 2.4609e-05 gnorm: 5.00 [ 0:56:41<1 day, 21:52:28]
425
+ [titan] 2026-01-06 21:14:39,339 - root - INFO - step: 63 loss: 7.6421 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.32 mfu: 46.26%
426
+ [titan] 2026-01-06 21:14:39,339 - root - INFO - lr: 2.5000e-05 gnorm: 6.81 [ 0:57:23<1 day, 21:40:56]
427
+ [titan] 2026-01-06 21:15:20,872 - root - INFO - step: 64 loss: 7.6401 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.37 mfu: 46.27%
428
+ [titan] 2026-01-06 21:15:20,872 - root - INFO - lr: 2.5391e-05 gnorm: 6.72 [ 0:58:04<1 day, 21:29:45]
429
+ [titan] 2026-01-06 21:15:20,872 - root - INFO - Saving the checkpoint (or staging if async is enabled).
430
+ [titan] 2026-01-06 21:15:41,925 - root - INFO - [GC] GC collection invoked by checkpointer. 0.17 seconds.
431
+ [titan] 2026-01-06 21:15:41,925 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 21.05 seconds.
432
+ [titan] 2026-01-06 21:16:23,249 - root - INFO - step: 65 loss: 7.6475 memory: 71.94GiB(90.77%) tps: 1,051 tflops: 96.13 mfu: 30.81%
433
+ [titan] 2026-01-06 21:16:23,249 - root - INFO - lr: 2.5781e-05 gnorm: 5.00 [ 0:59:07<1 day, 21:34:57]
434
+ [titan] 2026-01-06 21:17:04,689 - root - INFO - step: 66 loss: 7.7008 memory: 71.94GiB(90.77%) tps: 1,581 tflops: 144.69 mfu: 46.38%
435
+ [titan] 2026-01-06 21:17:04,689 - root - INFO - lr: 2.6172e-05 gnorm: 9.69 [ 0:59:48<1 day, 21:24:05]
436
+ [titan] 2026-01-06 21:17:46,153 - root - INFO - step: 67 loss: 7.6772 memory: 71.94GiB(90.77%) tps: 1,581 tflops: 144.61 mfu: 46.35%
437
+ [titan] 2026-01-06 21:17:46,153 - root - INFO - lr: 2.6563e-05 gnorm: 8.06 [ 1:00:30<1 day, 21:13:31]
438
+ [titan] 2026-01-06 21:18:27,650 - root - INFO - step: 68 loss: 7.6251 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.49 mfu: 46.31%
439
+ [titan] 2026-01-06 21:18:27,651 - root - INFO - lr: 2.6953e-05 gnorm: 7.88 [ 1:01:11<1 day, 21:03:17]
440
+ [titan] 2026-01-06 21:19:09,166 - root - INFO - step: 69 loss: 7.6183 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.43 mfu: 46.29%
441
+ [titan] 2026-01-06 21:19:09,166 - root - INFO - lr: 2.7344e-05 gnorm: 4.00 [ 1:01:53<1 day, 20:53:20]
442
+ [titan] 2026-01-06 21:19:50,686 - root - INFO - step: 70 loss: 7.6535 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.41 mfu: 46.29%
443
+ [titan] 2026-01-06 21:19:50,686 - root - INFO - lr: 2.7734e-05 gnorm: 17.75 [ 1:02:34<1 day, 20:43:39]
444
+ [titan] 2026-01-06 21:20:32,220 - root - INFO - step: 71 loss: 7.6713 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.37 mfu: 46.27%
445
+ [titan] 2026-01-06 21:20:32,221 - root - INFO - lr: 2.8125e-05 gnorm: 15.69 [ 1:03:16<1 day, 20:34:13]
446
+ [titan] 2026-01-06 21:21:13,759 - root - INFO - step: 72 loss: 7.5969 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.35 mfu: 46.27%
447
+ [titan] 2026-01-06 21:21:13,759 - root - INFO - lr: 2.8516e-05 gnorm: 5.00 [ 1:03:57<1 day, 20:25:03]
448
+ [titan] 2026-01-06 21:21:55,296 - root - INFO - step: 73 loss: 7.6514 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.35 mfu: 46.27%
449
+ [titan] 2026-01-06 21:21:55,296 - root - INFO - lr: 2.8906e-05 gnorm: 7.84 [ 1:04:39<1 day, 20:16:06]
450
+ [titan] 2026-01-06 21:22:36,834 - root - INFO - step: 74 loss: 7.6118 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.35 mfu: 46.27%
451
+ [titan] 2026-01-06 21:22:36,834 - root - INFO - lr: 2.9297e-05 gnorm: 5.53 [ 1:05:20<1 day, 20:07:23]
452
+ [titan] 2026-01-06 21:23:18,373 - root - INFO - step: 75 loss: 7.6545 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.35 mfu: 46.26%
453
+ [titan] 2026-01-06 21:23:18,373 - root - INFO - lr: 2.9687e-05 gnorm: 14.88 [ 1:06:02<1 day, 19:58:53]
454
+ [titan] 2026-01-06 21:23:59,908 - root - INFO - step: 76 loss: 7.6091 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.36 mfu: 46.27%
455
+ [titan] 2026-01-06 21:23:59,909 - root - INFO - lr: 3.0078e-05 gnorm: 15.25 [ 1:06:43<1 day, 19:50:35]
456
+ [titan] 2026-01-06 21:24:41,441 - root - INFO - step: 77 loss: 7.5815 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.37 mfu: 46.27%
457
+ [titan] 2026-01-06 21:24:41,442 - root - INFO - lr: 3.0469e-05 gnorm: 4.84 [ 1:07:25<1 day, 19:42:28]
458
+ [titan] 2026-01-06 21:25:22,983 - root - INFO - step: 78 loss: 7.6119 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.34 mfu: 46.26%
459
+ [titan] 2026-01-06 21:25:22,983 - root - INFO - lr: 3.0859e-05 gnorm: 9.06 [ 1:08:06<1 day, 19:34:34]
460
+ [titan] 2026-01-06 21:26:04,516 - root - INFO - step: 79 loss: 7.6418 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.37 mfu: 46.27%
461
+ [titan] 2026-01-06 21:26:04,516 - root - INFO - lr: 3.1250e-05 gnorm: 8.25 [ 1:08:48<1 day, 19:26:50]
462
+ [titan] 2026-01-06 21:26:46,049 - root - INFO - step: 80 loss: 7.5575 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.37 mfu: 46.27%
463
+ [titan] 2026-01-06 21:26:46,050 - root - INFO - lr: 3.1641e-05 gnorm: 6.97 [ 1:09:29<1 day, 19:19:16]
464
+ [titan] 2026-01-06 21:26:46,050 - root - INFO - Saving the checkpoint (or staging if async is enabled).
465
+ [titan] 2026-01-06 21:27:08,316 - root - INFO - [GC] GC collection invoked by checkpointer. 0.18 seconds.
466
+ [titan] 2026-01-06 21:27:08,317 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 22.27 seconds.
467
+ [titan] 2026-01-06 21:27:49,686 - root - INFO - step: 81 loss: 7.6005 memory: 71.94GiB(90.77%) tps: 1,030 tflops: 94.22 mfu: 30.20%
468
+ [titan] 2026-01-06 21:27:49,686 - root - INFO - lr: 3.2031e-05 gnorm: 7.19 [ 1:10:33<1 day, 19:25:29]
469
+ [titan] 2026-01-06 21:28:31,108 - root - INFO - step: 82 loss: 7.5774 memory: 71.94GiB(90.77%) tps: 1,582 tflops: 144.76 mfu: 46.40%
470
+ [titan] 2026-01-06 21:28:31,108 - root - INFO - lr: 3.2422e-05 gnorm: 5.62 [ 1:11:15<1 day, 19:18:02]
471
+ [titan] 2026-01-06 21:29:12,555 - root - INFO - step: 83 loss: 7.6207 memory: 71.94GiB(90.77%) tps: 1,581 tflops: 144.67 mfu: 46.37%
472
+ [titan] 2026-01-06 21:29:12,555 - root - INFO - lr: 3.2813e-05 gnorm: 4.69 [ 1:11:56<1 day, 19:10:45]
473
+ [titan] 2026-01-06 21:29:54,023 - root - INFO - step: 84 loss: 7.5734 memory: 71.94GiB(90.77%) tps: 1,580 tflops: 144.59 mfu: 46.34%
474
+ [titan] 2026-01-06 21:29:54,024 - root - INFO - lr: 3.3203e-05 gnorm: 10.75 [ 1:12:37<1 day, 19:03:38]
475
+ [titan] 2026-01-06 21:30:35,519 - root - INFO - step: 85 loss: 7.5241 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.50 mfu: 46.31%
476
+ [titan] 2026-01-06 21:30:35,520 - root - INFO - lr: 3.3594e-05 gnorm: 8.69 [ 1:13:19<1 day, 18:56:41]
477
+ [titan] 2026-01-06 21:31:17,030 - root - INFO - step: 86 loss: 7.5827 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.45 mfu: 46.30%
478
+ [titan] 2026-01-06 21:31:17,030 - root - INFO - lr: 3.3984e-05 gnorm: 7.22 [ 1:14:00<1 day, 18:49:53]
479
+ [titan] 2026-01-06 21:31:58,543 - root - INFO - step: 87 loss: 7.5505 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.44 mfu: 46.29%
480
+ [titan] 2026-01-06 21:31:58,543 - root - INFO - lr: 3.4375e-05 gnorm: 7.91 [ 1:14:42<1 day, 18:43:14]
481
+ [titan] 2026-01-06 21:32:40,071 - root - INFO - step: 88 loss: 7.5143 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.39 mfu: 46.28%
482
+ [titan] 2026-01-06 21:32:40,071 - root - INFO - lr: 3.4766e-05 gnorm: 8.00 [ 1:15:23<1 day, 18:36:44]
483
+ [titan] 2026-01-06 21:33:21,599 - root - INFO - step: 89 loss: 7.5199 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.39 mfu: 46.28%
484
+ [titan] 2026-01-06 21:33:21,599 - root - INFO - lr: 3.5156e-05 gnorm: 8.62 [ 1:16:05<1 day, 18:30:21]
485
+ [titan] 2026-01-06 21:34:03,122 - root - INFO - step: 90 loss: 7.4785 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.40 mfu: 46.28%
486
+ [titan] 2026-01-06 21:34:03,122 - root - INFO - lr: 3.5547e-05 gnorm: 8.12 [ 1:16:47<1 day, 18:24:06]
487
+ [titan] 2026-01-06 21:34:44,655 - root - INFO - step: 91 loss: 7.5003 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.37 mfu: 46.27%
488
+ [titan] 2026-01-06 21:34:44,655 - root - INFO - lr: 3.5937e-05 gnorm: 6.97 [ 1:17:28<1 day, 18:17:58]
489
+ [titan] 2026-01-06 21:35:26,183 - root - INFO - step: 92 loss: 7.5113 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.39 mfu: 46.28%
490
+ [titan] 2026-01-06 21:35:26,183 - root - INFO - lr: 3.6328e-05 gnorm: 10.19 [ 1:18:10<1 day, 18:11:58]
491
+ [titan] 2026-01-06 21:36:07,712 - root - INFO - step: 93 loss: 7.4875 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.38 mfu: 46.28%
492
+ [titan] 2026-01-06 21:36:07,712 - root - INFO - lr: 3.6719e-05 gnorm: 4.59 [ 1:18:51<1 day, 18:06:04]
493
+ [titan] 2026-01-06 21:36:49,202 - root - INFO - step: 94 loss: 7.8691 memory: 71.94GiB(90.77%) tps: 1,580 tflops: 144.52 mfu: 46.32%
494
+ [titan] 2026-01-06 21:36:49,202 - root - INFO - lr: 3.7109e-05 gnorm: 86.50 [ 1:19:33<1 day, 18:00:16]
495
+ [titan] 2026-01-06 21:37:30,710 - root - INFO - step: 95 loss: 7.7993 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.45 mfu: 46.30%
496
+ [titan] 2026-01-06 21:37:30,710 - root - INFO - lr: 3.7500e-05 gnorm: 62.50 [ 1:20:14<1 day, 17:54:35]
497
+ [titan] 2026-01-06 21:38:12,247 - root - INFO - step: 96 loss: 7.6230 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.35 mfu: 46.27%
498
+ [titan] 2026-01-06 21:38:12,248 - root - INFO - lr: 3.7891e-05 gnorm: 17.38 [ 1:20:56<1 day, 17:49:01]
499
+ [titan] 2026-01-06 21:38:12,248 - root - INFO - Saving the checkpoint (or staging if async is enabled).
500
+ [titan] 2026-01-06 21:38:32,921 - root - INFO - [GC] GC collection invoked by checkpointer. 0.18 seconds.
501
+ [titan] 2026-01-06 21:38:32,922 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 20.67 seconds.
502
+ [titan] 2026-01-06 21:39:14,269 - root - INFO - step: 97 loss: 7.5778 memory: 71.94GiB(90.77%) tps: 1,057 tflops: 96.68 mfu: 30.99%
503
+ [titan] 2026-01-06 21:39:14,269 - root - INFO - lr: 3.8281e-05 gnorm: 17.75 [ 1:21:58<1 day, 17:54:01]
504
+ [titan] 2026-01-06 21:39:55,690 - root - INFO - step: 98 loss: 7.5438 memory: 71.94GiB(90.77%) tps: 1,582 tflops: 144.76 mfu: 46.40%
505
+ [titan] 2026-01-06 21:39:55,690 - root - INFO - lr: 3.8672e-05 gnorm: 11.75 [ 1:22:39<1 day, 17:48:28]
506
+ [titan] 2026-01-06 21:40:37,179 - root - INFO - step: 99 loss: 7.5091 memory: 71.94GiB(90.77%) tps: 1,580 tflops: 144.52 mfu: 46.32%
507
+ [titan] 2026-01-06 21:40:37,180 - root - INFO - lr: 3.9063e-05 gnorm: 7.81 [ 1:23:21<1 day, 17:43:04]
508
+ [titan] 2026-01-06 21:40:37,200 - root - INFO - [GC] Peforming periodical GC collection. 0.02 seconds.
509
+ [titan] 2026-01-06 21:41:18,706 - root - INFO - step: 100 loss: 7.4961 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.39 mfu: 46.28%
510
+ [titan] 2026-01-06 21:41:18,706 - root - INFO - lr: 3.9453e-05 gnorm: 7.59 [ 1:24:02<1 day, 17:37:46]
511
+ [titan] 2026-01-06 21:42:00,228 - root - INFO - step: 101 loss: 7.4848 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.41 mfu: 46.28%
512
+ [titan] 2026-01-06 21:42:00,228 - root - INFO - lr: 3.9844e-05 gnorm: 5.97 [ 1:24:44<1 day, 17:32:34]
513
+ [titan] 2026-01-06 21:42:41,739 - root - INFO - step: 102 loss: 7.5118 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.45 mfu: 46.30%
514
+ [titan] 2026-01-06 21:42:41,739 - root - INFO - lr: 4.0234e-05 gnorm: 8.06 [ 1:25:25<1 day, 17:27:26]
515
+ [titan] 2026-01-06 21:43:23,264 - root - INFO - step: 103 loss: 7.4788 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.39 mfu: 46.28%
516
+ [titan] 2026-01-06 21:43:23,265 - root - INFO - lr: 4.0625e-05 gnorm: 10.06 [ 1:26:07<1 day, 17:22:25]
517
+ [titan] 2026-01-06 21:44:04,785 - root - INFO - step: 104 loss: 7.4560 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.41 mfu: 46.29%
518
+ [titan] 2026-01-06 21:44:04,786 - root - INFO - lr: 4.1016e-05 gnorm: 9.50 [ 1:26:48<1 day, 17:17:28]
519
+ [titan] 2026-01-06 21:44:46,319 - root - INFO - step: 105 loss: 7.4534 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.37 mfu: 46.27%
520
+ [titan] 2026-01-06 21:44:46,319 - root - INFO - lr: 4.1406e-05 gnorm: 8.44 [ 1:27:30<1 day, 17:12:36]
521
+ [titan] 2026-01-06 21:45:27,838 - root - INFO - step: 106 loss: 7.4770 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.42 mfu: 46.29%
522
+ [titan] 2026-01-06 21:45:27,838 - root - INFO - lr: 4.1797e-05 gnorm: 10.56 [ 1:28:11<1 day, 17:07:49]
523
+ [titan] 2026-01-06 21:46:09,374 - root - INFO - step: 107 loss: 7.4382 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.36 mfu: 46.27%
524
+ [titan] 2026-01-06 21:46:09,374 - root - INFO - lr: 4.2188e-05 gnorm: 13.69 [ 1:28:53<1 day, 17:03:06]
525
+ [titan] 2026-01-06 21:46:50,902 - root - INFO - step: 108 loss: 7.4561 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.39 mfu: 46.28%
526
+ [titan] 2026-01-06 21:46:50,902 - root - INFO - lr: 4.2578e-05 gnorm: 8.69 [ 1:29:34<1 day, 16:58:28]
527
+ [titan] 2026-01-06 21:47:32,443 - root - INFO - step: 109 loss: 7.3967 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.34 mfu: 46.26%
528
+ [titan] 2026-01-06 21:47:32,443 - root - INFO - lr: 4.2969e-05 gnorm: 7.31 [ 1:30:16<1 day, 16:53:55]
529
+ [titan] 2026-01-06 21:48:13,976 - root - INFO - step: 110 loss: 7.4334 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.37 mfu: 46.27%
530
+ [titan] 2026-01-06 21:48:13,976 - root - INFO - lr: 4.3359e-05 gnorm: 25.38 [ 1:30:57<1 day, 16:49:25]
531
+ [titan] 2026-01-06 21:48:55,511 - root - INFO - step: 111 loss: 7.4360 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.36 mfu: 46.27%
532
+ [titan] 2026-01-06 21:48:55,511 - root - INFO - lr: 4.3750e-05 gnorm: 10.44 [ 1:31:39<1 day, 16:45:00]
533
+ [titan] 2026-01-06 21:49:37,059 - root - INFO - step: 112 loss: 7.5123 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.32 mfu: 46.25%
534
+ [titan] 2026-01-06 21:49:37,059 - root - INFO - lr: 4.4141e-05 gnorm: 16.88 [ 1:32:20<1 day, 16:40:39]
535
+ [titan] 2026-01-06 21:49:37,059 - root - INFO - Saving the checkpoint (or staging if async is enabled).
536
+ [titan] 2026-01-06 21:49:59,578 - root - INFO - [GC] GC collection invoked by checkpointer. 0.14 seconds.
537
+ [titan] 2026-01-06 21:49:59,578 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 22.52 seconds.
538
+ [titan] 2026-01-06 21:50:40,891 - root - INFO - step: 113 loss: 7.4803 memory: 71.94GiB(90.77%) tps: 1,027 tflops: 93.93 mfu: 30.11%
539
+ [titan] 2026-01-06 21:50:40,892 - root - INFO - lr: 4.4531e-05 gnorm: 13.06 [ 1:33:24<1 day, 16:46:06]
540
+ [titan] 2026-01-06 21:51:22,305 - root - INFO - step: 114 loss: 7.4859 memory: 71.94GiB(90.77%) tps: 1,582 tflops: 144.79 mfu: 46.41%
541
+ [titan] 2026-01-06 21:51:22,305 - root - INFO - lr: 4.4922e-05 gnorm: 16.50 [ 1:34:06<1 day, 16:41:44]
542
+ [titan] 2026-01-06 21:52:03,747 - root - INFO - step: 115 loss: 7.4151 memory: 71.94GiB(90.77%) tps: 1,581 tflops: 144.68 mfu: 46.37%
543
+ [titan] 2026-01-06 21:52:03,748 - root - INFO - lr: 4.5313e-05 gnorm: 13.94 [ 1:34:47<1 day, 16:37:26]
544
+ [titan] 2026-01-06 21:52:45,252 - root - INFO - step: 116 loss: 7.3814 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.47 mfu: 46.30%
545
+ [titan] 2026-01-06 21:52:45,252 - root - INFO - lr: 4.5703e-05 gnorm: 11.69 [ 1:35:29<1 day, 16:33:14]
546
+ [titan] 2026-01-06 21:53:26,759 - root - INFO - step: 117 loss: 7.4033 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.46 mfu: 46.30%
547
+ [titan] 2026-01-06 21:53:26,760 - root - INFO - lr: 4.6094e-05 gnorm: 9.31 [ 1:36:10<1 day, 16:29:06]
548
+ [titan] 2026-01-06 21:54:08,279 - root - INFO - step: 118 loss: 7.4721 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.42 mfu: 46.29%
549
+ [titan] 2026-01-06 21:54:08,279 - root - INFO - lr: 4.6484e-05 gnorm: 20.88 [ 1:36:52<1 day, 16:25:01]
550
+ [titan] 2026-01-06 21:54:49,813 - root - INFO - step: 119 loss: 7.4258 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.37 mfu: 46.27%
551
+ [titan] 2026-01-06 21:54:49,813 - root - INFO - lr: 4.6875e-05 gnorm: 16.62 [ 1:37:33<1 day, 16:21:00]
552
+ [titan] 2026-01-06 21:55:31,360 - root - INFO - step: 120 loss: 7.3951 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.32 mfu: 46.26%
553
+ [titan] 2026-01-06 21:55:31,360 - root - INFO - lr: 4.7266e-05 gnorm: 11.38 [ 1:38:15<1 day, 16:17:03]
554
+ [titan] 2026-01-06 21:56:12,904 - root - INFO - step: 121 loss: 7.3984 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.33 mfu: 46.26%
555
+ [titan] 2026-01-06 21:56:12,904 - root - INFO - lr: 4.7656e-05 gnorm: 10.19 [ 1:38:56<1 day, 16:13:09]
556
+ [titan] 2026-01-06 21:56:54,444 - root - INFO - step: 122 loss: 7.5098 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.34 mfu: 46.26%
557
+ [titan] 2026-01-06 21:56:54,444 - root - INFO - lr: 4.8047e-05 gnorm: 19.38 [ 1:39:38<1 day, 16:09:18]
558
+ [titan] 2026-01-06 21:57:35,983 - root - INFO - step: 123 loss: 7.4071 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.35 mfu: 46.27%
559
+ [titan] 2026-01-06 21:57:35,983 - root - INFO - lr: 4.8438e-05 gnorm: 13.25 [ 1:40:19<1 day, 16:05:30]
560
+ [titan] 2026-01-06 21:58:17,525 - root - INFO - step: 124 loss: 7.4271 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.34 mfu: 46.26%
561
+ [titan] 2026-01-06 21:58:17,525 - root - INFO - lr: 4.8828e-05 gnorm: 11.88 [ 1:41:01<1 day, 16:01:45]
562
+ [titan] 2026-01-06 21:58:59,075 - root - INFO - step: 125 loss: 7.3603 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.31 mfu: 46.25%
563
+ [titan] 2026-01-06 21:58:59,075 - root - INFO - lr: 4.9219e-05 gnorm: 11.50 [ 1:41:42<1 day, 15:58:03]
564
+ [titan] 2026-01-06 21:59:40,619 - root - INFO - step: 126 loss: 7.3625 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.33 mfu: 46.26%
565
+ [titan] 2026-01-06 21:59:40,619 - root - INFO - lr: 4.9609e-05 gnorm: 9.88 [ 1:42:24<1 day, 15:54:24]
566
+ [titan] 2026-01-06 22:00:22,155 - root - INFO - step: 127 loss: 7.3691 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.36 mfu: 46.27%
567
+ [titan] 2026-01-06 22:00:22,156 - root - INFO - lr: 5.0000e-05 gnorm: 11.88 [ 1:43:06<1 day, 15:50:48]
568
+ [titan] 2026-01-06 22:01:03,694 - root - INFO - step: 128 loss: 7.3331 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.35 mfu: 46.27%
569
+ [titan] 2026-01-06 22:01:03,694 - root - INFO - lr: 5.0391e-05 gnorm: 11.56 [ 1:43:47<1 day, 15:47:14]
570
+ [titan] 2026-01-06 22:01:03,694 - root - INFO - Saving the checkpoint (or staging if async is enabled).
571
+ [titan] 2026-01-06 22:01:24,082 - root - INFO - [GC] GC collection invoked by checkpointer. 0.21 seconds.
572
+ [titan] 2026-01-06 22:01:24,082 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 20.39 seconds.
573
+ [titan] 2026-01-06 22:02:05,454 - root - INFO - step: 129 loss: 7.2878 memory: 71.94GiB(90.77%) tps: 1,061 tflops: 97.09 mfu: 31.12%
574
+ [titan] 2026-01-06 22:02:05,454 - root - INFO - lr: 5.0781e-05 gnorm: 6.16 [ 1:44:49<1 day, 15:51:25]
575
+ [titan] 2026-01-06 22:02:46,875 - root - INFO - step: 130 loss: 7.7017 memory: 71.94GiB(90.77%) tps: 1,582 tflops: 144.76 mfu: 46.40%
576
+ [titan] 2026-01-06 22:02:46,876 - root - INFO - lr: 5.1172e-05 gnorm: 70.00 [ 1:45:30<1 day, 15:47:50]
577
+ [titan] 2026-01-06 22:03:28,339 - root - INFO - step: 131 loss: 7.5220 memory: 71.94GiB(90.77%) tps: 1,581 tflops: 144.61 mfu: 46.35%
578
+ [titan] 2026-01-06 22:03:28,339 - root - INFO - lr: 5.1562e-05 gnorm: 44.75 [ 1:46:12<1 day, 15:44:19]
579
+ [titan] 2026-01-06 22:04:09,859 - root - INFO - step: 132 loss: 7.4566 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.42 mfu: 46.29%
580
+ [titan] 2026-01-06 22:04:09,859 - root - INFO - lr: 5.1953e-05 gnorm: 13.50 [ 1:46:53<1 day, 15:40:51]
581
+ [titan] 2026-01-06 22:04:51,387 - root - INFO - step: 133 loss: 7.4026 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.39 mfu: 46.28%
582
+ [titan] 2026-01-06 22:04:51,387 - root - INFO - lr: 5.2344e-05 gnorm: 10.12 [ 1:47:35<1 day, 15:37:27]
583
+ [titan] 2026-01-06 22:05:32,919 - root - INFO - step: 134 loss: 7.4092 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.37 mfu: 46.27%
584
+ [titan] 2026-01-06 22:05:32,920 - root - INFO - lr: 5.2734e-05 gnorm: 14.88 [ 1:48:16<1 day, 15:34:05]
585
+ [titan] 2026-01-06 22:06:14,471 - root - INFO - step: 135 loss: 7.3827 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.30 mfu: 46.25%
586
+ [titan] 2026-01-06 22:06:14,471 - root - INFO - lr: 5.3125e-05 gnorm: 18.88 [ 1:48:58<1 day, 15:30:45]
587
+ [titan] 2026-01-06 22:06:56,027 - root - INFO - step: 136 loss: 7.4021 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.29 mfu: 46.25%
588
+ [titan] 2026-01-06 22:06:56,027 - root - INFO - lr: 5.3516e-05 gnorm: 12.81 [ 1:49:39<1 day, 15:27:28]
589
+ [titan] 2026-01-06 22:07:37,581 - root - INFO - step: 137 loss: 7.4064 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.30 mfu: 46.25%
590
+ [titan] 2026-01-06 22:07:37,581 - root - INFO - lr: 5.3906e-05 gnorm: 7.19 [ 1:50:21<1 day, 15:24:14]
591
+ [titan] 2026-01-06 22:08:19,129 - root - INFO - step: 138 loss: 7.4774 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.32 mfu: 46.26%
592
+ [titan] 2026-01-06 22:08:19,129 - root - INFO - lr: 5.4297e-05 gnorm: 22.62 [ 1:51:03<1 day, 15:21:01]
593
+ [titan] 2026-01-06 22:09:00,688 - root - INFO - step: 139 loss: 7.4281 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.28 mfu: 46.24%
594
+ [titan] 2026-01-06 22:09:00,688 - root - INFO - lr: 5.4688e-05 gnorm: 11.00 [ 1:51:44<1 day, 15:17:51]
595
+ [titan] 2026-01-06 22:09:42,228 - root - INFO - step: 140 loss: 7.5633 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.34 mfu: 46.26%
596
+ [titan] 2026-01-06 22:09:42,228 - root - INFO - lr: 5.5078e-05 gnorm: 19.75 [ 1:52:26<1 day, 15:14:43]
597
+ [titan] 2026-01-06 22:10:23,790 - root - INFO - step: 141 loss: 7.5423 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.27 mfu: 46.24%
598
+ [titan] 2026-01-06 22:10:23,790 - root - INFO - lr: 5.5469e-05 gnorm: 17.25 [ 1:53:07<1 day, 15:11:37]
599
+ [titan] 2026-01-06 22:11:05,349 - root - INFO - step: 142 loss: 7.4047 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.28 mfu: 46.24%
600
+ [titan] 2026-01-06 22:11:05,349 - root - INFO - lr: 5.5859e-05 gnorm: 9.94 [ 1:53:49<1 day, 15:08:33]
601
+ [titan] 2026-01-06 22:11:46,904 - root - INFO - step: 143 loss: 7.5261 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.29 mfu: 46.25%
602
+ [titan] 2026-01-06 22:11:46,905 - root - INFO - lr: 5.6250e-05 gnorm: 25.75 [ 1:54:30<1 day, 15:05:31]
603
+ [titan] 2026-01-06 22:12:28,461 - root - INFO - step: 144 loss: 7.4217 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.29 mfu: 46.25%
604
+ [titan] 2026-01-06 22:12:28,461 - root - INFO - lr: 5.6641e-05 gnorm: 18.00 [ 1:55:12<1 day, 15:02:31]
605
+ [titan] 2026-01-06 22:12:28,461 - root - INFO - Saving the checkpoint (or staging if async is enabled).
606
+ [titan] 2026-01-06 22:12:50,164 - root - INFO - [GC] GC collection invoked by checkpointer. 0.18 seconds.
607
+ [titan] 2026-01-06 22:12:50,164 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 21.70 seconds.
608
+ [titan] 2026-01-06 22:13:31,510 - root - INFO - step: 145 loss: 7.3958 memory: 71.94GiB(90.77%) tps: 1,039 tflops: 95.10 mfu: 30.48%
609
+ [titan] 2026-01-06 22:13:31,510 - root - INFO - lr: 5.7031e-05 gnorm: 11.69 [ 1:56:15<1 day, 15:06:46]
610
+ [titan] 2026-01-06 22:14:12,944 - root - INFO - step: 146 loss: 7.4073 memory: 71.94GiB(90.77%) tps: 1,582 tflops: 144.71 mfu: 46.38%
611
+ [titan] 2026-01-06 22:14:12,944 - root - INFO - lr: 5.7422e-05 gnorm: 11.25 [ 1:56:56<1 day, 15:03:45]
612
+ [titan] 2026-01-06 22:14:54,370 - root - INFO - step: 147 loss: 7.3301 memory: 71.94GiB(90.77%) tps: 1,582 tflops: 144.74 mfu: 46.39%
613
+ [titan] 2026-01-06 22:14:54,371 - root - INFO - lr: 5.7813e-05 gnorm: 7.34 [ 1:57:38<1 day, 15:00:45]
614
+ [titan] 2026-01-06 22:15:35,825 - root - INFO - step: 148 loss: 7.3624 memory: 71.94GiB(90.77%) tps: 1,581 tflops: 144.64 mfu: 46.36%
615
+ [titan] 2026-01-06 22:15:35,825 - root - INFO - lr: 5.8203e-05 gnorm: 17.38 [ 1:58:19<1 day, 14:57:47]
616
+ [titan] 2026-01-06 22:16:17,356 - root - INFO - step: 149 loss: 7.2913 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.37 mfu: 46.27%
617
+ [titan] 2026-01-06 22:16:17,357 - root - INFO - lr: 5.8594e-05 gnorm: 3.80 [ 1:59:01<1 day, 14:54:53]
618
+ [titan] 2026-01-06 22:16:17,386 - root - INFO - [GC] Peforming periodical GC collection. 0.03 seconds.
619
+ [titan] 2026-01-06 22:16:58,923 - root - INFO - step: 150 loss: 7.3146 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.25 mfu: 46.23%
620
+ [titan] 2026-01-06 22:16:58,923 - root - INFO - lr: 5.8984e-05 gnorm: 7.06 [ 1:59:42<1 day, 14:52:01]
logs/none_4cvjdbqa/attempt_0/5/stdout.log ADDED
File without changes
logs/none_4cvjdbqa/attempt_0/6/stderr.log ADDED
@@ -0,0 +1,620 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [titan] 2026-01-06 20:23:28,613 - root - INFO - Starting job: default job
2
+ [titan] 2026-01-06 20:23:28,613 - root - INFO - {
3
+ "activation_checkpoint": {
4
+ "mode": "none",
5
+ "selective_ac_option": "2"
6
+ },
7
+ "activation_offload": {
8
+ "mode": "none"
9
+ },
10
+ "checkpoint": {
11
+ "async_mode": "disabled",
12
+ "create_seed_checkpoint": false,
13
+ "enable_checkpoint": true,
14
+ "exclude_from_loading": [],
15
+ "export_dtype": "float32",
16
+ "folder": "checkpoint",
17
+ "initial_load_model_weights_only": true,
18
+ "initial_load_path": null,
19
+ "interval": 16,
20
+ "interval_type": "steps",
21
+ "keep_latest_k": 0,
22
+ "last_save_model_weights_only": false,
23
+ "load_step": -1,
24
+ "model_weights_only": false
25
+ },
26
+ "comm": {
27
+ "init_timeout_seconds": 300,
28
+ "trace_buf_size": 20000,
29
+ "train_timeout_seconds": 100
30
+ },
31
+ "experimental": {
32
+ "context_parallel_degree": 1,
33
+ "context_parallel_rotate_method": "allgather",
34
+ "custom_model_path": "",
35
+ "enable_async_tensor_parallel": false,
36
+ "enable_compiled_autograd": false,
37
+ "pipeline_parallel_degree": 1,
38
+ "pipeline_parallel_microbatches": null,
39
+ "pipeline_parallel_schedule": "1F1B",
40
+ "pipeline_parallel_schedule_csv": "",
41
+ "pipeline_parallel_split_points": []
42
+ },
43
+ "fault_tolerance": {
44
+ "enable": false,
45
+ "group_size": 0,
46
+ "min_replica_size": 1,
47
+ "replica_id": 0
48
+ },
49
+ "float8": {
50
+ "enable_fsdp_float8_all_gather": false,
51
+ "force_recompute_fp8_weight_in_bwd": false,
52
+ "precompute_float8_dynamic_scale_for_fsdp": false,
53
+ "recipe_name": null
54
+ },
55
+ "job": {
56
+ "config_file": "flame/models/fla.toml",
57
+ "description": "default job",
58
+ "dump_folder": "/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B",
59
+ "print_args": true,
60
+ "use_for_integration_test": false
61
+ },
62
+ "lr_scheduler": {
63
+ "decay_ratio": null,
64
+ "decay_type": "cosine",
65
+ "lr_min": 0.1,
66
+ "warmup_steps": 1024
67
+ },
68
+ "memory_estimation": {
69
+ "disable_fake_mode": false,
70
+ "enabled": false
71
+ },
72
+ "metrics": {
73
+ "disable_color_printing": false,
74
+ "enable_tensorboard": false,
75
+ "enable_wandb": true,
76
+ "log_freq": 1,
77
+ "save_for_all_ranks": false,
78
+ "save_tb_folder": "tb"
79
+ },
80
+ "model": {
81
+ "config": "/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/config.json",
82
+ "converters": [],
83
+ "name": "fla",
84
+ "print_after_conversion": false,
85
+ "tokenizer_path": "/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B"
86
+ },
87
+ "optimizer": {
88
+ "beta1": 0.9,
89
+ "beta2": 0.95,
90
+ "early_step_in_backward": false,
91
+ "eps": 1e-15,
92
+ "implementation": "fused",
93
+ "lr": 0.0004,
94
+ "name": "AdamW",
95
+ "weight_decay": 0.1
96
+ },
97
+ "profiling": {
98
+ "enable_memory_snapshot": false,
99
+ "enable_profiling": true,
100
+ "profile_freq": 512,
101
+ "save_memory_snapshot_folder": "memory_snapshot",
102
+ "save_traces_folder": "profile_trace"
103
+ },
104
+ "training": {
105
+ "batch_size": 2,
106
+ "compile": true,
107
+ "context_len": 2048,
108
+ "data_dir": null,
109
+ "data_files": null,
110
+ "data_parallel_replicate_degree": 1,
111
+ "data_parallel_shard_degree": 8,
112
+ "data_probs": null,
113
+ "dataset": "/mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu",
114
+ "dataset_name": "default",
115
+ "dataset_split": "train",
116
+ "deterministic": false,
117
+ "disable_loss_parallel": true,
118
+ "enable_cpu_offload": false,
119
+ "fsdp_reshard_after_forward": "default",
120
+ "gc_freq": 50,
121
+ "gradient_accumulation_steps": 16,
122
+ "max_norm": 1.0,
123
+ "mixed_precision_param": "bfloat16",
124
+ "mixed_precision_reduce": "float32",
125
+ "num_workers": 8,
126
+ "persistent_workers": false,
127
+ "pin_memory": false,
128
+ "prefetch_factor": 2,
129
+ "seed": 42,
130
+ "seq_len": 2048,
131
+ "skip_nan_inf": true,
132
+ "steps": 3072,
133
+ "streaming": true,
134
+ "tensor_parallel_degree": 1,
135
+ "varlen": false
136
+ }
137
+ }
138
+ [titan] 2026-01-06 20:23:28,614 - root - INFO - [GC] Initial GC collection. 0.00 seconds.
139
+ [titan] 2026-01-06 20:23:29,958 - root - WARNING - ENV[TORCH_NCCL_ASYNC_ERROR_HANDLING] = 1 will be overridden to 3 based on job config
140
+ [titan] 2026-01-06 20:23:29,965 - root - INFO - CUDA capacity: NVIDIA A100-SXM4-80GB with 79.25GiB memory
141
+ [titan] 2026-01-06 20:23:29,967 - root - WARNING - Error running lspci: [Errno 2] No such file or directory: 'lspci', fallback to use device_name
142
+ [titan] 2026-01-06 20:23:29,967 - root - INFO - Peak FLOPS used for computing MFU: 3.120e+14
143
+ [titan] 2026-01-06 20:23:29,967 - root - INFO - Building 1-D device mesh with ['dp_shard'], [8]
144
+ [titan] 2026-01-06 20:23:30,051 - root - INFO - Loading tokenizer...
145
+ The tokenizer you are loading from '/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.
146
+ [titan] 2026-01-06 20:23:30,452 - root - INFO - Qwen2TokenizerFast(name_or_path='/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B', vocab_size=151643, model_max_length=10000000000, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'eos_token': '<|im_end|>', 'pad_token': '<|endoftext|>', 'additional_special_tokens': ['<|im_start|>', '<|im_end|>', '<|object_ref_start|>', '<|object_ref_end|>', '<|box_start|>', '<|box_end|>', '<|quad_start|>', '<|quad_end|>', '<|vision_start|>', '<|vision_end|>', '<|vision_pad|>', '<|image_pad|>', '<|video_pad|>']}, clean_up_tokenization_spaces=False, added_tokens_decoder={
147
+ 151643: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
148
+ 151644: AddedToken("<|im_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
149
+ 151645: AddedToken("<|im_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
150
+ 151646: AddedToken("<|object_ref_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
151
+ 151647: AddedToken("<|object_ref_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
152
+ 151648: AddedToken("<|box_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
153
+ 151649: AddedToken("<|box_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
154
+ 151650: AddedToken("<|quad_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
155
+ 151651: AddedToken("<|quad_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
156
+ 151652: AddedToken("<|vision_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
157
+ 151653: AddedToken("<|vision_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
158
+ 151654: AddedToken("<|vision_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
159
+ 151655: AddedToken("<|image_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
160
+ 151656: AddedToken("<|video_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
161
+ 151657: AddedToken("<tool_call>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
162
+ 151658: AddedToken("</tool_call>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
163
+ 151659: AddedToken("<|fim_prefix|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
164
+ 151660: AddedToken("<|fim_middle|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
165
+ 151661: AddedToken("<|fim_suffix|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
166
+ 151662: AddedToken("<|fim_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
167
+ 151663: AddedToken("<|repo_name|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
168
+ 151664: AddedToken("<|file_sep|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
169
+ 151665: AddedToken("<tool_response>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
170
+ 151666: AddedToken("</tool_response>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
171
+ 151667: AddedToken("<think>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
172
+ 151668: AddedToken("</think>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
173
+ }
174
+ )
175
+ [titan] 2026-01-06 20:23:30,452 - root - INFO - Loading dataset /mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu:default
176
+ `trust_remote_code` is not supported anymore.
177
+ Please check that the Hugging Face dataset '/mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu' isn't based on a loading script and remove `trust_remote_code`.
178
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
179
+ [titan] 2026-01-06 20:23:30,452 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
180
+ Please check that the Hugging Face dataset '/mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu' isn't based on a loading script and remove `trust_remote_code`.
181
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
182
+ [titan] 2026-01-06 20:23:30,979 - root - INFO - Shuffling the dataset with seed 42
183
+ [titan] 2026-01-06 20:23:30,981 - root - INFO - IterableDataset({
184
+ features: ['text', 'id', 'dump', 'url', 'file_path', 'language', 'language_score', 'token_count', 'score', 'int_score'],
185
+ num_shards: 360
186
+ })
187
+ [titan] 2026-01-06 20:23:30,981 - root - INFO - Building dataloader...
188
+ [titan] 2026-01-06 20:23:30,983 - root - INFO - Loading model config from /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/config.json
189
+ [titan] 2026-01-06 20:23:30,984 - root - INFO - Building model from the config
190
+ GSAConfig {
191
+ "architectures": [
192
+ "GSAForCausalLM"
193
+ ],
194
+ "attn": null,
195
+ "bos_token_id": 151643,
196
+ "clamp_max": null,
197
+ "clamp_min": null,
198
+ "conv_size": 4,
199
+ "dtype": "bfloat16",
200
+ "elementwise_affine": false,
201
+ "eos_token_id": 151645,
202
+ "expand_k": 1,
203
+ "expand_v": 1,
204
+ "feature_map": "swish",
205
+ "fuse_cross_entropy": true,
206
+ "fuse_linear_cross_entropy": false,
207
+ "fuse_norm": true,
208
+ "fuse_swiglu": true,
209
+ "gate_logit_normalizer": 8,
210
+ "hidden_act": "swish",
211
+ "hidden_ratio": 4,
212
+ "hidden_size": 5120,
213
+ "initializer_range": 0.02,
214
+ "intermediate_size": 17408,
215
+ "max_position_embeddings": 40960,
216
+ "model_type": "gsa",
217
+ "norm_eps": 1e-06,
218
+ "num_heads": 40,
219
+ "num_hidden_layers": 40,
220
+ "num_kv_heads": 8,
221
+ "num_slots": 256,
222
+ "rope_theta": 1000000,
223
+ "share_conv_kernel": true,
224
+ "tie_word_embeddings": true,
225
+ "transformers_version": "4.57.3",
226
+ "use_cache": true,
227
+ "use_l2warp": false,
228
+ "use_norm": true,
229
+ "use_output_gate": true,
230
+ "use_rope": false,
231
+ "use_short_conv": false,
232
+ "vocab_size": 151936
233
+ }
234
+ 
235
+ [titan] 2026-01-06 20:23:31,130 - root - INFO - 
236
+ GSAForCausalLM(
237
+ (model): GSAModel(
238
+ (embeddings): Embedding(151936, 5120)
239
+ (layers): ModuleList(
240
+ (0-39): 40 x GSABlock(
241
+ (attn_norm): RMSNorm(5120, eps=1e-06)
242
+ (attn): GatedSlotAttention(
243
+ (feature_map): SwishFeatureMap()
244
+ (q_proj): Linear(in_features=5120, out_features=5120, bias=False)
245
+ (k_proj): Linear(in_features=5120, out_features=1024, bias=False)
246
+ (v_proj): Linear(in_features=5120, out_features=1024, bias=False)
247
+ (f_proj): Linear(in_features=5120, out_features=2048, bias=False)
248
+ (g_norm): RMSNorm(5120, elementwise_affine=False, eps=1e-06)
249
+ (o_proj): Linear(in_features=5120, out_features=5120, bias=False)
250
+ )
251
+ (mlp_norm): RMSNorm(5120, eps=1e-06)
252
+ (mlp): GatedMLP(
253
+ (gate_proj): Linear(in_features=5120, out_features=17408, bias=False)
254
+ (up_proj): Linear(in_features=5120, out_features=17408, bias=False)
255
+ (down_proj): Linear(in_features=17408, out_features=5120, bias=False)
256
+ (swiglu_linear): SwiGLULinear()
257
+ )
258
+ )
259
+ )
260
+ (norm): RMSNorm(5120, eps=1e-06)
261
+ )
262
+ (lm_head): Linear(in_features=5120, out_features=151936, bias=False)
263
+ )
264
+
265
+ [titan] 2026-01-06 20:23:31,187 - root - INFO - Compiling each block with torch.compile
266
+ [titan] 2026-01-06 20:23:31,187 - root - INFO - Compiling the embedding, norm, and lm_head layers with torch.compile
267
+ [titan] 2026-01-06 20:23:31,188 - root - INFO - Compiling the entire model with torch.compile
268
+ [titan] 2026-01-06 20:23:31,352 - root - INFO - Applied FSDP to the model
269
+ [titan] 2026-01-06 20:23:31,724 - root - INFO - CUDA memory usage for model: 3.56GiB(4.49%)
270
+ [titan] 2026-01-06 20:23:31,773 - root - INFO - Checkpointing active. Checkpoints will be loaded from and saved to /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/checkpoint
271
+ [titan] 2026-01-06 20:23:31,773 - root - INFO - Loading the checkpoint from /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/checkpoint/step-1.
272
+ [titan] 2026-01-06 20:24:20,506 - root - INFO - [GC] GC collection for checkpoint loading. 0.04 seconds.
273
+ [titan] 2026-01-06 20:24:20,506 - root - INFO - Finished loading the checkpoint in 48.73 seconds.
274
+ [titan] 2026-01-06 20:24:20,703 - root - INFO - CUDA capacity: NVIDIA A100-SXM4-80GB with 79.25GiB memory
275
+ [titan] 2026-01-06 20:24:20,706 - root - WARNING - Error running lspci: [Errno 2] No such file or directory: 'lspci', fallback to use device_name
276
+ [titan] 2026-01-06 20:24:23,545 - root - INFO - Mixed precision training is handled by fully_shard
277
+ [titan] 2026-01-06 20:24:23,545 - root - INFO - ***** Running training *****
278
+ [titan] 2026-01-06 20:24:23,545 - root - INFO -  Training starts at step 2
279
+ [titan] 2026-01-06 20:24:23,545 - root - INFO -  Number of tokens per sequence = 2,048
280
+ [titan] 2026-01-06 20:24:23,545 - root - INFO -  Gradient Accumulation steps = 16
281
+ [titan] 2026-01-06 20:24:23,545 - root - INFO -  Instantaneous batch size (per device) = 2
282
+ [titan] 2026-01-06 20:24:23,546 - root - INFO -  Global batch size (w. parallel, distributed & accumulation) = 256 (524,288 tokens)
283
+ [titan] 2026-01-06 20:24:23,546 - root - INFO -  Total optimization steps = 3,072 (1,610,612,736 tokens)
284
+ [titan] 2026-01-06 20:24:23,546 - root - INFO -  Warmup steps = 1,024 (536,870,912 tokens)
285
+ [titan] 2026-01-06 20:24:23,546 - root - INFO -  Number of parameters = 14,409,815,040 
286
+ [titan] 2026-01-06 20:24:23,546 - root - INFO - Profiling active. Traces will be saved at /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/profile_trace
287
+ /mnt/scratch/home/carlos/leman-labs/flame/.venv/lib/python3.13/site-packages/torch/_dynamo/variables/functions.py:1692: UserWarning: Dynamo detected a call to a `functools.lru_cache`-wrapped function. Dynamo ignores the cache wrapper and directly traces the wrapped function. Silent incorrectness is only a *potential* risk, not something we have observed. Enable TORCH_LOGS="+dynamo" for a DEBUG stack trace.
288
+ torch._dynamo.utils.warn_once(msg)
289
+ /mnt/scratch/home/carlos/leman-labs/flame/.venv/lib/python3.13/site-packages/torch/_dynamo/variables/functions.py:1598: UserWarning: Dynamo does not know how to trace the builtin `cuda_utils.get_device_properties.` This function is either a Python builtin (e.g. _warnings.warn) or a third-party C/C++ Python extension (perhaps created with pybind).
290
+ If it is a Python builtin, please file an issue on GitHub so the PyTorch team can add support for it and see the next case for a workaround.
291
+ If it is a third-party C/C++ Python extension, please either wrap it into a PyTorch-understood custom operator (see https://pytorch.org/tutorials/advanced/custom_ops_landing_page.html for more details) or, if it is traceable, use `torch.compiler.allow_in_graph`.
292
+ torch._dynamo.utils.warn_once(explanation + "\n" + "\n".join(hints))
293
+ [titan] 2026-01-06 20:31:17,558 - root - INFO - step: 2 loss: 14.3989 memory: 71.94GiB(90.77%) tps: 157 tflops: 14.38 mfu: 4.61%
294
+ [titan] 2026-01-06 20:31:17,558 - root - INFO - lr: 1.1719e-06 gnorm: 127.00 [ 0:14:01<14 days, 22:48:26]
295
+ [titan] 2026-01-06 20:31:58,854 - root - INFO - step: 3 loss: 14.3925 memory: 71.94GiB(90.77%) tps: 1,587 tflops: 145.20 mfu: 46.54%
296
+ [titan] 2026-01-06 20:31:58,854 - root - INFO - lr: 1.5625e-06 gnorm: 126.00 [ 0:14:42<10 days, 10:51:42]
297
+ [titan] 2026-01-06 20:32:40,204 - root - INFO - step: 4 loss: 14.2932 memory: 71.94GiB(90.77%) tps: 1,585 tflops: 145.01 mfu: 46.48%
298
+ [titan] 2026-01-06 20:32:40,205 - root - INFO - lr: 1.9531e-06 gnorm: 125.50 [ 0:15:24<8 days, 4:53:41]
299
+ [titan] 2026-01-06 20:33:21,589 - root - INFO - step: 5 loss: 14.2679 memory: 71.94GiB(90.77%) tps: 1,584 tflops: 144.89 mfu: 46.44%
300
+ [titan] 2026-01-06 20:33:21,590 - root - INFO - lr: 2.3438e-06 gnorm: 123.50 [ 0:16:05<6 days, 20:30:57]
301
+ [titan] 2026-01-06 20:34:03,035 - root - INFO - step: 6 loss: 13.9921 memory: 71.94GiB(90.77%) tps: 1,581 tflops: 144.67 mfu: 46.37%
302
+ [titan] 2026-01-06 20:34:03,035 - root - INFO - lr: 2.7344e-06 gnorm: 117.50 [ 0:16:46<5 days, 22:56:05]
303
+ [titan] 2026-01-06 20:34:44,524 - root - INFO - step: 7 loss: 13.8102 memory: 71.94GiB(90.77%) tps: 1,580 tflops: 144.52 mfu: 46.32%
304
+ [titan] 2026-01-06 20:34:44,524 - root - INFO - lr: 3.1250e-06 gnorm: 112.50 [ 0:17:28<5 days, 7:31:18]
305
+ [titan] 2026-01-06 20:35:25,989 - root - INFO - step: 8 loss: 13.5609 memory: 71.94GiB(90.77%) tps: 1,581 tflops: 144.60 mfu: 46.35%
306
+ [titan] 2026-01-06 20:35:25,990 - root - INFO - lr: 3.5156e-06 gnorm: 106.50 [ 0:18:09<4 days, 19:57:23]
307
+ [titan] 2026-01-06 20:36:07,480 - root - INFO - step: 9 loss: 13.3683 memory: 71.94GiB(90.77%) tps: 1,580 tflops: 144.52 mfu: 46.32%
308
+ [titan] 2026-01-06 20:36:07,481 - root - INFO - lr: 3.9063e-06 gnorm: 101.00 [ 0:18:51<4 days, 10:57:40]
309
+ [titan] 2026-01-06 20:36:48,975 - root - INFO - step: 10 loss: 13.1018 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.50 mfu: 46.32%
310
+ [titan] 2026-01-06 20:36:48,975 - root - INFO - lr: 4.2969e-06 gnorm: 94.00 [ 0:19:32<4 days, 3:45:46]
311
+ [titan] 2026-01-06 20:37:30,471 - root - INFO - step: 11 loss: 12.5407 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.50 mfu: 46.31%
312
+ [titan] 2026-01-06 20:37:30,471 - root - INFO - lr: 4.6875e-06 gnorm: 82.00 [ 0:20:14<3 days, 21:52:17]
313
+ [titan] 2026-01-06 20:38:11,960 - root - INFO - step: 12 loss: 12.0106 memory: 71.94GiB(90.77%) tps: 1,580 tflops: 144.52 mfu: 46.32%
314
+ [titan] 2026-01-06 20:38:11,960 - root - INFO - lr: 5.0781e-06 gnorm: 71.50 [ 0:20:55<3 days, 16:57:34]
315
+ [titan] 2026-01-06 20:38:53,462 - root - INFO - step: 13 loss: 11.5957 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.48 mfu: 46.31%
316
+ [titan] 2026-01-06 20:38:53,463 - root - INFO - lr: 5.4687e-06 gnorm: 68.00 [ 0:21:37<3 days, 12:48:08]
317
+ [titan] 2026-01-06 20:39:34,955 - root - INFO - step: 14 loss: 11.2380 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.51 mfu: 46.32%
318
+ [titan] 2026-01-06 20:39:34,955 - root - INFO - lr: 5.8594e-06 gnorm: 63.25 [ 0:22:18<3 days, 9:14:12]
319
+ [titan] 2026-01-06 20:40:16,456 - root - INFO - step: 15 loss: 10.9153 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.48 mfu: 46.31%
320
+ [titan] 2026-01-06 20:40:16,457 - root - INFO - lr: 6.2500e-06 gnorm: 55.50 [ 0:23:00<3 days, 6:08:44]
321
+ [titan] 2026-01-06 20:40:57,973 - root - INFO - step: 16 loss: 10.6864 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.42 mfu: 46.29%
322
+ [titan] 2026-01-06 20:40:57,974 - root - INFO - lr: 6.6406e-06 gnorm: 57.00 [ 0:23:41<3 days, 3:26:25]
323
+ [titan] 2026-01-06 20:40:57,974 - root - INFO - Saving the checkpoint (or staging if async is enabled).
324
+ [titan] 2026-01-06 20:41:27,686 - root - INFO - [GC] GC collection invoked by checkpointer. 1.24 seconds.
325
+ [titan] 2026-01-06 20:41:27,686 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 29.71 seconds.
326
+ [titan] 2026-01-06 20:42:08,986 - root - INFO - step: 17 loss: 10.3828 memory: 71.94GiB(90.77%) tps: 923 tflops: 84.44 mfu: 27.06%
327
+ [titan] 2026-01-06 20:42:08,986 - root - INFO - lr: 7.0313e-06 gnorm: 42.50 [ 0:24:52<3 days, 2:31:27]
328
+ [titan] 2026-01-06 20:42:50,422 - root - INFO - step: 18 loss: 10.1659 memory: 71.94GiB(90.77%) tps: 1,582 tflops: 144.71 mfu: 46.38%
329
+ [titan] 2026-01-06 20:42:50,422 - root - INFO - lr: 7.4219e-06 gnorm: 32.50 [ 0:25:34<3 days, 0:18:49]
330
+ [titan] 2026-01-06 20:43:31,924 - root - INFO - step: 19 loss: 9.9749 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.48 mfu: 46.31%
331
+ [titan] 2026-01-06 20:43:31,925 - root - INFO - lr: 7.8125e-06 gnorm: 26.88 [ 0:26:15<2 days, 22:20:16]
332
+ [titan] 2026-01-06 20:44:13,451 - root - INFO - step: 20 loss: 9.8084 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.39 mfu: 46.28%
333
+ [titan] 2026-01-06 20:44:13,452 - root - INFO - lr: 8.2031e-06 gnorm: 25.62 [ 0:26:57<2 days, 20:33:33]
334
+ [titan] 2026-01-06 20:44:54,968 - root - INFO - step: 21 loss: 9.6201 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.43 mfu: 46.29%
335
+ [titan] 2026-01-06 20:44:54,968 - root - INFO - lr: 8.5938e-06 gnorm: 26.88 [ 0:27:38<2 days, 18:56:55]
336
+ [titan] 2026-01-06 20:45:36,491 - root - INFO - step: 22 loss: 9.4905 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.40 mfu: 46.28%
337
+ [titan] 2026-01-06 20:45:36,491 - root - INFO - lr: 8.9844e-06 gnorm: 25.50 [ 0:28:20<2 days, 17:29:00]
338
+ [titan] 2026-01-06 20:46:18,035 - root - INFO - step: 23 loss: 9.2526 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.33 mfu: 46.26%
339
+ [titan] 2026-01-06 20:46:18,035 - root - INFO - lr: 9.3750e-06 gnorm: 19.12 [ 0:29:01<2 days, 16:08:44]
340
+ [titan] 2026-01-06 20:46:59,563 - root - INFO - step: 24 loss: 9.0528 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.39 mfu: 46.28%
341
+ [titan] 2026-01-06 20:46:59,563 - root - INFO - lr: 9.7656e-06 gnorm: 17.00 [ 0:29:43<2 days, 14:55:04]
342
+ [titan] 2026-01-06 20:47:41,099 - root - INFO - step: 25 loss: 8.8601 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.36 mfu: 46.27%
343
+ [titan] 2026-01-06 20:47:41,099 - root - INFO - lr: 1.0156e-05 gnorm: 14.06 [ 0:30:25<2 days, 13:47:15]
344
+ [titan] 2026-01-06 20:48:22,630 - root - INFO - step: 26 loss: 8.7360 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.38 mfu: 46.27%
345
+ [titan] 2026-01-06 20:48:22,630 - root - INFO - lr: 1.0547e-05 gnorm: 15.44 [ 0:31:06<2 days, 12:44:35]
346
+ [titan] 2026-01-06 20:49:04,178 - root - INFO - step: 27 loss: 8.6182 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.31 mfu: 46.25%
347
+ [titan] 2026-01-06 20:49:04,179 - root - INFO - lr: 1.0937e-05 gnorm: 10.25 [ 0:31:48<2 days, 11:46:32]
348
+ [titan] 2026-01-06 20:49:45,725 - root - INFO - step: 28 loss: 8.5142 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.32 mfu: 46.26%
349
+ [titan] 2026-01-06 20:49:45,725 - root - INFO - lr: 1.1328e-05 gnorm: 9.00 [ 0:32:29<2 days, 10:52:35]
350
+ [titan] 2026-01-06 20:50:27,274 - root - INFO - step: 29 loss: 8.4770 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.31 mfu: 46.25%
351
+ [titan] 2026-01-06 20:50:27,275 - root - INFO - lr: 1.1719e-05 gnorm: 9.44 [ 0:33:11<2 days, 10:02:19]
352
+ [titan] 2026-01-06 20:51:08,813 - root - INFO - step: 30 loss: 8.3888 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.35 mfu: 46.27%
353
+ [titan] 2026-01-06 20:51:08,813 - root - INFO - lr: 1.2109e-05 gnorm: 7.06 [ 0:33:52<2 days, 9:15:20]
354
+ [titan] 2026-01-06 20:51:50,370 - root - INFO - step: 31 loss: 8.3098 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.29 mfu: 46.25%
355
+ [titan] 2026-01-06 20:51:50,370 - root - INFO - lr: 1.2500e-05 gnorm: 5.38 [ 0:34:34<2 days, 8:31:22]
356
+ [titan] 2026-01-06 20:52:31,910 - root - INFO - step: 32 loss: 8.2507 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.35 mfu: 46.26%
357
+ [titan] 2026-01-06 20:52:31,910 - root - INFO - lr: 1.2891e-05 gnorm: 6.97 [ 0:35:15<2 days, 7:50:04]
358
+ [titan] 2026-01-06 20:52:31,910 - root - INFO - Saving the checkpoint (or staging if async is enabled).
359
+ [titan] 2026-01-06 20:52:52,184 - root - INFO - [GC] GC collection invoked by checkpointer. 0.19 seconds.
360
+ [titan] 2026-01-06 20:52:52,185 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 20.27 seconds.
361
+ [titan] 2026-01-06 20:53:33,591 - root - INFO - step: 33 loss: 8.1782 memory: 71.94GiB(90.77%) tps: 1,063 tflops: 97.21 mfu: 31.16%
362
+ [titan] 2026-01-06 20:53:33,591 - root - INFO - lr: 1.3281e-05 gnorm: 4.94 [ 0:36:17<2 days, 7:42:09]
363
+ [titan] 2026-01-06 20:54:15,059 - root - INFO - step: 34 loss: 8.1399 memory: 71.94GiB(90.77%) tps: 1,580 tflops: 144.59 mfu: 46.34%
364
+ [titan] 2026-01-06 20:54:15,059 - root - INFO - lr: 1.3672e-05 gnorm: 4.62 [ 0:36:58<2 days, 7:04:33]
365
+ [titan] 2026-01-06 20:54:56,546 - root - INFO - step: 35 loss: 8.1046 memory: 71.94GiB(90.77%) tps: 1,580 tflops: 144.53 mfu: 46.32%
366
+ [titan] 2026-01-06 20:54:56,546 - root - INFO - lr: 1.4063e-05 gnorm: 4.69 [ 0:37:40<2 days, 6:29:04]
367
+ [titan] 2026-01-06 20:55:38,070 - root - INFO - step: 36 loss: 8.0122 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.40 mfu: 46.28%
368
+ [titan] 2026-01-06 20:55:38,070 - root - INFO - lr: 1.4453e-05 gnorm: 2.75 [ 0:38:21<2 days, 5:55:35]
369
+ [titan] 2026-01-06 20:56:19,603 - root - INFO - step: 37 loss: 8.0874 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.37 mfu: 46.27%
370
+ [titan] 2026-01-06 20:56:19,603 - root - INFO - lr: 1.4844e-05 gnorm: 4.84 [ 0:39:03<2 days, 5:23:52]
371
+ [titan] 2026-01-06 20:57:01,137 - root - INFO - step: 38 loss: 8.0173 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.37 mfu: 46.27%
372
+ [titan] 2026-01-06 20:57:01,137 - root - INFO - lr: 1.5234e-05 gnorm: 3.98 [ 0:39:45<2 days, 4:53:48]
373
+ [titan] 2026-01-06 20:57:42,670 - root - INFO - step: 39 loss: 8.0002 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.37 mfu: 46.27%
374
+ [titan] 2026-01-06 20:57:42,671 - root - INFO - lr: 1.5625e-05 gnorm: 3.81 [ 0:40:26<2 days, 4:25:14]
375
+ [titan] 2026-01-06 20:58:24,204 - root - INFO - step: 40 loss: 7.9606 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.37 mfu: 46.27%
376
+ [titan] 2026-01-06 20:58:24,204 - root - INFO - lr: 1.6016e-05 gnorm: 2.86 [ 0:41:08<2 days, 3:58:04]
377
+ [titan] 2026-01-06 20:59:05,739 - root - INFO - step: 41 loss: 7.9773 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.36 mfu: 46.27%
378
+ [titan] 2026-01-06 20:59:05,739 - root - INFO - lr: 1.6406e-05 gnorm: 3.56 [ 0:41:49<2 days, 3:32:11]
379
+ [titan] 2026-01-06 20:59:47,256 - root - INFO - step: 42 loss: 7.9890 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.43 mfu: 46.29%
380
+ [titan] 2026-01-06 20:59:47,256 - root - INFO - lr: 1.6797e-05 gnorm: 4.75 [ 0:42:31<2 days, 3:07:29]
381
+ [titan] 2026-01-06 21:00:28,788 - root - INFO - step: 43 loss: 7.9018 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.37 mfu: 46.27%
382
+ [titan] 2026-01-06 21:00:28,789 - root - INFO - lr: 1.7188e-05 gnorm: 3.48 [ 0:43:12<2 days, 2:43:55]
383
+ [titan] 2026-01-06 21:01:10,328 - root - INFO - step: 44 loss: 7.8441 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.35 mfu: 46.26%
384
+ [titan] 2026-01-06 21:01:10,328 - root - INFO - lr: 1.7578e-05 gnorm: 3.89 [ 0:43:54<2 days, 2:21:24]
385
+ [titan] 2026-01-06 21:01:51,868 - root - INFO - step: 45 loss: 7.8679 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.34 mfu: 46.26%
386
+ [titan] 2026-01-06 21:01:51,869 - root - INFO - lr: 1.7969e-05 gnorm: 6.41 [ 0:44:35<2 days, 1:59:51]
387
+ [titan] 2026-01-06 21:02:33,408 - root - INFO - step: 46 loss: 7.7830 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.35 mfu: 46.26%
388
+ [titan] 2026-01-06 21:02:33,408 - root - INFO - lr: 1.8359e-05 gnorm: 3.52 [ 0:45:17<2 days, 1:39:13]
389
+ [titan] 2026-01-06 21:03:14,961 - root - INFO - step: 47 loss: 7.8372 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.30 mfu: 46.25%
390
+ [titan] 2026-01-06 21:03:14,961 - root - INFO - lr: 1.8750e-05 gnorm: 2.22 [ 0:45:58<2 days, 1:19:26]
391
+ [titan] 2026-01-06 21:03:56,497 - root - INFO - step: 48 loss: 7.8147 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.36 mfu: 46.27%
392
+ [titan] 2026-01-06 21:03:56,497 - root - INFO - lr: 1.9141e-05 gnorm: 3.70 [ 0:46:40<2 days, 1:00:26]
393
+ [titan] 2026-01-06 21:03:56,497 - root - INFO - Saving the checkpoint (or staging if async is enabled).
394
+ [titan] 2026-01-06 21:04:16,560 - root - INFO - [GC] GC collection invoked by checkpointer. 0.18 seconds.
395
+ [titan] 2026-01-06 21:04:16,560 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 20.06 seconds.
396
+ [titan] 2026-01-06 21:04:57,970 - root - INFO - step: 49 loss: 7.6970 memory: 71.94GiB(90.77%) tps: 1,066 tflops: 97.54 mfu: 31.26%
397
+ [titan] 2026-01-06 21:04:57,970 - root - INFO - lr: 1.9531e-05 gnorm: 5.28 [ 0:47:41<2 days, 1:02:41]
398
+ [titan] 2026-01-06 21:04:57,979 - root - INFO - [GC] Peforming periodical GC collection. 0.01 seconds.
399
+ [titan] 2026-01-06 21:05:39,421 - root - INFO - step: 50 loss: 7.7536 memory: 71.94GiB(90.77%) tps: 1,581 tflops: 144.65 mfu: 46.36%
400
+ [titan] 2026-01-06 21:05:39,422 - root - INFO - lr: 1.9922e-05 gnorm: 4.06 [ 0:48:23<2 days, 0:44:38]
401
+ [titan] 2026-01-06 21:06:20,891 - root - INFO - step: 51 loss: 7.7578 memory: 71.94GiB(90.77%) tps: 1,580 tflops: 144.59 mfu: 46.34%
402
+ [titan] 2026-01-06 21:06:20,891 - root - INFO - lr: 2.0313e-05 gnorm: 5.03 [ 0:49:04<2 days, 0:27:16]
403
+ [titan] 2026-01-06 21:07:02,402 - root - INFO - step: 52 loss: 7.7586 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.45 mfu: 46.30%
404
+ [titan] 2026-01-06 21:07:02,402 - root - INFO - lr: 2.0703e-05 gnorm: 2.52 [ 0:49:46<2 days, 0:10:36]
405
+ [titan] 2026-01-06 21:07:43,930 - root - INFO - step: 53 loss: 7.7823 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.38 mfu: 46.28%
406
+ [titan] 2026-01-06 21:07:43,930 - root - INFO - lr: 2.1094e-05 gnorm: 11.69 [ 0:50:27<1 day, 23:54:33]
407
+ [titan] 2026-01-06 21:08:25,460 - root - INFO - step: 54 loss: 7.7454 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.38 mfu: 46.28%
408
+ [titan] 2026-01-06 21:08:25,460 - root - INFO - lr: 2.1484e-05 gnorm: 10.25 [ 0:51:09<1 day, 23:39:04]
409
+ [titan] 2026-01-06 21:09:07,002 - root - INFO - step: 55 loss: 7.6959 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.34 mfu: 46.26%
410
+ [titan] 2026-01-06 21:09:07,002 - root - INFO - lr: 2.1875e-05 gnorm: 3.77 [ 0:51:50<1 day, 23:24:08]
411
+ [titan] 2026-01-06 21:09:48,535 - root - INFO - step: 56 loss: 7.7100 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.37 mfu: 46.27%
412
+ [titan] 2026-01-06 21:09:48,536 - root - INFO - lr: 2.2266e-05 gnorm: 5.50 [ 0:52:32<1 day, 23:09:42]
413
+ [titan] 2026-01-06 21:10:30,084 - root - INFO - step: 57 loss: 7.6427 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.32 mfu: 46.26%
414
+ [titan] 2026-01-06 21:10:30,084 - root - INFO - lr: 2.2656e-05 gnorm: 3.45 [ 0:53:14<1 day, 22:55:45]
415
+ [titan] 2026-01-06 21:11:11,628 - root - INFO - step: 58 loss: 7.7081 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.33 mfu: 46.26%
416
+ [titan] 2026-01-06 21:11:11,628 - root - INFO - lr: 2.3047e-05 gnorm: 7.88 [ 0:53:55<1 day, 22:42:16]
417
+ [titan] 2026-01-06 21:11:53,169 - root - INFO - step: 59 loss: 7.6955 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.34 mfu: 46.26%
418
+ [titan] 2026-01-06 21:11:53,170 - root - INFO - lr: 2.3438e-05 gnorm: 7.16 [ 0:54:37<1 day, 22:29:13]
419
+ [titan] 2026-01-06 21:12:34,708 - root - INFO - step: 60 loss: 7.6458 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.35 mfu: 46.27%
420
+ [titan] 2026-01-06 21:12:34,708 - root - INFO - lr: 2.3828e-05 gnorm: 3.22 [ 0:55:18<1 day, 22:16:35]
421
+ [titan] 2026-01-06 21:13:16,244 - root - INFO - step: 61 loss: 7.6709 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.36 mfu: 46.27%
422
+ [titan] 2026-01-06 21:13:16,244 - root - INFO - lr: 2.4219e-05 gnorm: 7.56 [ 0:56:00<1 day, 22:04:19]
423
+ [titan] 2026-01-06 21:13:57,793 - root - INFO - step: 62 loss: 7.6777 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.31 mfu: 46.25%
424
+ [titan] 2026-01-06 21:13:57,793 - root - INFO - lr: 2.4609e-05 gnorm: 5.00 [ 0:56:41<1 day, 21:52:27]
425
+ [titan] 2026-01-06 21:14:39,339 - root - INFO - step: 63 loss: 7.6421 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.32 mfu: 46.26%
426
+ [titan] 2026-01-06 21:14:39,340 - root - INFO - lr: 2.5000e-05 gnorm: 6.81 [ 0:57:23<1 day, 21:40:56]
427
+ [titan] 2026-01-06 21:15:20,872 - root - INFO - step: 64 loss: 7.6401 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.37 mfu: 46.27%
428
+ [titan] 2026-01-06 21:15:20,872 - root - INFO - lr: 2.5391e-05 gnorm: 6.72 [ 0:58:04<1 day, 21:29:45]
429
+ [titan] 2026-01-06 21:15:20,872 - root - INFO - Saving the checkpoint (or staging if async is enabled).
430
+ [titan] 2026-01-06 21:15:41,932 - root - INFO - [GC] GC collection invoked by checkpointer. 0.17 seconds.
431
+ [titan] 2026-01-06 21:15:41,932 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 21.06 seconds.
432
+ [titan] 2026-01-06 21:16:23,249 - root - INFO - step: 65 loss: 7.6475 memory: 71.94GiB(90.77%) tps: 1,051 tflops: 96.13 mfu: 30.81%
433
+ [titan] 2026-01-06 21:16:23,249 - root - INFO - lr: 2.5781e-05 gnorm: 5.00 [ 0:59:07<1 day, 21:34:57]
434
+ [titan] 2026-01-06 21:17:04,689 - root - INFO - step: 66 loss: 7.7008 memory: 71.94GiB(90.77%) tps: 1,581 tflops: 144.69 mfu: 46.38%
435
+ [titan] 2026-01-06 21:17:04,689 - root - INFO - lr: 2.6172e-05 gnorm: 9.69 [ 0:59:48<1 day, 21:24:04]
436
+ [titan] 2026-01-06 21:17:46,152 - root - INFO - step: 67 loss: 7.6772 memory: 71.94GiB(90.77%) tps: 1,581 tflops: 144.61 mfu: 46.35%
437
+ [titan] 2026-01-06 21:17:46,153 - root - INFO - lr: 2.6563e-05 gnorm: 8.06 [ 1:00:30<1 day, 21:13:31]
438
+ [titan] 2026-01-06 21:18:27,650 - root - INFO - step: 68 loss: 7.6251 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.49 mfu: 46.31%
439
+ [titan] 2026-01-06 21:18:27,651 - root - INFO - lr: 2.6953e-05 gnorm: 7.88 [ 1:01:11<1 day, 21:03:16]
440
+ [titan] 2026-01-06 21:19:09,166 - root - INFO - step: 69 loss: 7.6183 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.43 mfu: 46.29%
441
+ [titan] 2026-01-06 21:19:09,166 - root - INFO - lr: 2.7344e-05 gnorm: 4.00 [ 1:01:53<1 day, 20:53:19]
442
+ [titan] 2026-01-06 21:19:50,686 - root - INFO - step: 70 loss: 7.6535 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.41 mfu: 46.29%
443
+ [titan] 2026-01-06 21:19:50,687 - root - INFO - lr: 2.7734e-05 gnorm: 17.75 [ 1:02:34<1 day, 20:43:38]
444
+ [titan] 2026-01-06 21:20:32,220 - root - INFO - step: 71 loss: 7.6713 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.37 mfu: 46.27%
445
+ [titan] 2026-01-06 21:20:32,221 - root - INFO - lr: 2.8125e-05 gnorm: 15.69 [ 1:03:16<1 day, 20:34:13]
446
+ [titan] 2026-01-06 21:21:13,759 - root - INFO - step: 72 loss: 7.5969 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.35 mfu: 46.27%
447
+ [titan] 2026-01-06 21:21:13,759 - root - INFO - lr: 2.8516e-05 gnorm: 5.00 [ 1:03:57<1 day, 20:25:02]
448
+ [titan] 2026-01-06 21:21:55,296 - root - INFO - step: 73 loss: 7.6514 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.35 mfu: 46.27%
449
+ [titan] 2026-01-06 21:21:55,296 - root - INFO - lr: 2.8906e-05 gnorm: 7.84 [ 1:04:39<1 day, 20:16:06]
450
+ [titan] 2026-01-06 21:22:36,834 - root - INFO - step: 74 loss: 7.6118 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.35 mfu: 46.27%
451
+ [titan] 2026-01-06 21:22:36,834 - root - INFO - lr: 2.9297e-05 gnorm: 5.53 [ 1:05:20<1 day, 20:07:23]
452
+ [titan] 2026-01-06 21:23:18,373 - root - INFO - step: 75 loss: 7.6545 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.35 mfu: 46.26%
453
+ [titan] 2026-01-06 21:23:18,374 - root - INFO - lr: 2.9687e-05 gnorm: 14.88 [ 1:06:02<1 day, 19:58:52]
454
+ [titan] 2026-01-06 21:23:59,909 - root - INFO - step: 76 loss: 7.6091 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.36 mfu: 46.27%
455
+ [titan] 2026-01-06 21:23:59,909 - root - INFO - lr: 3.0078e-05 gnorm: 15.25 [ 1:06:43<1 day, 19:50:34]
456
+ [titan] 2026-01-06 21:24:41,441 - root - INFO - step: 77 loss: 7.5815 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.37 mfu: 46.27%
457
+ [titan] 2026-01-06 21:24:41,442 - root - INFO - lr: 3.0469e-05 gnorm: 4.84 [ 1:07:25<1 day, 19:42:28]
458
+ [titan] 2026-01-06 21:25:22,983 - root - INFO - step: 78 loss: 7.6119 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.34 mfu: 46.26%
459
+ [titan] 2026-01-06 21:25:22,983 - root - INFO - lr: 3.0859e-05 gnorm: 9.06 [ 1:08:06<1 day, 19:34:33]
460
+ [titan] 2026-01-06 21:26:04,516 - root - INFO - step: 79 loss: 7.6418 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.37 mfu: 46.27%
461
+ [titan] 2026-01-06 21:26:04,516 - root - INFO - lr: 3.1250e-05 gnorm: 8.25 [ 1:08:48<1 day, 19:26:49]
462
+ [titan] 2026-01-06 21:26:46,049 - root - INFO - step: 80 loss: 7.5575 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.37 mfu: 46.27%
463
+ [titan] 2026-01-06 21:26:46,049 - root - INFO - lr: 3.1641e-05 gnorm: 6.97 [ 1:09:29<1 day, 19:19:16]
464
+ [titan] 2026-01-06 21:26:46,050 - root - INFO - Saving the checkpoint (or staging if async is enabled).
465
+ [titan] 2026-01-06 21:27:08,317 - root - INFO - [GC] GC collection invoked by checkpointer. 0.19 seconds.
466
+ [titan] 2026-01-06 21:27:08,318 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 22.27 seconds.
467
+ [titan] 2026-01-06 21:27:49,686 - root - INFO - step: 81 loss: 7.6005 memory: 71.94GiB(90.77%) tps: 1,030 tflops: 94.22 mfu: 30.20%
468
+ [titan] 2026-01-06 21:27:49,686 - root - INFO - lr: 3.2031e-05 gnorm: 7.19 [ 1:10:33<1 day, 19:25:29]
469
+ [titan] 2026-01-06 21:28:31,108 - root - INFO - step: 82 loss: 7.5774 memory: 71.94GiB(90.77%) tps: 1,582 tflops: 144.76 mfu: 46.40%
470
+ [titan] 2026-01-06 21:28:31,108 - root - INFO - lr: 3.2422e-05 gnorm: 5.62 [ 1:11:15<1 day, 19:18:01]
471
+ [titan] 2026-01-06 21:29:12,555 - root - INFO - step: 83 loss: 7.6207 memory: 71.94GiB(90.77%) tps: 1,581 tflops: 144.67 mfu: 46.37%
472
+ [titan] 2026-01-06 21:29:12,555 - root - INFO - lr: 3.2813e-05 gnorm: 4.69 [ 1:11:56<1 day, 19:10:44]
473
+ [titan] 2026-01-06 21:29:54,024 - root - INFO - step: 84 loss: 7.5734 memory: 71.94GiB(90.77%) tps: 1,580 tflops: 144.59 mfu: 46.34%
474
+ [titan] 2026-01-06 21:29:54,024 - root - INFO - lr: 3.3203e-05 gnorm: 10.75 [ 1:12:37<1 day, 19:03:37]
475
+ [titan] 2026-01-06 21:30:35,519 - root - INFO - step: 85 loss: 7.5241 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.50 mfu: 46.31%
476
+ [titan] 2026-01-06 21:30:35,520 - root - INFO - lr: 3.3594e-05 gnorm: 8.69 [ 1:13:19<1 day, 18:56:41]
477
+ [titan] 2026-01-06 21:31:17,030 - root - INFO - step: 86 loss: 7.5827 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.45 mfu: 46.30%
478
+ [titan] 2026-01-06 21:31:17,030 - root - INFO - lr: 3.3984e-05 gnorm: 7.22 [ 1:14:00<1 day, 18:49:53]
479
+ [titan] 2026-01-06 21:31:58,543 - root - INFO - step: 87 loss: 7.5505 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.44 mfu: 46.29%
480
+ [titan] 2026-01-06 21:31:58,543 - root - INFO - lr: 3.4375e-05 gnorm: 7.91 [ 1:14:42<1 day, 18:43:14]
481
+ [titan] 2026-01-06 21:32:40,071 - root - INFO - step: 88 loss: 7.5143 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.39 mfu: 46.28%
482
+ [titan] 2026-01-06 21:32:40,071 - root - INFO - lr: 3.4766e-05 gnorm: 8.00 [ 1:15:23<1 day, 18:36:43]
483
+ [titan] 2026-01-06 21:33:21,598 - root - INFO - step: 89 loss: 7.5199 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.39 mfu: 46.28%
484
+ [titan] 2026-01-06 21:33:21,599 - root - INFO - lr: 3.5156e-05 gnorm: 8.62 [ 1:16:05<1 day, 18:30:21]
485
+ [titan] 2026-01-06 21:34:03,122 - root - INFO - step: 90 loss: 7.4785 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.40 mfu: 46.28%
486
+ [titan] 2026-01-06 21:34:03,122 - root - INFO - lr: 3.5547e-05 gnorm: 8.12 [ 1:16:47<1 day, 18:24:06]
487
+ [titan] 2026-01-06 21:34:44,655 - root - INFO - step: 91 loss: 7.5003 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.37 mfu: 46.27%
488
+ [titan] 2026-01-06 21:34:44,655 - root - INFO - lr: 3.5937e-05 gnorm: 6.97 [ 1:17:28<1 day, 18:17:58]
489
+ [titan] 2026-01-06 21:35:26,183 - root - INFO - step: 92 loss: 7.5113 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.39 mfu: 46.28%
490
+ [titan] 2026-01-06 21:35:26,183 - root - INFO - lr: 3.6328e-05 gnorm: 10.19 [ 1:18:10<1 day, 18:11:58]
491
+ [titan] 2026-01-06 21:36:07,712 - root - INFO - step: 93 loss: 7.4875 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.38 mfu: 46.28%
492
+ [titan] 2026-01-06 21:36:07,713 - root - INFO - lr: 3.6719e-05 gnorm: 4.59 [ 1:18:51<1 day, 18:06:04]
493
+ [titan] 2026-01-06 21:36:49,202 - root - INFO - step: 94 loss: 7.8691 memory: 71.94GiB(90.77%) tps: 1,580 tflops: 144.52 mfu: 46.32%
494
+ [titan] 2026-01-06 21:36:49,202 - root - INFO - lr: 3.7109e-05 gnorm: 86.50 [ 1:19:33<1 day, 18:00:16]
495
+ [titan] 2026-01-06 21:37:30,710 - root - INFO - step: 95 loss: 7.7993 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.45 mfu: 46.30%
496
+ [titan] 2026-01-06 21:37:30,710 - root - INFO - lr: 3.7500e-05 gnorm: 62.50 [ 1:20:14<1 day, 17:54:34]
497
+ [titan] 2026-01-06 21:38:12,248 - root - INFO - step: 96 loss: 7.6230 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.35 mfu: 46.27%
498
+ [titan] 2026-01-06 21:38:12,248 - root - INFO - lr: 3.7891e-05 gnorm: 17.38 [ 1:20:56<1 day, 17:49:00]
499
+ [titan] 2026-01-06 21:38:12,248 - root - INFO - Saving the checkpoint (or staging if async is enabled).
500
+ [titan] 2026-01-06 21:38:32,953 - root - INFO - [GC] GC collection invoked by checkpointer. 0.21 seconds.
501
+ [titan] 2026-01-06 21:38:32,954 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 20.71 seconds.
502
+ [titan] 2026-01-06 21:39:14,269 - root - INFO - step: 97 loss: 7.5778 memory: 71.94GiB(90.77%) tps: 1,057 tflops: 96.68 mfu: 30.99%
503
+ [titan] 2026-01-06 21:39:14,269 - root - INFO - lr: 3.8281e-05 gnorm: 17.75 [ 1:21:58<1 day, 17:54:00]
504
+ [titan] 2026-01-06 21:39:55,690 - root - INFO - step: 98 loss: 7.5438 memory: 71.94GiB(90.77%) tps: 1,582 tflops: 144.76 mfu: 46.40%
505
+ [titan] 2026-01-06 21:39:55,690 - root - INFO - lr: 3.8672e-05 gnorm: 11.75 [ 1:22:39<1 day, 17:48:28]
506
+ [titan] 2026-01-06 21:40:37,180 - root - INFO - step: 99 loss: 7.5091 memory: 71.94GiB(90.77%) tps: 1,580 tflops: 144.52 mfu: 46.32%
507
+ [titan] 2026-01-06 21:40:37,180 - root - INFO - lr: 3.9063e-05 gnorm: 7.81 [ 1:23:21<1 day, 17:43:03]
508
+ [titan] 2026-01-06 21:40:37,206 - root - INFO - [GC] Peforming periodical GC collection. 0.03 seconds.
509
+ [titan] 2026-01-06 21:41:18,706 - root - INFO - step: 100 loss: 7.4961 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.39 mfu: 46.28%
510
+ [titan] 2026-01-06 21:41:18,706 - root - INFO - lr: 3.9453e-05 gnorm: 7.59 [ 1:24:02<1 day, 17:37:46]
511
+ [titan] 2026-01-06 21:42:00,228 - root - INFO - step: 101 loss: 7.4848 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.41 mfu: 46.28%
512
+ [titan] 2026-01-06 21:42:00,228 - root - INFO - lr: 3.9844e-05 gnorm: 5.97 [ 1:24:44<1 day, 17:32:33]
513
+ [titan] 2026-01-06 21:42:41,739 - root - INFO - step: 102 loss: 7.5118 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.45 mfu: 46.30%
514
+ [titan] 2026-01-06 21:42:41,739 - root - INFO - lr: 4.0234e-05 gnorm: 8.06 [ 1:25:25<1 day, 17:27:26]
515
+ [titan] 2026-01-06 21:43:23,265 - root - INFO - step: 103 loss: 7.4788 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.40 mfu: 46.28%
516
+ [titan] 2026-01-06 21:43:23,265 - root - INFO - lr: 4.0625e-05 gnorm: 10.06 [ 1:26:07<1 day, 17:22:24]
517
+ [titan] 2026-01-06 21:44:04,785 - root - INFO - step: 104 loss: 7.4560 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.41 mfu: 46.29%
518
+ [titan] 2026-01-06 21:44:04,786 - root - INFO - lr: 4.1016e-05 gnorm: 9.50 [ 1:26:48<1 day, 17:17:27]
519
+ [titan] 2026-01-06 21:44:46,319 - root - INFO - step: 105 loss: 7.4534 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.37 mfu: 46.27%
520
+ [titan] 2026-01-06 21:44:46,319 - root - INFO - lr: 4.1406e-05 gnorm: 8.44 [ 1:27:30<1 day, 17:12:36]
521
+ [titan] 2026-01-06 21:45:27,838 - root - INFO - step: 106 loss: 7.4770 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.42 mfu: 46.29%
522
+ [titan] 2026-01-06 21:45:27,838 - root - INFO - lr: 4.1797e-05 gnorm: 10.56 [ 1:28:11<1 day, 17:07:48]
523
+ [titan] 2026-01-06 21:46:09,374 - root - INFO - step: 107 loss: 7.4382 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.36 mfu: 46.27%
524
+ [titan] 2026-01-06 21:46:09,374 - root - INFO - lr: 4.2188e-05 gnorm: 13.69 [ 1:28:53<1 day, 17:03:06]
525
+ [titan] 2026-01-06 21:46:50,902 - root - INFO - step: 108 loss: 7.4561 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.39 mfu: 46.28%
526
+ [titan] 2026-01-06 21:46:50,902 - root - INFO - lr: 4.2578e-05 gnorm: 8.69 [ 1:29:34<1 day, 16:58:28]
527
+ [titan] 2026-01-06 21:47:32,443 - root - INFO - step: 109 loss: 7.3967 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.34 mfu: 46.26%
528
+ [titan] 2026-01-06 21:47:32,443 - root - INFO - lr: 4.2969e-05 gnorm: 7.31 [ 1:30:16<1 day, 16:53:54]
529
+ [titan] 2026-01-06 21:48:13,976 - root - INFO - step: 110 loss: 7.4334 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.37 mfu: 46.27%
530
+ [titan] 2026-01-06 21:48:13,976 - root - INFO - lr: 4.3359e-05 gnorm: 25.38 [ 1:30:57<1 day, 16:49:25]
531
+ [titan] 2026-01-06 21:48:55,511 - root - INFO - step: 111 loss: 7.4360 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.36 mfu: 46.27%
532
+ [titan] 2026-01-06 21:48:55,511 - root - INFO - lr: 4.3750e-05 gnorm: 10.44 [ 1:31:39<1 day, 16:45:00]
533
+ [titan] 2026-01-06 21:49:37,059 - root - INFO - step: 112 loss: 7.5123 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.31 mfu: 46.25%
534
+ [titan] 2026-01-06 21:49:37,060 - root - INFO - lr: 4.4141e-05 gnorm: 16.88 [ 1:32:20<1 day, 16:40:39]
535
+ [titan] 2026-01-06 21:49:37,060 - root - INFO - Saving the checkpoint (or staging if async is enabled).
536
+ [titan] 2026-01-06 21:49:59,579 - root - INFO - [GC] GC collection invoked by checkpointer. 0.15 seconds.
537
+ [titan] 2026-01-06 21:49:59,579 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 22.52 seconds.
538
+ [titan] 2026-01-06 21:50:40,891 - root - INFO - step: 113 loss: 7.4803 memory: 71.94GiB(90.77%) tps: 1,027 tflops: 93.94 mfu: 30.11%
539
+ [titan] 2026-01-06 21:50:40,892 - root - INFO - lr: 4.4531e-05 gnorm: 13.06 [ 1:33:24<1 day, 16:46:05]
540
+ [titan] 2026-01-06 21:51:22,305 - root - INFO - step: 114 loss: 7.4859 memory: 71.94GiB(90.77%) tps: 1,582 tflops: 144.79 mfu: 46.41%
541
+ [titan] 2026-01-06 21:51:22,305 - root - INFO - lr: 4.4922e-05 gnorm: 16.50 [ 1:34:06<1 day, 16:41:43]
542
+ [titan] 2026-01-06 21:52:03,747 - root - INFO - step: 115 loss: 7.4151 memory: 71.94GiB(90.77%) tps: 1,581 tflops: 144.69 mfu: 46.37%
543
+ [titan] 2026-01-06 21:52:03,748 - root - INFO - lr: 4.5313e-05 gnorm: 13.94 [ 1:34:47<1 day, 16:37:26]
544
+ [titan] 2026-01-06 21:52:45,252 - root - INFO - step: 116 loss: 7.3814 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.47 mfu: 46.30%
545
+ [titan] 2026-01-06 21:52:45,252 - root - INFO - lr: 4.5703e-05 gnorm: 11.69 [ 1:35:29<1 day, 16:33:14]
546
+ [titan] 2026-01-06 21:53:26,760 - root - INFO - step: 117 loss: 7.4033 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.46 mfu: 46.30%
547
+ [titan] 2026-01-06 21:53:26,760 - root - INFO - lr: 4.6094e-05 gnorm: 9.31 [ 1:36:10<1 day, 16:29:05]
548
+ [titan] 2026-01-06 21:54:08,279 - root - INFO - step: 118 loss: 7.4721 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.42 mfu: 46.29%
549
+ [titan] 2026-01-06 21:54:08,279 - root - INFO - lr: 4.6484e-05 gnorm: 20.88 [ 1:36:52<1 day, 16:25:01]
550
+ [titan] 2026-01-06 21:54:49,813 - root - INFO - step: 119 loss: 7.4258 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.37 mfu: 46.27%
551
+ [titan] 2026-01-06 21:54:49,813 - root - INFO - lr: 4.6875e-05 gnorm: 16.62 [ 1:37:33<1 day, 16:21:00]
552
+ [titan] 2026-01-06 21:55:31,360 - root - INFO - step: 120 loss: 7.3951 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.32 mfu: 46.26%
553
+ [titan] 2026-01-06 21:55:31,360 - root - INFO - lr: 4.7266e-05 gnorm: 11.38 [ 1:38:15<1 day, 16:17:03]
554
+ [titan] 2026-01-06 21:56:12,904 - root - INFO - step: 121 loss: 7.3984 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.33 mfu: 46.26%
555
+ [titan] 2026-01-06 21:56:12,904 - root - INFO - lr: 4.7656e-05 gnorm: 10.19 [ 1:38:56<1 day, 16:13:09]
556
+ [titan] 2026-01-06 21:56:54,444 - root - INFO - step: 122 loss: 7.5098 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.34 mfu: 46.26%
557
+ [titan] 2026-01-06 21:56:54,444 - root - INFO - lr: 4.8047e-05 gnorm: 19.38 [ 1:39:38<1 day, 16:09:18]
558
+ [titan] 2026-01-06 21:57:35,983 - root - INFO - step: 123 loss: 7.4071 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.35 mfu: 46.27%
559
+ [titan] 2026-01-06 21:57:35,983 - root - INFO - lr: 4.8438e-05 gnorm: 13.25 [ 1:40:19<1 day, 16:05:30]
560
+ [titan] 2026-01-06 21:58:17,525 - root - INFO - step: 124 loss: 7.4271 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.34 mfu: 46.26%
561
+ [titan] 2026-01-06 21:58:17,525 - root - INFO - lr: 4.8828e-05 gnorm: 11.88 [ 1:41:01<1 day, 16:01:45]
562
+ [titan] 2026-01-06 21:58:59,075 - root - INFO - step: 125 loss: 7.3603 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.31 mfu: 46.25%
563
+ [titan] 2026-01-06 21:58:59,075 - root - INFO - lr: 4.9219e-05 gnorm: 11.50 [ 1:41:42<1 day, 15:58:03]
564
+ [titan] 2026-01-06 21:59:40,619 - root - INFO - step: 126 loss: 7.3625 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.33 mfu: 46.26%
565
+ [titan] 2026-01-06 21:59:40,619 - root - INFO - lr: 4.9609e-05 gnorm: 9.88 [ 1:42:24<1 day, 15:54:24]
566
+ [titan] 2026-01-06 22:00:22,155 - root - INFO - step: 127 loss: 7.3691 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.36 mfu: 46.27%
567
+ [titan] 2026-01-06 22:00:22,156 - root - INFO - lr: 5.0000e-05 gnorm: 11.88 [ 1:43:06<1 day, 15:50:47]
568
+ [titan] 2026-01-06 22:01:03,694 - root - INFO - step: 128 loss: 7.3331 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.35 mfu: 46.27%
569
+ [titan] 2026-01-06 22:01:03,694 - root - INFO - lr: 5.0391e-05 gnorm: 11.56 [ 1:43:47<1 day, 15:47:14]
570
+ [titan] 2026-01-06 22:01:03,694 - root - INFO - Saving the checkpoint (or staging if async is enabled).
571
+ [titan] 2026-01-06 22:01:24,077 - root - INFO - [GC] GC collection invoked by checkpointer. 0.21 seconds.
572
+ [titan] 2026-01-06 22:01:24,078 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 20.38 seconds.
573
+ [titan] 2026-01-06 22:02:05,453 - root - INFO - step: 129 loss: 7.2878 memory: 71.94GiB(90.77%) tps: 1,061 tflops: 97.09 mfu: 31.12%
574
+ [titan] 2026-01-06 22:02:05,454 - root - INFO - lr: 5.0781e-05 gnorm: 6.16 [ 1:44:49<1 day, 15:51:24]
575
+ [titan] 2026-01-06 22:02:46,875 - root - INFO - step: 130 loss: 7.7017 memory: 71.94GiB(90.77%) tps: 1,582 tflops: 144.76 mfu: 46.40%
576
+ [titan] 2026-01-06 22:02:46,876 - root - INFO - lr: 5.1172e-05 gnorm: 70.00 [ 1:45:30<1 day, 15:47:49]
577
+ [titan] 2026-01-06 22:03:28,339 - root - INFO - step: 131 loss: 7.5220 memory: 71.94GiB(90.77%) tps: 1,581 tflops: 144.61 mfu: 46.35%
578
+ [titan] 2026-01-06 22:03:28,339 - root - INFO - lr: 5.1562e-05 gnorm: 44.75 [ 1:46:12<1 day, 15:44:18]
579
+ [titan] 2026-01-06 22:04:09,858 - root - INFO - step: 132 loss: 7.4566 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.42 mfu: 46.29%
580
+ [titan] 2026-01-06 22:04:09,859 - root - INFO - lr: 5.1953e-05 gnorm: 13.50 [ 1:46:53<1 day, 15:40:51]
581
+ [titan] 2026-01-06 22:04:51,387 - root - INFO - step: 133 loss: 7.4026 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.38 mfu: 46.28%
582
+ [titan] 2026-01-06 22:04:51,387 - root - INFO - lr: 5.2344e-05 gnorm: 10.12 [ 1:47:35<1 day, 15:37:26]
583
+ [titan] 2026-01-06 22:05:32,919 - root - INFO - step: 134 loss: 7.4092 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.37 mfu: 46.27%
584
+ [titan] 2026-01-06 22:05:32,920 - root - INFO - lr: 5.2734e-05 gnorm: 14.88 [ 1:48:16<1 day, 15:34:04]
585
+ [titan] 2026-01-06 22:06:14,471 - root - INFO - step: 135 loss: 7.3827 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.30 mfu: 46.25%
586
+ [titan] 2026-01-06 22:06:14,471 - root - INFO - lr: 5.3125e-05 gnorm: 18.88 [ 1:48:58<1 day, 15:30:45]
587
+ [titan] 2026-01-06 22:06:56,027 - root - INFO - step: 136 loss: 7.4021 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.29 mfu: 46.25%
588
+ [titan] 2026-01-06 22:06:56,027 - root - INFO - lr: 5.3516e-05 gnorm: 12.81 [ 1:49:39<1 day, 15:27:28]
589
+ [titan] 2026-01-06 22:07:37,581 - root - INFO - step: 137 loss: 7.4064 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.30 mfu: 46.25%
590
+ [titan] 2026-01-06 22:07:37,581 - root - INFO - lr: 5.3906e-05 gnorm: 7.19 [ 1:50:21<1 day, 15:24:13]
591
+ [titan] 2026-01-06 22:08:19,129 - root - INFO - step: 138 loss: 7.4774 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.32 mfu: 46.26%
592
+ [titan] 2026-01-06 22:08:19,129 - root - INFO - lr: 5.4297e-05 gnorm: 22.62 [ 1:51:03<1 day, 15:21:01]
593
+ [titan] 2026-01-06 22:09:00,687 - root - INFO - step: 139 loss: 7.4281 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.28 mfu: 46.24%
594
+ [titan] 2026-01-06 22:09:00,688 - root - INFO - lr: 5.4688e-05 gnorm: 11.00 [ 1:51:44<1 day, 15:17:51]
595
+ [titan] 2026-01-06 22:09:42,228 - root - INFO - step: 140 loss: 7.5633 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.34 mfu: 46.26%
596
+ [titan] 2026-01-06 22:09:42,228 - root - INFO - lr: 5.5078e-05 gnorm: 19.75 [ 1:52:26<1 day, 15:14:42]
597
+ [titan] 2026-01-06 22:10:23,790 - root - INFO - step: 141 loss: 7.5423 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.27 mfu: 46.24%
598
+ [titan] 2026-01-06 22:10:23,790 - root - INFO - lr: 5.5469e-05 gnorm: 17.25 [ 1:53:07<1 day, 15:11:36]
599
+ [titan] 2026-01-06 22:11:05,349 - root - INFO - step: 142 loss: 7.4047 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.28 mfu: 46.24%
600
+ [titan] 2026-01-06 22:11:05,349 - root - INFO - lr: 5.5859e-05 gnorm: 9.94 [ 1:53:49<1 day, 15:08:32]
601
+ [titan] 2026-01-06 22:11:46,904 - root - INFO - step: 143 loss: 7.5261 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.29 mfu: 46.25%
602
+ [titan] 2026-01-06 22:11:46,905 - root - INFO - lr: 5.6250e-05 gnorm: 25.75 [ 1:54:30<1 day, 15:05:30]
603
+ [titan] 2026-01-06 22:12:28,460 - root - INFO - step: 144 loss: 7.4217 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.29 mfu: 46.25%
604
+ [titan] 2026-01-06 22:12:28,461 - root - INFO - lr: 5.6641e-05 gnorm: 18.00 [ 1:55:12<1 day, 15:02:30]
605
+ [titan] 2026-01-06 22:12:28,461 - root - INFO - Saving the checkpoint (or staging if async is enabled).
606
+ [titan] 2026-01-06 22:12:50,165 - root - INFO - [GC] GC collection invoked by checkpointer. 0.18 seconds.
607
+ [titan] 2026-01-06 22:12:50,166 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 21.70 seconds.
608
+ [titan] 2026-01-06 22:13:31,510 - root - INFO - step: 145 loss: 7.3958 memory: 71.94GiB(90.77%) tps: 1,039 tflops: 95.10 mfu: 30.48%
609
+ [titan] 2026-01-06 22:13:31,510 - root - INFO - lr: 5.7031e-05 gnorm: 11.69 [ 1:56:15<1 day, 15:06:46]
610
+ [titan] 2026-01-06 22:14:12,944 - root - INFO - step: 146 loss: 7.4073 memory: 71.94GiB(90.77%) tps: 1,582 tflops: 144.71 mfu: 46.38%
611
+ [titan] 2026-01-06 22:14:12,944 - root - INFO - lr: 5.7422e-05 gnorm: 11.25 [ 1:56:56<1 day, 15:03:44]
612
+ [titan] 2026-01-06 22:14:54,370 - root - INFO - step: 147 loss: 7.3301 memory: 71.94GiB(90.77%) tps: 1,582 tflops: 144.74 mfu: 46.39%
613
+ [titan] 2026-01-06 22:14:54,371 - root - INFO - lr: 5.7813e-05 gnorm: 7.34 [ 1:57:38<1 day, 15:00:44]
614
+ [titan] 2026-01-06 22:15:35,825 - root - INFO - step: 148 loss: 7.3624 memory: 71.94GiB(90.77%) tps: 1,581 tflops: 144.64 mfu: 46.36%
615
+ [titan] 2026-01-06 22:15:35,825 - root - INFO - lr: 5.8203e-05 gnorm: 17.38 [ 1:58:19<1 day, 14:57:47]
616
+ [titan] 2026-01-06 22:16:17,356 - root - INFO - step: 149 loss: 7.2913 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.37 mfu: 46.27%
617
+ [titan] 2026-01-06 22:16:17,357 - root - INFO - lr: 5.8594e-05 gnorm: 3.80 [ 1:59:01<1 day, 14:54:52]
618
+ [titan] 2026-01-06 22:16:17,387 - root - INFO - [GC] Peforming periodical GC collection. 0.03 seconds.
619
+ [titan] 2026-01-06 22:16:58,923 - root - INFO - step: 150 loss: 7.3146 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.25 mfu: 46.23%
620
+ [titan] 2026-01-06 22:16:58,923 - root - INFO - lr: 5.8984e-05 gnorm: 7.06 [ 1:59:42<1 day, 14:52:00]
logs/none_4cvjdbqa/attempt_0/6/stdout.log ADDED
File without changes
logs/none_4cvjdbqa/attempt_0/7/stderr.log ADDED
@@ -0,0 +1,620 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [titan] 2026-01-06 20:23:28,613 - root - INFO - Starting job: default job
2
+ [titan] 2026-01-06 20:23:28,613 - root - INFO - {
3
+ "activation_checkpoint": {
4
+ "mode": "none",
5
+ "selective_ac_option": "2"
6
+ },
7
+ "activation_offload": {
8
+ "mode": "none"
9
+ },
10
+ "checkpoint": {
11
+ "async_mode": "disabled",
12
+ "create_seed_checkpoint": false,
13
+ "enable_checkpoint": true,
14
+ "exclude_from_loading": [],
15
+ "export_dtype": "float32",
16
+ "folder": "checkpoint",
17
+ "initial_load_model_weights_only": true,
18
+ "initial_load_path": null,
19
+ "interval": 16,
20
+ "interval_type": "steps",
21
+ "keep_latest_k": 0,
22
+ "last_save_model_weights_only": false,
23
+ "load_step": -1,
24
+ "model_weights_only": false
25
+ },
26
+ "comm": {
27
+ "init_timeout_seconds": 300,
28
+ "trace_buf_size": 20000,
29
+ "train_timeout_seconds": 100
30
+ },
31
+ "experimental": {
32
+ "context_parallel_degree": 1,
33
+ "context_parallel_rotate_method": "allgather",
34
+ "custom_model_path": "",
35
+ "enable_async_tensor_parallel": false,
36
+ "enable_compiled_autograd": false,
37
+ "pipeline_parallel_degree": 1,
38
+ "pipeline_parallel_microbatches": null,
39
+ "pipeline_parallel_schedule": "1F1B",
40
+ "pipeline_parallel_schedule_csv": "",
41
+ "pipeline_parallel_split_points": []
42
+ },
43
+ "fault_tolerance": {
44
+ "enable": false,
45
+ "group_size": 0,
46
+ "min_replica_size": 1,
47
+ "replica_id": 0
48
+ },
49
+ "float8": {
50
+ "enable_fsdp_float8_all_gather": false,
51
+ "force_recompute_fp8_weight_in_bwd": false,
52
+ "precompute_float8_dynamic_scale_for_fsdp": false,
53
+ "recipe_name": null
54
+ },
55
+ "job": {
56
+ "config_file": "flame/models/fla.toml",
57
+ "description": "default job",
58
+ "dump_folder": "/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B",
59
+ "print_args": true,
60
+ "use_for_integration_test": false
61
+ },
62
+ "lr_scheduler": {
63
+ "decay_ratio": null,
64
+ "decay_type": "cosine",
65
+ "lr_min": 0.1,
66
+ "warmup_steps": 1024
67
+ },
68
+ "memory_estimation": {
69
+ "disable_fake_mode": false,
70
+ "enabled": false
71
+ },
72
+ "metrics": {
73
+ "disable_color_printing": false,
74
+ "enable_tensorboard": false,
75
+ "enable_wandb": true,
76
+ "log_freq": 1,
77
+ "save_for_all_ranks": false,
78
+ "save_tb_folder": "tb"
79
+ },
80
+ "model": {
81
+ "config": "/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/config.json",
82
+ "converters": [],
83
+ "name": "fla",
84
+ "print_after_conversion": false,
85
+ "tokenizer_path": "/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B"
86
+ },
87
+ "optimizer": {
88
+ "beta1": 0.9,
89
+ "beta2": 0.95,
90
+ "early_step_in_backward": false,
91
+ "eps": 1e-15,
92
+ "implementation": "fused",
93
+ "lr": 0.0004,
94
+ "name": "AdamW",
95
+ "weight_decay": 0.1
96
+ },
97
+ "profiling": {
98
+ "enable_memory_snapshot": false,
99
+ "enable_profiling": true,
100
+ "profile_freq": 512,
101
+ "save_memory_snapshot_folder": "memory_snapshot",
102
+ "save_traces_folder": "profile_trace"
103
+ },
104
+ "training": {
105
+ "batch_size": 2,
106
+ "compile": true,
107
+ "context_len": 2048,
108
+ "data_dir": null,
109
+ "data_files": null,
110
+ "data_parallel_replicate_degree": 1,
111
+ "data_parallel_shard_degree": 8,
112
+ "data_probs": null,
113
+ "dataset": "/mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu",
114
+ "dataset_name": "default",
115
+ "dataset_split": "train",
116
+ "deterministic": false,
117
+ "disable_loss_parallel": true,
118
+ "enable_cpu_offload": false,
119
+ "fsdp_reshard_after_forward": "default",
120
+ "gc_freq": 50,
121
+ "gradient_accumulation_steps": 16,
122
+ "max_norm": 1.0,
123
+ "mixed_precision_param": "bfloat16",
124
+ "mixed_precision_reduce": "float32",
125
+ "num_workers": 8,
126
+ "persistent_workers": false,
127
+ "pin_memory": false,
128
+ "prefetch_factor": 2,
129
+ "seed": 42,
130
+ "seq_len": 2048,
131
+ "skip_nan_inf": true,
132
+ "steps": 3072,
133
+ "streaming": true,
134
+ "tensor_parallel_degree": 1,
135
+ "varlen": false
136
+ }
137
+ }
138
+ [titan] 2026-01-06 20:23:28,614 - root - INFO - [GC] Initial GC collection. 0.00 seconds.
139
+ [titan] 2026-01-06 20:23:29,962 - root - WARNING - ENV[TORCH_NCCL_ASYNC_ERROR_HANDLING] = 1 will be overridden to 3 based on job config
140
+ [titan] 2026-01-06 20:23:29,965 - root - INFO - CUDA capacity: NVIDIA A100-SXM4-80GB with 79.25GiB memory
141
+ [titan] 2026-01-06 20:23:29,967 - root - WARNING - Error running lspci: [Errno 2] No such file or directory: 'lspci', fallback to use device_name
142
+ [titan] 2026-01-06 20:23:29,967 - root - INFO - Peak FLOPS used for computing MFU: 3.120e+14
143
+ [titan] 2026-01-06 20:23:29,967 - root - INFO - Building 1-D device mesh with ['dp_shard'], [8]
144
+ [titan] 2026-01-06 20:23:30,051 - root - INFO - Loading tokenizer...
145
+ The tokenizer you are loading from '/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.
146
+ [titan] 2026-01-06 20:23:30,424 - root - INFO - Qwen2TokenizerFast(name_or_path='/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B', vocab_size=151643, model_max_length=10000000000, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'eos_token': '<|im_end|>', 'pad_token': '<|endoftext|>', 'additional_special_tokens': ['<|im_start|>', '<|im_end|>', '<|object_ref_start|>', '<|object_ref_end|>', '<|box_start|>', '<|box_end|>', '<|quad_start|>', '<|quad_end|>', '<|vision_start|>', '<|vision_end|>', '<|vision_pad|>', '<|image_pad|>', '<|video_pad|>']}, clean_up_tokenization_spaces=False, added_tokens_decoder={
147
+ 151643: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
148
+ 151644: AddedToken("<|im_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
149
+ 151645: AddedToken("<|im_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
150
+ 151646: AddedToken("<|object_ref_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
151
+ 151647: AddedToken("<|object_ref_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
152
+ 151648: AddedToken("<|box_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
153
+ 151649: AddedToken("<|box_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
154
+ 151650: AddedToken("<|quad_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
155
+ 151651: AddedToken("<|quad_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
156
+ 151652: AddedToken("<|vision_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
157
+ 151653: AddedToken("<|vision_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
158
+ 151654: AddedToken("<|vision_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
159
+ 151655: AddedToken("<|image_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
160
+ 151656: AddedToken("<|video_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
161
+ 151657: AddedToken("<tool_call>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
162
+ 151658: AddedToken("</tool_call>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
163
+ 151659: AddedToken("<|fim_prefix|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
164
+ 151660: AddedToken("<|fim_middle|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
165
+ 151661: AddedToken("<|fim_suffix|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
166
+ 151662: AddedToken("<|fim_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
167
+ 151663: AddedToken("<|repo_name|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
168
+ 151664: AddedToken("<|file_sep|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
169
+ 151665: AddedToken("<tool_response>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
170
+ 151666: AddedToken("</tool_response>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
171
+ 151667: AddedToken("<think>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
172
+ 151668: AddedToken("</think>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
173
+ }
174
+ )
175
+ [titan] 2026-01-06 20:23:30,424 - root - INFO - Loading dataset /mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu:default
176
+ `trust_remote_code` is not supported anymore.
177
+ Please check that the Hugging Face dataset '/mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu' isn't based on a loading script and remove `trust_remote_code`.
178
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
179
+ [titan] 2026-01-06 20:23:30,424 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
180
+ Please check that the Hugging Face dataset '/mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu' isn't based on a loading script and remove `trust_remote_code`.
181
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
182
+ [titan] 2026-01-06 20:23:30,977 - root - INFO - Shuffling the dataset with seed 42
183
+ [titan] 2026-01-06 20:23:30,978 - root - INFO - IterableDataset({
184
+ features: ['text', 'id', 'dump', 'url', 'file_path', 'language', 'language_score', 'token_count', 'score', 'int_score'],
185
+ num_shards: 360
186
+ })
187
+ [titan] 2026-01-06 20:23:30,978 - root - INFO - Building dataloader...
188
+ [titan] 2026-01-06 20:23:30,980 - root - INFO - Loading model config from /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/config.json
189
+ [titan] 2026-01-06 20:23:30,981 - root - INFO - Building model from the config
190
+ GSAConfig {
191
+ "architectures": [
192
+ "GSAForCausalLM"
193
+ ],
194
+ "attn": null,
195
+ "bos_token_id": 151643,
196
+ "clamp_max": null,
197
+ "clamp_min": null,
198
+ "conv_size": 4,
199
+ "dtype": "bfloat16",
200
+ "elementwise_affine": false,
201
+ "eos_token_id": 151645,
202
+ "expand_k": 1,
203
+ "expand_v": 1,
204
+ "feature_map": "swish",
205
+ "fuse_cross_entropy": true,
206
+ "fuse_linear_cross_entropy": false,
207
+ "fuse_norm": true,
208
+ "fuse_swiglu": true,
209
+ "gate_logit_normalizer": 8,
210
+ "hidden_act": "swish",
211
+ "hidden_ratio": 4,
212
+ "hidden_size": 5120,
213
+ "initializer_range": 0.02,
214
+ "intermediate_size": 17408,
215
+ "max_position_embeddings": 40960,
216
+ "model_type": "gsa",
217
+ "norm_eps": 1e-06,
218
+ "num_heads": 40,
219
+ "num_hidden_layers": 40,
220
+ "num_kv_heads": 8,
221
+ "num_slots": 256,
222
+ "rope_theta": 1000000,
223
+ "share_conv_kernel": true,
224
+ "tie_word_embeddings": true,
225
+ "transformers_version": "4.57.3",
226
+ "use_cache": true,
227
+ "use_l2warp": false,
228
+ "use_norm": true,
229
+ "use_output_gate": true,
230
+ "use_rope": false,
231
+ "use_short_conv": false,
232
+ "vocab_size": 151936
233
+ }
234
+ 
235
+ [titan] 2026-01-06 20:23:31,128 - root - INFO - 
236
+ GSAForCausalLM(
237
+ (model): GSAModel(
238
+ (embeddings): Embedding(151936, 5120)
239
+ (layers): ModuleList(
240
+ (0-39): 40 x GSABlock(
241
+ (attn_norm): RMSNorm(5120, eps=1e-06)
242
+ (attn): GatedSlotAttention(
243
+ (feature_map): SwishFeatureMap()
244
+ (q_proj): Linear(in_features=5120, out_features=5120, bias=False)
245
+ (k_proj): Linear(in_features=5120, out_features=1024, bias=False)
246
+ (v_proj): Linear(in_features=5120, out_features=1024, bias=False)
247
+ (f_proj): Linear(in_features=5120, out_features=2048, bias=False)
248
+ (g_norm): RMSNorm(5120, elementwise_affine=False, eps=1e-06)
249
+ (o_proj): Linear(in_features=5120, out_features=5120, bias=False)
250
+ )
251
+ (mlp_norm): RMSNorm(5120, eps=1e-06)
252
+ (mlp): GatedMLP(
253
+ (gate_proj): Linear(in_features=5120, out_features=17408, bias=False)
254
+ (up_proj): Linear(in_features=5120, out_features=17408, bias=False)
255
+ (down_proj): Linear(in_features=17408, out_features=5120, bias=False)
256
+ (swiglu_linear): SwiGLULinear()
257
+ )
258
+ )
259
+ )
260
+ (norm): RMSNorm(5120, eps=1e-06)
261
+ )
262
+ (lm_head): Linear(in_features=5120, out_features=151936, bias=False)
263
+ )
264
+
265
+ [titan] 2026-01-06 20:23:31,189 - root - INFO - Compiling each block with torch.compile
266
+ [titan] 2026-01-06 20:23:31,189 - root - INFO - Compiling the embedding, norm, and lm_head layers with torch.compile
267
+ [titan] 2026-01-06 20:23:31,190 - root - INFO - Compiling the entire model with torch.compile
268
+ [titan] 2026-01-06 20:23:31,333 - root - INFO - Applied FSDP to the model
269
+ [titan] 2026-01-06 20:23:31,724 - root - INFO - CUDA memory usage for model: 3.56GiB(4.49%)
270
+ [titan] 2026-01-06 20:23:31,773 - root - INFO - Checkpointing active. Checkpoints will be loaded from and saved to /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/checkpoint
271
+ [titan] 2026-01-06 20:23:31,774 - root - INFO - Loading the checkpoint from /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/checkpoint/step-1.
272
+ [titan] 2026-01-06 20:24:20,476 - root - INFO - [GC] GC collection for checkpoint loading. 0.03 seconds.
273
+ [titan] 2026-01-06 20:24:20,476 - root - INFO - Finished loading the checkpoint in 48.70 seconds.
274
+ [titan] 2026-01-06 20:24:20,701 - root - INFO - CUDA capacity: NVIDIA A100-SXM4-80GB with 79.25GiB memory
275
+ [titan] 2026-01-06 20:24:20,704 - root - WARNING - Error running lspci: [Errno 2] No such file or directory: 'lspci', fallback to use device_name
276
+ [titan] 2026-01-06 20:24:23,285 - root - INFO - Mixed precision training is handled by fully_shard
277
+ [titan] 2026-01-06 20:24:23,286 - root - INFO - ***** Running training *****
278
+ [titan] 2026-01-06 20:24:23,286 - root - INFO -  Training starts at step 2
279
+ [titan] 2026-01-06 20:24:23,286 - root - INFO -  Number of tokens per sequence = 2,048
280
+ [titan] 2026-01-06 20:24:23,286 - root - INFO -  Gradient Accumulation steps = 16
281
+ [titan] 2026-01-06 20:24:23,286 - root - INFO -  Instantaneous batch size (per device) = 2
282
+ [titan] 2026-01-06 20:24:23,286 - root - INFO -  Global batch size (w. parallel, distributed & accumulation) = 256 (524,288 tokens)
283
+ [titan] 2026-01-06 20:24:23,286 - root - INFO -  Total optimization steps = 3,072 (1,610,612,736 tokens)
284
+ [titan] 2026-01-06 20:24:23,286 - root - INFO -  Warmup steps = 1,024 (536,870,912 tokens)
285
+ [titan] 2026-01-06 20:24:23,286 - root - INFO -  Number of parameters = 14,409,815,040 
286
+ [titan] 2026-01-06 20:24:23,286 - root - INFO - Profiling active. Traces will be saved at /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/profile_trace
287
+ /mnt/scratch/home/carlos/leman-labs/flame/.venv/lib/python3.13/site-packages/torch/_dynamo/variables/functions.py:1692: UserWarning: Dynamo detected a call to a `functools.lru_cache`-wrapped function. Dynamo ignores the cache wrapper and directly traces the wrapped function. Silent incorrectness is only a *potential* risk, not something we have observed. Enable TORCH_LOGS="+dynamo" for a DEBUG stack trace.
288
+ torch._dynamo.utils.warn_once(msg)
289
+ /mnt/scratch/home/carlos/leman-labs/flame/.venv/lib/python3.13/site-packages/torch/_dynamo/variables/functions.py:1598: UserWarning: Dynamo does not know how to trace the builtin `cuda_utils.get_device_properties.` This function is either a Python builtin (e.g. _warnings.warn) or a third-party C/C++ Python extension (perhaps created with pybind).
290
+ If it is a Python builtin, please file an issue on GitHub so the PyTorch team can add support for it and see the next case for a workaround.
291
+ If it is a third-party C/C++ Python extension, please either wrap it into a PyTorch-understood custom operator (see https://pytorch.org/tutorials/advanced/custom_ops_landing_page.html for more details) or, if it is traceable, use `torch.compiler.allow_in_graph`.
292
+ torch._dynamo.utils.warn_once(explanation + "\n" + "\n".join(hints))
293
+ [titan] 2026-01-06 20:31:17,558 - root - INFO - step: 2 loss: 14.3989 memory: 71.94GiB(90.77%) tps: 157 tflops: 14.38 mfu: 4.61%
294
+ [titan] 2026-01-06 20:31:17,558 - root - INFO - lr: 1.1719e-06 gnorm: 127.00 [ 0:14:01<14 days, 22:48:29]
295
+ [titan] 2026-01-06 20:31:58,854 - root - INFO - step: 3 loss: 14.3925 memory: 71.94GiB(90.77%) tps: 1,587 tflops: 145.20 mfu: 46.54%
296
+ [titan] 2026-01-06 20:31:58,854 - root - INFO - lr: 1.5625e-06 gnorm: 126.00 [ 0:14:42<10 days, 10:51:44]
297
+ [titan] 2026-01-06 20:32:40,204 - root - INFO - step: 4 loss: 14.2932 memory: 71.94GiB(90.77%) tps: 1,585 tflops: 145.01 mfu: 46.48%
298
+ [titan] 2026-01-06 20:32:40,205 - root - INFO - lr: 1.9531e-06 gnorm: 125.50 [ 0:15:24<8 days, 4:53:43]
299
+ [titan] 2026-01-06 20:33:21,589 - root - INFO - step: 5 loss: 14.2679 memory: 71.94GiB(90.77%) tps: 1,584 tflops: 144.89 mfu: 46.44%
300
+ [titan] 2026-01-06 20:33:21,590 - root - INFO - lr: 2.3438e-06 gnorm: 123.50 [ 0:16:05<6 days, 20:30:59]
301
+ [titan] 2026-01-06 20:34:03,035 - root - INFO - step: 6 loss: 13.9921 memory: 71.94GiB(90.77%) tps: 1,581 tflops: 144.67 mfu: 46.37%
302
+ [titan] 2026-01-06 20:34:03,035 - root - INFO - lr: 2.7344e-06 gnorm: 117.50 [ 0:16:46<5 days, 22:56:06]
303
+ [titan] 2026-01-06 20:34:44,524 - root - INFO - step: 7 loss: 13.8102 memory: 71.94GiB(90.77%) tps: 1,580 tflops: 144.52 mfu: 46.32%
304
+ [titan] 2026-01-06 20:34:44,524 - root - INFO - lr: 3.1250e-06 gnorm: 112.50 [ 0:17:28<5 days, 7:31:19]
305
+ [titan] 2026-01-06 20:35:25,989 - root - INFO - step: 8 loss: 13.5609 memory: 71.94GiB(90.77%) tps: 1,581 tflops: 144.60 mfu: 46.35%
306
+ [titan] 2026-01-06 20:35:25,990 - root - INFO - lr: 3.5156e-06 gnorm: 106.50 [ 0:18:09<4 days, 19:57:24]
307
+ [titan] 2026-01-06 20:36:07,480 - root - INFO - step: 9 loss: 13.3683 memory: 71.94GiB(90.77%) tps: 1,580 tflops: 144.52 mfu: 46.32%
308
+ [titan] 2026-01-06 20:36:07,480 - root - INFO - lr: 3.9063e-06 gnorm: 101.00 [ 0:18:51<4 days, 10:57:41]
309
+ [titan] 2026-01-06 20:36:48,975 - root - INFO - step: 10 loss: 13.1018 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.50 mfu: 46.32%
310
+ [titan] 2026-01-06 20:36:48,975 - root - INFO - lr: 4.2969e-06 gnorm: 94.00 [ 0:19:32<4 days, 3:45:47]
311
+ [titan] 2026-01-06 20:37:30,471 - root - INFO - step: 11 loss: 12.5407 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.50 mfu: 46.31%
312
+ [titan] 2026-01-06 20:37:30,471 - root - INFO - lr: 4.6875e-06 gnorm: 82.00 [ 0:20:14<3 days, 21:52:18]
313
+ [titan] 2026-01-06 20:38:11,960 - root - INFO - step: 12 loss: 12.0106 memory: 71.94GiB(90.77%) tps: 1,580 tflops: 144.52 mfu: 46.32%
314
+ [titan] 2026-01-06 20:38:11,961 - root - INFO - lr: 5.0781e-06 gnorm: 71.50 [ 0:20:55<3 days, 16:57:35]
315
+ [titan] 2026-01-06 20:38:53,463 - root - INFO - step: 13 loss: 11.5957 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.48 mfu: 46.31%
316
+ [titan] 2026-01-06 20:38:53,463 - root - INFO - lr: 5.4687e-06 gnorm: 68.00 [ 0:21:37<3 days, 12:48:09]
317
+ [titan] 2026-01-06 20:39:34,955 - root - INFO - step: 14 loss: 11.2380 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.51 mfu: 46.32%
318
+ [titan] 2026-01-06 20:39:34,955 - root - INFO - lr: 5.8594e-06 gnorm: 63.25 [ 0:22:18<3 days, 9:14:13]
319
+ [titan] 2026-01-06 20:40:16,456 - root - INFO - step: 15 loss: 10.9153 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.48 mfu: 46.31%
320
+ [titan] 2026-01-06 20:40:16,457 - root - INFO - lr: 6.2500e-06 gnorm: 55.50 [ 0:23:00<3 days, 6:08:45]
321
+ [titan] 2026-01-06 20:40:57,974 - root - INFO - step: 16 loss: 10.6864 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.42 mfu: 46.29%
322
+ [titan] 2026-01-06 20:40:57,974 - root - INFO - lr: 6.6406e-06 gnorm: 57.00 [ 0:23:41<3 days, 3:26:25]
323
+ [titan] 2026-01-06 20:40:57,974 - root - INFO - Saving the checkpoint (or staging if async is enabled).
324
+ [titan] 2026-01-06 20:41:27,201 - root - INFO - [GC] GC collection invoked by checkpointer. 0.76 seconds.
325
+ [titan] 2026-01-06 20:41:27,201 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 29.23 seconds.
326
+ [titan] 2026-01-06 20:42:08,985 - root - INFO - step: 17 loss: 10.3828 memory: 71.94GiB(90.77%) tps: 923 tflops: 84.44 mfu: 27.06%
327
+ [titan] 2026-01-06 20:42:08,986 - root - INFO - lr: 7.0313e-06 gnorm: 42.50 [ 0:24:52<3 days, 2:31:27]
328
+ [titan] 2026-01-06 20:42:50,422 - root - INFO - step: 18 loss: 10.1659 memory: 71.94GiB(90.77%) tps: 1,582 tflops: 144.71 mfu: 46.38%
329
+ [titan] 2026-01-06 20:42:50,422 - root - INFO - lr: 7.4219e-06 gnorm: 32.50 [ 0:25:34<3 days, 0:18:50]
330
+ [titan] 2026-01-06 20:43:31,924 - root - INFO - step: 19 loss: 9.9749 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.47 mfu: 46.31%
331
+ [titan] 2026-01-06 20:43:31,924 - root - INFO - lr: 7.8125e-06 gnorm: 26.88 [ 0:26:15<2 days, 22:20:16]
332
+ [titan] 2026-01-06 20:44:13,451 - root - INFO - step: 20 loss: 9.8084 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.39 mfu: 46.28%
333
+ [titan] 2026-01-06 20:44:13,451 - root - INFO - lr: 8.2031e-06 gnorm: 25.62 [ 0:26:57<2 days, 20:33:33]
334
+ [titan] 2026-01-06 20:44:54,968 - root - INFO - step: 21 loss: 9.6201 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.43 mfu: 46.29%
335
+ [titan] 2026-01-06 20:44:54,968 - root - INFO - lr: 8.5938e-06 gnorm: 26.88 [ 0:27:38<2 days, 18:56:55]
336
+ [titan] 2026-01-06 20:45:36,491 - root - INFO - step: 22 loss: 9.4905 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.40 mfu: 46.28%
337
+ [titan] 2026-01-06 20:45:36,491 - root - INFO - lr: 8.9844e-06 gnorm: 25.50 [ 0:28:20<2 days, 17:29:01]
338
+ [titan] 2026-01-06 20:46:18,035 - root - INFO - step: 23 loss: 9.2526 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.33 mfu: 46.26%
339
+ [titan] 2026-01-06 20:46:18,035 - root - INFO - lr: 9.3750e-06 gnorm: 19.12 [ 0:29:01<2 days, 16:08:45]
340
+ [titan] 2026-01-06 20:46:59,563 - root - INFO - step: 24 loss: 9.0528 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.39 mfu: 46.28%
341
+ [titan] 2026-01-06 20:46:59,563 - root - INFO - lr: 9.7656e-06 gnorm: 17.00 [ 0:29:43<2 days, 14:55:04]
342
+ [titan] 2026-01-06 20:47:41,099 - root - INFO - step: 25 loss: 8.8601 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.36 mfu: 46.27%
343
+ [titan] 2026-01-06 20:47:41,099 - root - INFO - lr: 1.0156e-05 gnorm: 14.06 [ 0:30:25<2 days, 13:47:15]
344
+ [titan] 2026-01-06 20:48:22,630 - root - INFO - step: 26 loss: 8.7360 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.38 mfu: 46.27%
345
+ [titan] 2026-01-06 20:48:22,630 - root - INFO - lr: 1.0547e-05 gnorm: 15.44 [ 0:31:06<2 days, 12:44:35]
346
+ [titan] 2026-01-06 20:49:04,178 - root - INFO - step: 27 loss: 8.6182 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.31 mfu: 46.25%
347
+ [titan] 2026-01-06 20:49:04,179 - root - INFO - lr: 1.0937e-05 gnorm: 10.25 [ 0:31:48<2 days, 11:46:32]
348
+ [titan] 2026-01-06 20:49:45,725 - root - INFO - step: 28 loss: 8.5142 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.32 mfu: 46.26%
349
+ [titan] 2026-01-06 20:49:45,725 - root - INFO - lr: 1.1328e-05 gnorm: 9.00 [ 0:32:29<2 days, 10:52:36]
350
+ [titan] 2026-01-06 20:50:27,274 - root - INFO - step: 29 loss: 8.4770 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.31 mfu: 46.25%
351
+ [titan] 2026-01-06 20:50:27,274 - root - INFO - lr: 1.1719e-05 gnorm: 9.44 [ 0:33:11<2 days, 10:02:19]
352
+ [titan] 2026-01-06 20:51:08,813 - root - INFO - step: 30 loss: 8.3888 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.35 mfu: 46.27%
353
+ [titan] 2026-01-06 20:51:08,813 - root - INFO - lr: 1.2109e-05 gnorm: 7.06 [ 0:33:52<2 days, 9:15:20]
354
+ [titan] 2026-01-06 20:51:50,370 - root - INFO - step: 31 loss: 8.3098 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.29 mfu: 46.25%
355
+ [titan] 2026-01-06 20:51:50,370 - root - INFO - lr: 1.2500e-05 gnorm: 5.38 [ 0:34:34<2 days, 8:31:22]
356
+ [titan] 2026-01-06 20:52:31,909 - root - INFO - step: 32 loss: 8.2507 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.35 mfu: 46.26%
357
+ [titan] 2026-01-06 20:52:31,910 - root - INFO - lr: 1.2891e-05 gnorm: 6.97 [ 0:35:15<2 days, 7:50:05]
358
+ [titan] 2026-01-06 20:52:31,910 - root - INFO - Saving the checkpoint (or staging if async is enabled).
359
+ [titan] 2026-01-06 20:52:52,213 - root - INFO - [GC] GC collection invoked by checkpointer. 0.21 seconds.
360
+ [titan] 2026-01-06 20:52:52,213 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 20.30 seconds.
361
+ [titan] 2026-01-06 20:53:33,591 - root - INFO - step: 33 loss: 8.1782 memory: 71.94GiB(90.77%) tps: 1,063 tflops: 97.21 mfu: 31.16%
362
+ [titan] 2026-01-06 20:53:33,591 - root - INFO - lr: 1.3281e-05 gnorm: 4.94 [ 0:36:17<2 days, 7:42:10]
363
+ [titan] 2026-01-06 20:54:15,059 - root - INFO - step: 34 loss: 8.1399 memory: 71.94GiB(90.77%) tps: 1,580 tflops: 144.59 mfu: 46.34%
364
+ [titan] 2026-01-06 20:54:15,059 - root - INFO - lr: 1.3672e-05 gnorm: 4.62 [ 0:36:58<2 days, 7:04:33]
365
+ [titan] 2026-01-06 20:54:56,546 - root - INFO - step: 35 loss: 8.1046 memory: 71.94GiB(90.77%) tps: 1,580 tflops: 144.53 mfu: 46.32%
366
+ [titan] 2026-01-06 20:54:56,546 - root - INFO - lr: 1.4063e-05 gnorm: 4.69 [ 0:37:40<2 days, 6:29:05]
367
+ [titan] 2026-01-06 20:55:38,070 - root - INFO - step: 36 loss: 8.0122 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.40 mfu: 46.28%
368
+ [titan] 2026-01-06 20:55:38,070 - root - INFO - lr: 1.4453e-05 gnorm: 2.75 [ 0:38:22<2 days, 5:55:35]
369
+ [titan] 2026-01-06 20:56:19,603 - root - INFO - step: 37 loss: 8.0874 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.37 mfu: 46.27%
370
+ [titan] 2026-01-06 20:56:19,603 - root - INFO - lr: 1.4844e-05 gnorm: 4.84 [ 0:39:03<2 days, 5:23:53]
371
+ [titan] 2026-01-06 20:57:01,137 - root - INFO - step: 38 loss: 8.0173 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.37 mfu: 46.27%
372
+ [titan] 2026-01-06 20:57:01,137 - root - INFO - lr: 1.5234e-05 gnorm: 3.98 [ 0:39:45<2 days, 4:53:48]
373
+ [titan] 2026-01-06 20:57:42,670 - root - INFO - step: 39 loss: 8.0002 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.37 mfu: 46.27%
374
+ [titan] 2026-01-06 20:57:42,671 - root - INFO - lr: 1.5625e-05 gnorm: 3.81 [ 0:40:26<2 days, 4:25:14]
375
+ [titan] 2026-01-06 20:58:24,204 - root - INFO - step: 40 loss: 7.9606 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.37 mfu: 46.27%
376
+ [titan] 2026-01-06 20:58:24,204 - root - INFO - lr: 1.6016e-05 gnorm: 2.86 [ 0:41:08<2 days, 3:58:04]
377
+ [titan] 2026-01-06 20:59:05,739 - root - INFO - step: 41 loss: 7.9773 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.36 mfu: 46.27%
378
+ [titan] 2026-01-06 20:59:05,739 - root - INFO - lr: 1.6406e-05 gnorm: 3.56 [ 0:41:49<2 days, 3:32:11]
379
+ [titan] 2026-01-06 20:59:47,255 - root - INFO - step: 42 loss: 7.9890 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.43 mfu: 46.29%
380
+ [titan] 2026-01-06 20:59:47,256 - root - INFO - lr: 1.6797e-05 gnorm: 4.75 [ 0:42:31<2 days, 3:07:29]
381
+ [titan] 2026-01-06 21:00:28,788 - root - INFO - step: 43 loss: 7.9018 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.37 mfu: 46.27%
382
+ [titan] 2026-01-06 21:00:28,789 - root - INFO - lr: 1.7188e-05 gnorm: 3.48 [ 0:43:12<2 days, 2:43:55]
383
+ [titan] 2026-01-06 21:01:10,328 - root - INFO - step: 44 loss: 7.8441 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.35 mfu: 46.26%
384
+ [titan] 2026-01-06 21:01:10,328 - root - INFO - lr: 1.7578e-05 gnorm: 3.89 [ 0:43:54<2 days, 2:21:24]
385
+ [titan] 2026-01-06 21:01:51,869 - root - INFO - step: 45 loss: 7.8679 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.34 mfu: 46.26%
386
+ [titan] 2026-01-06 21:01:51,869 - root - INFO - lr: 1.7969e-05 gnorm: 6.41 [ 0:44:35<2 days, 1:59:51]
387
+ [titan] 2026-01-06 21:02:33,408 - root - INFO - step: 46 loss: 7.7830 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.35 mfu: 46.26%
388
+ [titan] 2026-01-06 21:02:33,408 - root - INFO - lr: 1.8359e-05 gnorm: 3.52 [ 0:45:17<2 days, 1:39:13]
389
+ [titan] 2026-01-06 21:03:14,961 - root - INFO - step: 47 loss: 7.8372 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.30 mfu: 46.25%
390
+ [titan] 2026-01-06 21:03:14,961 - root - INFO - lr: 1.8750e-05 gnorm: 2.22 [ 0:45:58<2 days, 1:19:26]
391
+ [titan] 2026-01-06 21:03:56,497 - root - INFO - step: 48 loss: 7.8147 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.36 mfu: 46.27%
392
+ [titan] 2026-01-06 21:03:56,497 - root - INFO - lr: 1.9141e-05 gnorm: 3.70 [ 0:46:40<2 days, 1:00:26]
393
+ [titan] 2026-01-06 21:03:56,497 - root - INFO - Saving the checkpoint (or staging if async is enabled).
394
+ [titan] 2026-01-06 21:04:16,562 - root - INFO - [GC] GC collection invoked by checkpointer. 0.18 seconds.
395
+ [titan] 2026-01-06 21:04:16,562 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 20.07 seconds.
396
+ [titan] 2026-01-06 21:04:57,969 - root - INFO - step: 49 loss: 7.6970 memory: 71.94GiB(90.77%) tps: 1,066 tflops: 97.54 mfu: 31.26%
397
+ [titan] 2026-01-06 21:04:57,970 - root - INFO - lr: 1.9531e-05 gnorm: 5.28 [ 0:47:41<2 days, 1:02:41]
398
+ [titan] 2026-01-06 21:04:57,981 - root - INFO - [GC] Peforming periodical GC collection. 0.01 seconds.
399
+ [titan] 2026-01-06 21:05:39,421 - root - INFO - step: 50 loss: 7.7536 memory: 71.94GiB(90.77%) tps: 1,581 tflops: 144.65 mfu: 46.36%
400
+ [titan] 2026-01-06 21:05:39,421 - root - INFO - lr: 1.9922e-05 gnorm: 4.06 [ 0:48:23<2 days, 0:44:38]
401
+ [titan] 2026-01-06 21:06:20,891 - root - INFO - step: 51 loss: 7.7578 memory: 71.94GiB(90.77%) tps: 1,580 tflops: 144.59 mfu: 46.34%
402
+ [titan] 2026-01-06 21:06:20,891 - root - INFO - lr: 2.0313e-05 gnorm: 5.03 [ 0:49:04<2 days, 0:27:17]
403
+ [titan] 2026-01-06 21:07:02,402 - root - INFO - step: 52 loss: 7.7586 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.45 mfu: 46.30%
404
+ [titan] 2026-01-06 21:07:02,402 - root - INFO - lr: 2.0703e-05 gnorm: 2.52 [ 0:49:46<2 days, 0:10:36]
405
+ [titan] 2026-01-06 21:07:43,930 - root - INFO - step: 53 loss: 7.7823 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.38 mfu: 46.28%
406
+ [titan] 2026-01-06 21:07:43,930 - root - INFO - lr: 2.1094e-05 gnorm: 11.69 [ 0:50:27<1 day, 23:54:33]
407
+ [titan] 2026-01-06 21:08:25,460 - root - INFO - step: 54 loss: 7.7454 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.38 mfu: 46.28%
408
+ [titan] 2026-01-06 21:08:25,460 - root - INFO - lr: 2.1484e-05 gnorm: 10.25 [ 0:51:09<1 day, 23:39:04]
409
+ [titan] 2026-01-06 21:09:07,002 - root - INFO - step: 55 loss: 7.6959 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.34 mfu: 46.26%
410
+ [titan] 2026-01-06 21:09:07,002 - root - INFO - lr: 2.1875e-05 gnorm: 3.77 [ 0:51:50<1 day, 23:24:08]
411
+ [titan] 2026-01-06 21:09:48,536 - root - INFO - step: 56 loss: 7.7100 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.37 mfu: 46.27%
412
+ [titan] 2026-01-06 21:09:48,536 - root - INFO - lr: 2.2266e-05 gnorm: 5.50 [ 0:52:32<1 day, 23:09:42]
413
+ [titan] 2026-01-06 21:10:30,084 - root - INFO - step: 57 loss: 7.6427 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.32 mfu: 46.26%
414
+ [titan] 2026-01-06 21:10:30,084 - root - INFO - lr: 2.2656e-05 gnorm: 3.45 [ 0:53:14<1 day, 22:55:46]
415
+ [titan] 2026-01-06 21:11:11,627 - root - INFO - step: 58 loss: 7.7081 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.33 mfu: 46.26%
416
+ [titan] 2026-01-06 21:11:11,628 - root - INFO - lr: 2.3047e-05 gnorm: 7.88 [ 0:53:55<1 day, 22:42:17]
417
+ [titan] 2026-01-06 21:11:53,169 - root - INFO - step: 59 loss: 7.6955 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.34 mfu: 46.26%
418
+ [titan] 2026-01-06 21:11:53,169 - root - INFO - lr: 2.3438e-05 gnorm: 7.16 [ 0:54:37<1 day, 22:29:13]
419
+ [titan] 2026-01-06 21:12:34,708 - root - INFO - step: 60 loss: 7.6458 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.35 mfu: 46.27%
420
+ [titan] 2026-01-06 21:12:34,709 - root - INFO - lr: 2.3828e-05 gnorm: 3.22 [ 0:55:18<1 day, 22:16:35]
421
+ [titan] 2026-01-06 21:13:16,244 - root - INFO - step: 61 loss: 7.6709 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.36 mfu: 46.27%
422
+ [titan] 2026-01-06 21:13:16,244 - root - INFO - lr: 2.4219e-05 gnorm: 7.56 [ 0:56:00<1 day, 22:04:19]
423
+ [titan] 2026-01-06 21:13:57,793 - root - INFO - step: 62 loss: 7.6777 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.31 mfu: 46.25%
424
+ [titan] 2026-01-06 21:13:57,793 - root - INFO - lr: 2.4609e-05 gnorm: 5.00 [ 0:56:41<1 day, 21:52:27]
425
+ [titan] 2026-01-06 21:14:39,339 - root - INFO - step: 63 loss: 7.6421 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.32 mfu: 46.26%
426
+ [titan] 2026-01-06 21:14:39,340 - root - INFO - lr: 2.5000e-05 gnorm: 6.81 [ 0:57:23<1 day, 21:40:56]
427
+ [titan] 2026-01-06 21:15:20,872 - root - INFO - step: 64 loss: 7.6401 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.37 mfu: 46.27%
428
+ [titan] 2026-01-06 21:15:20,872 - root - INFO - lr: 2.5391e-05 gnorm: 6.72 [ 0:58:04<1 day, 21:29:45]
429
+ [titan] 2026-01-06 21:15:20,872 - root - INFO - Saving the checkpoint (or staging if async is enabled).
430
+ [titan] 2026-01-06 21:15:41,936 - root - INFO - [GC] GC collection invoked by checkpointer. 0.17 seconds.
431
+ [titan] 2026-01-06 21:15:41,936 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 21.06 seconds.
432
+ [titan] 2026-01-06 21:16:23,249 - root - INFO - step: 65 loss: 7.6475 memory: 71.94GiB(90.77%) tps: 1,051 tflops: 96.13 mfu: 30.81%
433
+ [titan] 2026-01-06 21:16:23,249 - root - INFO - lr: 2.5781e-05 gnorm: 5.00 [ 0:59:07<1 day, 21:34:57]
434
+ [titan] 2026-01-06 21:17:04,689 - root - INFO - step: 66 loss: 7.7008 memory: 71.94GiB(90.77%) tps: 1,581 tflops: 144.69 mfu: 46.38%
435
+ [titan] 2026-01-06 21:17:04,689 - root - INFO - lr: 2.6172e-05 gnorm: 9.69 [ 0:59:48<1 day, 21:24:04]
436
+ [titan] 2026-01-06 21:17:46,152 - root - INFO - step: 67 loss: 7.6772 memory: 71.94GiB(90.77%) tps: 1,581 tflops: 144.61 mfu: 46.35%
437
+ [titan] 2026-01-06 21:17:46,153 - root - INFO - lr: 2.6563e-05 gnorm: 8.06 [ 1:00:30<1 day, 21:13:31]
438
+ [titan] 2026-01-06 21:18:27,650 - root - INFO - step: 68 loss: 7.6251 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.49 mfu: 46.31%
439
+ [titan] 2026-01-06 21:18:27,651 - root - INFO - lr: 2.6953e-05 gnorm: 7.88 [ 1:01:11<1 day, 21:03:16]
440
+ [titan] 2026-01-06 21:19:09,166 - root - INFO - step: 69 loss: 7.6183 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.43 mfu: 46.29%
441
+ [titan] 2026-01-06 21:19:09,166 - root - INFO - lr: 2.7344e-05 gnorm: 4.00 [ 1:01:53<1 day, 20:53:19]
442
+ [titan] 2026-01-06 21:19:50,686 - root - INFO - step: 70 loss: 7.6535 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.41 mfu: 46.29%
443
+ [titan] 2026-01-06 21:19:50,686 - root - INFO - lr: 2.7734e-05 gnorm: 17.75 [ 1:02:34<1 day, 20:43:38]
444
+ [titan] 2026-01-06 21:20:32,220 - root - INFO - step: 71 loss: 7.6713 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.37 mfu: 46.27%
445
+ [titan] 2026-01-06 21:20:32,221 - root - INFO - lr: 2.8125e-05 gnorm: 15.69 [ 1:03:16<1 day, 20:34:13]
446
+ [titan] 2026-01-06 21:21:13,759 - root - INFO - step: 72 loss: 7.5969 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.35 mfu: 46.27%
447
+ [titan] 2026-01-06 21:21:13,759 - root - INFO - lr: 2.8516e-05 gnorm: 5.00 [ 1:03:57<1 day, 20:25:03]
448
+ [titan] 2026-01-06 21:21:55,296 - root - INFO - step: 73 loss: 7.6514 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.35 mfu: 46.27%
449
+ [titan] 2026-01-06 21:21:55,296 - root - INFO - lr: 2.8906e-05 gnorm: 7.84 [ 1:04:39<1 day, 20:16:06]
450
+ [titan] 2026-01-06 21:22:36,834 - root - INFO - step: 74 loss: 7.6118 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.35 mfu: 46.27%
451
+ [titan] 2026-01-06 21:22:36,834 - root - INFO - lr: 2.9297e-05 gnorm: 5.53 [ 1:05:20<1 day, 20:07:23]
452
+ [titan] 2026-01-06 21:23:18,373 - root - INFO - step: 75 loss: 7.6545 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.35 mfu: 46.26%
453
+ [titan] 2026-01-06 21:23:18,373 - root - INFO - lr: 2.9687e-05 gnorm: 14.88 [ 1:06:02<1 day, 19:58:53]
454
+ [titan] 2026-01-06 21:23:59,909 - root - INFO - step: 76 loss: 7.6091 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.36 mfu: 46.27%
455
+ [titan] 2026-01-06 21:23:59,909 - root - INFO - lr: 3.0078e-05 gnorm: 15.25 [ 1:06:43<1 day, 19:50:34]
456
+ [titan] 2026-01-06 21:24:41,441 - root - INFO - step: 77 loss: 7.5815 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.37 mfu: 46.27%
457
+ [titan] 2026-01-06 21:24:41,442 - root - INFO - lr: 3.0469e-05 gnorm: 4.84 [ 1:07:25<1 day, 19:42:28]
458
+ [titan] 2026-01-06 21:25:22,983 - root - INFO - step: 78 loss: 7.6119 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.34 mfu: 46.26%
459
+ [titan] 2026-01-06 21:25:22,983 - root - INFO - lr: 3.0859e-05 gnorm: 9.06 [ 1:08:06<1 day, 19:34:33]
460
+ [titan] 2026-01-06 21:26:04,516 - root - INFO - step: 79 loss: 7.6418 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.37 mfu: 46.27%
461
+ [titan] 2026-01-06 21:26:04,516 - root - INFO - lr: 3.1250e-05 gnorm: 8.25 [ 1:08:48<1 day, 19:26:50]
462
+ [titan] 2026-01-06 21:26:46,049 - root - INFO - step: 80 loss: 7.5575 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.37 mfu: 46.27%
463
+ [titan] 2026-01-06 21:26:46,050 - root - INFO - lr: 3.1641e-05 gnorm: 6.97 [ 1:09:29<1 day, 19:19:16]
464
+ [titan] 2026-01-06 21:26:46,050 - root - INFO - Saving the checkpoint (or staging if async is enabled).
465
+ [titan] 2026-01-06 21:27:08,317 - root - INFO - [GC] GC collection invoked by checkpointer. 0.19 seconds.
466
+ [titan] 2026-01-06 21:27:08,317 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 22.27 seconds.
467
+ [titan] 2026-01-06 21:27:49,686 - root - INFO - step: 81 loss: 7.6005 memory: 71.94GiB(90.77%) tps: 1,030 tflops: 94.22 mfu: 30.20%
468
+ [titan] 2026-01-06 21:27:49,686 - root - INFO - lr: 3.2031e-05 gnorm: 7.19 [ 1:10:33<1 day, 19:25:29]
469
+ [titan] 2026-01-06 21:28:31,108 - root - INFO - step: 82 loss: 7.5774 memory: 71.94GiB(90.77%) tps: 1,582 tflops: 144.76 mfu: 46.40%
470
+ [titan] 2026-01-06 21:28:31,108 - root - INFO - lr: 3.2422e-05 gnorm: 5.62 [ 1:11:15<1 day, 19:18:01]
471
+ [titan] 2026-01-06 21:29:12,555 - root - INFO - step: 83 loss: 7.6207 memory: 71.94GiB(90.77%) tps: 1,581 tflops: 144.67 mfu: 46.37%
472
+ [titan] 2026-01-06 21:29:12,555 - root - INFO - lr: 3.2813e-05 gnorm: 4.69 [ 1:11:56<1 day, 19:10:44]
473
+ [titan] 2026-01-06 21:29:54,024 - root - INFO - step: 84 loss: 7.5734 memory: 71.94GiB(90.77%) tps: 1,580 tflops: 144.59 mfu: 46.34%
474
+ [titan] 2026-01-06 21:29:54,024 - root - INFO - lr: 3.3203e-05 gnorm: 10.75 [ 1:12:37<1 day, 19:03:37]
475
+ [titan] 2026-01-06 21:30:35,519 - root - INFO - step: 85 loss: 7.5241 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.50 mfu: 46.31%
476
+ [titan] 2026-01-06 21:30:35,520 - root - INFO - lr: 3.3594e-05 gnorm: 8.69 [ 1:13:19<1 day, 18:56:41]
477
+ [titan] 2026-01-06 21:31:17,030 - root - INFO - step: 86 loss: 7.5827 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.45 mfu: 46.30%
478
+ [titan] 2026-01-06 21:31:17,030 - root - INFO - lr: 3.3984e-05 gnorm: 7.22 [ 1:14:00<1 day, 18:49:53]
479
+ [titan] 2026-01-06 21:31:58,543 - root - INFO - step: 87 loss: 7.5505 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.44 mfu: 46.29%
480
+ [titan] 2026-01-06 21:31:58,543 - root - INFO - lr: 3.4375e-05 gnorm: 7.91 [ 1:14:42<1 day, 18:43:14]
481
+ [titan] 2026-01-06 21:32:40,071 - root - INFO - step: 88 loss: 7.5143 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.39 mfu: 46.28%
482
+ [titan] 2026-01-06 21:32:40,071 - root - INFO - lr: 3.4766e-05 gnorm: 8.00 [ 1:15:23<1 day, 18:36:44]
483
+ [titan] 2026-01-06 21:33:21,599 - root - INFO - step: 89 loss: 7.5199 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.39 mfu: 46.28%
484
+ [titan] 2026-01-06 21:33:21,599 - root - INFO - lr: 3.5156e-05 gnorm: 8.62 [ 1:16:05<1 day, 18:30:21]
485
+ [titan] 2026-01-06 21:34:03,122 - root - INFO - step: 90 loss: 7.4785 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.40 mfu: 46.28%
486
+ [titan] 2026-01-06 21:34:03,122 - root - INFO - lr: 3.5547e-05 gnorm: 8.12 [ 1:16:47<1 day, 18:24:06]
487
+ [titan] 2026-01-06 21:34:44,655 - root - INFO - step: 91 loss: 7.5003 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.37 mfu: 46.27%
488
+ [titan] 2026-01-06 21:34:44,655 - root - INFO - lr: 3.5937e-05 gnorm: 6.97 [ 1:17:28<1 day, 18:17:58]
489
+ [titan] 2026-01-06 21:35:26,183 - root - INFO - step: 92 loss: 7.5113 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.39 mfu: 46.28%
490
+ [titan] 2026-01-06 21:35:26,183 - root - INFO - lr: 3.6328e-05 gnorm: 10.19 [ 1:18:10<1 day, 18:11:58]
491
+ [titan] 2026-01-06 21:36:07,712 - root - INFO - step: 93 loss: 7.4875 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.38 mfu: 46.28%
492
+ [titan] 2026-01-06 21:36:07,712 - root - INFO - lr: 3.6719e-05 gnorm: 4.59 [ 1:18:51<1 day, 18:06:04]
493
+ [titan] 2026-01-06 21:36:49,202 - root - INFO - step: 94 loss: 7.8691 memory: 71.94GiB(90.77%) tps: 1,580 tflops: 144.52 mfu: 46.32%
494
+ [titan] 2026-01-06 21:36:49,202 - root - INFO - lr: 3.7109e-05 gnorm: 86.50 [ 1:19:33<1 day, 18:00:16]
495
+ [titan] 2026-01-06 21:37:30,710 - root - INFO - step: 95 loss: 7.7993 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.45 mfu: 46.30%
496
+ [titan] 2026-01-06 21:37:30,710 - root - INFO - lr: 3.7500e-05 gnorm: 62.50 [ 1:20:14<1 day, 17:54:34]
497
+ [titan] 2026-01-06 21:38:12,247 - root - INFO - step: 96 loss: 7.6230 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.35 mfu: 46.27%
498
+ [titan] 2026-01-06 21:38:12,248 - root - INFO - lr: 3.7891e-05 gnorm: 17.38 [ 1:20:56<1 day, 17:49:00]
499
+ [titan] 2026-01-06 21:38:12,248 - root - INFO - Saving the checkpoint (or staging if async is enabled).
500
+ [titan] 2026-01-06 21:38:32,938 - root - INFO - [GC] GC collection invoked by checkpointer. 0.19 seconds.
501
+ [titan] 2026-01-06 21:38:32,938 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 20.69 seconds.
502
+ [titan] 2026-01-06 21:39:14,269 - root - INFO - step: 97 loss: 7.5778 memory: 71.94GiB(90.77%) tps: 1,057 tflops: 96.68 mfu: 30.99%
503
+ [titan] 2026-01-06 21:39:14,269 - root - INFO - lr: 3.8281e-05 gnorm: 17.75 [ 1:21:58<1 day, 17:54:00]
504
+ [titan] 2026-01-06 21:39:55,690 - root - INFO - step: 98 loss: 7.5438 memory: 71.94GiB(90.77%) tps: 1,582 tflops: 144.76 mfu: 46.40%
505
+ [titan] 2026-01-06 21:39:55,690 - root - INFO - lr: 3.8672e-05 gnorm: 11.75 [ 1:22:39<1 day, 17:48:28]
506
+ [titan] 2026-01-06 21:40:37,179 - root - INFO - step: 99 loss: 7.5091 memory: 71.94GiB(90.77%) tps: 1,580 tflops: 144.52 mfu: 46.32%
507
+ [titan] 2026-01-06 21:40:37,180 - root - INFO - lr: 3.9063e-05 gnorm: 7.81 [ 1:23:21<1 day, 17:43:04]
508
+ [titan] 2026-01-06 21:40:37,202 - root - INFO - [GC] Peforming periodical GC collection. 0.02 seconds.
509
+ [titan] 2026-01-06 21:41:18,706 - root - INFO - step: 100 loss: 7.4961 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.39 mfu: 46.28%
510
+ [titan] 2026-01-06 21:41:18,706 - root - INFO - lr: 3.9453e-05 gnorm: 7.59 [ 1:24:02<1 day, 17:37:46]
511
+ [titan] 2026-01-06 21:42:00,228 - root - INFO - step: 101 loss: 7.4848 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.41 mfu: 46.28%
512
+ [titan] 2026-01-06 21:42:00,228 - root - INFO - lr: 3.9844e-05 gnorm: 5.97 [ 1:24:44<1 day, 17:32:34]
513
+ [titan] 2026-01-06 21:42:41,739 - root - INFO - step: 102 loss: 7.5118 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.45 mfu: 46.30%
514
+ [titan] 2026-01-06 21:42:41,739 - root - INFO - lr: 4.0234e-05 gnorm: 8.06 [ 1:25:25<1 day, 17:27:26]
515
+ [titan] 2026-01-06 21:43:23,265 - root - INFO - step: 103 loss: 7.4788 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.39 mfu: 46.28%
516
+ [titan] 2026-01-06 21:43:23,265 - root - INFO - lr: 4.0625e-05 gnorm: 10.06 [ 1:26:07<1 day, 17:22:24]
517
+ [titan] 2026-01-06 21:44:04,785 - root - INFO - step: 104 loss: 7.4560 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.41 mfu: 46.29%
518
+ [titan] 2026-01-06 21:44:04,786 - root - INFO - lr: 4.1016e-05 gnorm: 9.50 [ 1:26:48<1 day, 17:17:27]
519
+ [titan] 2026-01-06 21:44:46,319 - root - INFO - step: 105 loss: 7.4534 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.37 mfu: 46.27%
520
+ [titan] 2026-01-06 21:44:46,319 - root - INFO - lr: 4.1406e-05 gnorm: 8.44 [ 1:27:30<1 day, 17:12:36]
521
+ [titan] 2026-01-06 21:45:27,838 - root - INFO - step: 106 loss: 7.4770 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.42 mfu: 46.29%
522
+ [titan] 2026-01-06 21:45:27,838 - root - INFO - lr: 4.1797e-05 gnorm: 10.56 [ 1:28:11<1 day, 17:07:48]
523
+ [titan] 2026-01-06 21:46:09,374 - root - INFO - step: 107 loss: 7.4382 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.36 mfu: 46.27%
524
+ [titan] 2026-01-06 21:46:09,374 - root - INFO - lr: 4.2188e-05 gnorm: 13.69 [ 1:28:53<1 day, 17:03:06]
525
+ [titan] 2026-01-06 21:46:50,902 - root - INFO - step: 108 loss: 7.4561 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.39 mfu: 46.28%
526
+ [titan] 2026-01-06 21:46:50,902 - root - INFO - lr: 4.2578e-05 gnorm: 8.69 [ 1:29:34<1 day, 16:58:28]
527
+ [titan] 2026-01-06 21:47:32,443 - root - INFO - step: 109 loss: 7.3967 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.34 mfu: 46.26%
528
+ [titan] 2026-01-06 21:47:32,443 - root - INFO - lr: 4.2969e-05 gnorm: 7.31 [ 1:30:16<1 day, 16:53:55]
529
+ [titan] 2026-01-06 21:48:13,976 - root - INFO - step: 110 loss: 7.4334 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.37 mfu: 46.27%
530
+ [titan] 2026-01-06 21:48:13,976 - root - INFO - lr: 4.3359e-05 gnorm: 25.38 [ 1:30:57<1 day, 16:49:25]
531
+ [titan] 2026-01-06 21:48:55,511 - root - INFO - step: 111 loss: 7.4360 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.36 mfu: 46.27%
532
+ [titan] 2026-01-06 21:48:55,511 - root - INFO - lr: 4.3750e-05 gnorm: 10.44 [ 1:31:39<1 day, 16:45:00]
533
+ [titan] 2026-01-06 21:49:37,059 - root - INFO - step: 112 loss: 7.5123 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.31 mfu: 46.25%
534
+ [titan] 2026-01-06 21:49:37,060 - root - INFO - lr: 4.4141e-05 gnorm: 16.88 [ 1:32:20<1 day, 16:40:39]
535
+ [titan] 2026-01-06 21:49:37,060 - root - INFO - Saving the checkpoint (or staging if async is enabled).
536
+ [titan] 2026-01-06 21:49:59,578 - root - INFO - [GC] GC collection invoked by checkpointer. 0.14 seconds.
537
+ [titan] 2026-01-06 21:49:59,579 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 22.52 seconds.
538
+ [titan] 2026-01-06 21:50:40,891 - root - INFO - step: 113 loss: 7.4803 memory: 71.94GiB(90.77%) tps: 1,027 tflops: 93.94 mfu: 30.11%
539
+ [titan] 2026-01-06 21:50:40,892 - root - INFO - lr: 4.4531e-05 gnorm: 13.06 [ 1:33:24<1 day, 16:46:06]
540
+ [titan] 2026-01-06 21:51:22,305 - root - INFO - step: 114 loss: 7.4859 memory: 71.94GiB(90.77%) tps: 1,582 tflops: 144.79 mfu: 46.41%
541
+ [titan] 2026-01-06 21:51:22,305 - root - INFO - lr: 4.4922e-05 gnorm: 16.50 [ 1:34:06<1 day, 16:41:44]
542
+ [titan] 2026-01-06 21:52:03,747 - root - INFO - step: 115 loss: 7.4151 memory: 71.94GiB(90.77%) tps: 1,581 tflops: 144.68 mfu: 46.37%
543
+ [titan] 2026-01-06 21:52:03,748 - root - INFO - lr: 4.5313e-05 gnorm: 13.94 [ 1:34:47<1 day, 16:37:26]
544
+ [titan] 2026-01-06 21:52:45,252 - root - INFO - step: 116 loss: 7.3814 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.47 mfu: 46.30%
545
+ [titan] 2026-01-06 21:52:45,252 - root - INFO - lr: 4.5703e-05 gnorm: 11.69 [ 1:35:29<1 day, 16:33:14]
546
+ [titan] 2026-01-06 21:53:26,760 - root - INFO - step: 117 loss: 7.4033 memory: 71.94GiB(90.77%) tps: 1,579 tflops: 144.46 mfu: 46.30%
547
+ [titan] 2026-01-06 21:53:26,760 - root - INFO - lr: 4.6094e-05 gnorm: 9.31 [ 1:36:10<1 day, 16:29:06]
548
+ [titan] 2026-01-06 21:54:08,279 - root - INFO - step: 118 loss: 7.4721 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.42 mfu: 46.29%
549
+ [titan] 2026-01-06 21:54:08,279 - root - INFO - lr: 4.6484e-05 gnorm: 20.88 [ 1:36:52<1 day, 16:25:01]
550
+ [titan] 2026-01-06 21:54:49,813 - root - INFO - step: 119 loss: 7.4258 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.37 mfu: 46.27%
551
+ [titan] 2026-01-06 21:54:49,813 - root - INFO - lr: 4.6875e-05 gnorm: 16.62 [ 1:37:33<1 day, 16:21:00]
552
+ [titan] 2026-01-06 21:55:31,360 - root - INFO - step: 120 loss: 7.3951 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.32 mfu: 46.26%
553
+ [titan] 2026-01-06 21:55:31,360 - root - INFO - lr: 4.7266e-05 gnorm: 11.38 [ 1:38:15<1 day, 16:17:03]
554
+ [titan] 2026-01-06 21:56:12,904 - root - INFO - step: 121 loss: 7.3984 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.33 mfu: 46.26%
555
+ [titan] 2026-01-06 21:56:12,904 - root - INFO - lr: 4.7656e-05 gnorm: 10.19 [ 1:38:56<1 day, 16:13:09]
556
+ [titan] 2026-01-06 21:56:54,444 - root - INFO - step: 122 loss: 7.5098 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.34 mfu: 46.26%
557
+ [titan] 2026-01-06 21:56:54,444 - root - INFO - lr: 4.8047e-05 gnorm: 19.38 [ 1:39:38<1 day, 16:09:18]
558
+ [titan] 2026-01-06 21:57:35,983 - root - INFO - step: 123 loss: 7.4071 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.35 mfu: 46.27%
559
+ [titan] 2026-01-06 21:57:35,983 - root - INFO - lr: 4.8438e-05 gnorm: 13.25 [ 1:40:19<1 day, 16:05:30]
560
+ [titan] 2026-01-06 21:58:17,525 - root - INFO - step: 124 loss: 7.4271 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.34 mfu: 46.26%
561
+ [titan] 2026-01-06 21:58:17,525 - root - INFO - lr: 4.8828e-05 gnorm: 11.88 [ 1:41:01<1 day, 16:01:45]
562
+ [titan] 2026-01-06 21:58:59,075 - root - INFO - step: 125 loss: 7.3603 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.31 mfu: 46.25%
563
+ [titan] 2026-01-06 21:58:59,075 - root - INFO - lr: 4.9219e-05 gnorm: 11.50 [ 1:41:42<1 day, 15:58:03]
564
+ [titan] 2026-01-06 21:59:40,618 - root - INFO - step: 126 loss: 7.3625 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.33 mfu: 46.26%
565
+ [titan] 2026-01-06 21:59:40,619 - root - INFO - lr: 4.9609e-05 gnorm: 9.88 [ 1:42:24<1 day, 15:54:24]
566
+ [titan] 2026-01-06 22:00:22,155 - root - INFO - step: 127 loss: 7.3691 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.36 mfu: 46.27%
567
+ [titan] 2026-01-06 22:00:22,155 - root - INFO - lr: 5.0000e-05 gnorm: 11.88 [ 1:43:06<1 day, 15:50:48]
568
+ [titan] 2026-01-06 22:01:03,694 - root - INFO - step: 128 loss: 7.3331 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.35 mfu: 46.27%
569
+ [titan] 2026-01-06 22:01:03,694 - root - INFO - lr: 5.0391e-05 gnorm: 11.56 [ 1:43:47<1 day, 15:47:14]
570
+ [titan] 2026-01-06 22:01:03,694 - root - INFO - Saving the checkpoint (or staging if async is enabled).
571
+ [titan] 2026-01-06 22:01:24,082 - root - INFO - [GC] GC collection invoked by checkpointer. 0.21 seconds.
572
+ [titan] 2026-01-06 22:01:24,082 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 20.39 seconds.
573
+ [titan] 2026-01-06 22:02:05,453 - root - INFO - step: 129 loss: 7.2878 memory: 71.94GiB(90.77%) tps: 1,061 tflops: 97.09 mfu: 31.12%
574
+ [titan] 2026-01-06 22:02:05,454 - root - INFO - lr: 5.0781e-05 gnorm: 6.16 [ 1:44:49<1 day, 15:51:24]
575
+ [titan] 2026-01-06 22:02:46,875 - root - INFO - step: 130 loss: 7.7017 memory: 71.94GiB(90.77%) tps: 1,582 tflops: 144.76 mfu: 46.40%
576
+ [titan] 2026-01-06 22:02:46,876 - root - INFO - lr: 5.1172e-05 gnorm: 70.00 [ 1:45:30<1 day, 15:47:50]
577
+ [titan] 2026-01-06 22:03:28,339 - root - INFO - step: 131 loss: 7.5220 memory: 71.94GiB(90.77%) tps: 1,581 tflops: 144.61 mfu: 46.35%
578
+ [titan] 2026-01-06 22:03:28,339 - root - INFO - lr: 5.1562e-05 gnorm: 44.75 [ 1:46:12<1 day, 15:44:19]
579
+ [titan] 2026-01-06 22:04:09,858 - root - INFO - step: 132 loss: 7.4566 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.42 mfu: 46.29%
580
+ [titan] 2026-01-06 22:04:09,859 - root - INFO - lr: 5.1953e-05 gnorm: 13.50 [ 1:46:53<1 day, 15:40:51]
581
+ [titan] 2026-01-06 22:04:51,387 - root - INFO - step: 133 loss: 7.4026 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.38 mfu: 46.28%
582
+ [titan] 2026-01-06 22:04:51,387 - root - INFO - lr: 5.2344e-05 gnorm: 10.12 [ 1:47:35<1 day, 15:37:27]
583
+ [titan] 2026-01-06 22:05:32,919 - root - INFO - step: 134 loss: 7.4092 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.37 mfu: 46.27%
584
+ [titan] 2026-01-06 22:05:32,920 - root - INFO - lr: 5.2734e-05 gnorm: 14.88 [ 1:48:16<1 day, 15:34:04]
585
+ [titan] 2026-01-06 22:06:14,471 - root - INFO - step: 135 loss: 7.3827 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.30 mfu: 46.25%
586
+ [titan] 2026-01-06 22:06:14,471 - root - INFO - lr: 5.3125e-05 gnorm: 18.88 [ 1:48:58<1 day, 15:30:45]
587
+ [titan] 2026-01-06 22:06:56,027 - root - INFO - step: 136 loss: 7.4021 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.29 mfu: 46.25%
588
+ [titan] 2026-01-06 22:06:56,028 - root - INFO - lr: 5.3516e-05 gnorm: 12.81 [ 1:49:39<1 day, 15:27:28]
589
+ [titan] 2026-01-06 22:07:37,581 - root - INFO - step: 137 loss: 7.4064 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.30 mfu: 46.25%
590
+ [titan] 2026-01-06 22:07:37,581 - root - INFO - lr: 5.3906e-05 gnorm: 7.19 [ 1:50:21<1 day, 15:24:14]
591
+ [titan] 2026-01-06 22:08:19,129 - root - INFO - step: 138 loss: 7.4774 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.32 mfu: 46.26%
592
+ [titan] 2026-01-06 22:08:19,129 - root - INFO - lr: 5.4297e-05 gnorm: 22.62 [ 1:51:03<1 day, 15:21:01]
593
+ [titan] 2026-01-06 22:09:00,688 - root - INFO - step: 139 loss: 7.4281 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.28 mfu: 46.24%
594
+ [titan] 2026-01-06 22:09:00,688 - root - INFO - lr: 5.4688e-05 gnorm: 11.00 [ 1:51:44<1 day, 15:17:51]
595
+ [titan] 2026-01-06 22:09:42,228 - root - INFO - step: 140 loss: 7.5633 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.34 mfu: 46.26%
596
+ [titan] 2026-01-06 22:09:42,228 - root - INFO - lr: 5.5078e-05 gnorm: 19.75 [ 1:52:26<1 day, 15:14:42]
597
+ [titan] 2026-01-06 22:10:23,790 - root - INFO - step: 141 loss: 7.5423 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.27 mfu: 46.24%
598
+ [titan] 2026-01-06 22:10:23,790 - root - INFO - lr: 5.5469e-05 gnorm: 17.25 [ 1:53:07<1 day, 15:11:37]
599
+ [titan] 2026-01-06 22:11:05,350 - root - INFO - step: 142 loss: 7.4047 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.28 mfu: 46.24%
600
+ [titan] 2026-01-06 22:11:05,351 - root - INFO - lr: 5.5859e-05 gnorm: 9.94 [ 1:53:49<1 day, 15:08:33]
601
+ [titan] 2026-01-06 22:11:46,904 - root - INFO - step: 143 loss: 7.5261 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.30 mfu: 46.25%
602
+ [titan] 2026-01-06 22:11:46,905 - root - INFO - lr: 5.6250e-05 gnorm: 25.75 [ 1:54:30<1 day, 15:05:31]
603
+ [titan] 2026-01-06 22:12:28,460 - root - INFO - step: 144 loss: 7.4217 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.29 mfu: 46.25%
604
+ [titan] 2026-01-06 22:12:28,461 - root - INFO - lr: 5.6641e-05 gnorm: 18.00 [ 1:55:12<1 day, 15:02:31]
605
+ [titan] 2026-01-06 22:12:28,461 - root - INFO - Saving the checkpoint (or staging if async is enabled).
606
+ [titan] 2026-01-06 22:12:50,183 - root - INFO - [GC] GC collection invoked by checkpointer. 0.19 seconds.
607
+ [titan] 2026-01-06 22:12:50,183 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 21.72 seconds.
608
+ [titan] 2026-01-06 22:13:31,510 - root - INFO - step: 145 loss: 7.3958 memory: 71.94GiB(90.77%) tps: 1,039 tflops: 95.10 mfu: 30.48%
609
+ [titan] 2026-01-06 22:13:31,510 - root - INFO - lr: 5.7031e-05 gnorm: 11.69 [ 1:56:15<1 day, 15:06:46]
610
+ [titan] 2026-01-06 22:14:12,944 - root - INFO - step: 146 loss: 7.4073 memory: 71.94GiB(90.77%) tps: 1,582 tflops: 144.71 mfu: 46.38%
611
+ [titan] 2026-01-06 22:14:12,944 - root - INFO - lr: 5.7422e-05 gnorm: 11.25 [ 1:56:56<1 day, 15:03:44]
612
+ [titan] 2026-01-06 22:14:54,370 - root - INFO - step: 147 loss: 7.3301 memory: 71.94GiB(90.77%) tps: 1,582 tflops: 144.74 mfu: 46.39%
613
+ [titan] 2026-01-06 22:14:54,371 - root - INFO - lr: 5.7813e-05 gnorm: 7.34 [ 1:57:38<1 day, 15:00:44]
614
+ [titan] 2026-01-06 22:15:35,825 - root - INFO - step: 148 loss: 7.3624 memory: 71.94GiB(90.77%) tps: 1,581 tflops: 144.64 mfu: 46.36%
615
+ [titan] 2026-01-06 22:15:35,825 - root - INFO - lr: 5.8203e-05 gnorm: 17.38 [ 1:58:19<1 day, 14:57:47]
616
+ [titan] 2026-01-06 22:16:17,356 - root - INFO - step: 149 loss: 7.2913 memory: 71.94GiB(90.77%) tps: 1,578 tflops: 144.37 mfu: 46.27%
617
+ [titan] 2026-01-06 22:16:17,357 - root - INFO - lr: 5.8594e-05 gnorm: 3.80 [ 1:59:01<1 day, 14:54:52]
618
+ [titan] 2026-01-06 22:16:17,388 - root - INFO - [GC] Peforming periodical GC collection. 0.03 seconds.
619
+ [titan] 2026-01-06 22:16:58,923 - root - INFO - step: 150 loss: 7.3146 memory: 71.94GiB(90.77%) tps: 1,577 tflops: 144.25 mfu: 46.23%
620
+ [titan] 2026-01-06 22:16:58,923 - root - INFO - lr: 5.8984e-05 gnorm: 7.06 [ 1:59:42<1 day, 14:52:01]
logs/none_4cvjdbqa/attempt_0/7/stdout.log ADDED
File without changes
logs/none_rci5peh0/attempt_0/0/stderr.log ADDED
@@ -0,0 +1,333 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [titan] 2026-01-02 12:21:12,073 - root - INFO - Starting job: default job
2
+ [titan] 2026-01-02 12:21:12,073 - root - INFO - {
3
+ "activation_checkpoint": {
4
+ "mode": "none",
5
+ "selective_ac_option": "2"
6
+ },
7
+ "activation_offload": {
8
+ "mode": "none"
9
+ },
10
+ "checkpoint": {
11
+ "async_mode": "disabled",
12
+ "create_seed_checkpoint": false,
13
+ "enable_checkpoint": true,
14
+ "exclude_from_loading": [],
15
+ "export_dtype": "float32",
16
+ "folder": "checkpoint",
17
+ "initial_load_model_weights_only": true,
18
+ "initial_load_path": null,
19
+ "interval": 3072,
20
+ "interval_type": "steps",
21
+ "keep_latest_k": 0,
22
+ "last_save_model_weights_only": false,
23
+ "load_step": -1,
24
+ "model_weights_only": false
25
+ },
26
+ "comm": {
27
+ "init_timeout_seconds": 300,
28
+ "trace_buf_size": 20000,
29
+ "train_timeout_seconds": 100
30
+ },
31
+ "experimental": {
32
+ "context_parallel_degree": 1,
33
+ "context_parallel_rotate_method": "allgather",
34
+ "custom_model_path": "",
35
+ "enable_async_tensor_parallel": false,
36
+ "enable_compiled_autograd": false,
37
+ "pipeline_parallel_degree": 1,
38
+ "pipeline_parallel_microbatches": null,
39
+ "pipeline_parallel_schedule": "1F1B",
40
+ "pipeline_parallel_schedule_csv": "",
41
+ "pipeline_parallel_split_points": []
42
+ },
43
+ "fault_tolerance": {
44
+ "enable": false,
45
+ "group_size": 0,
46
+ "min_replica_size": 1,
47
+ "replica_id": 0
48
+ },
49
+ "float8": {
50
+ "enable_fsdp_float8_all_gather": false,
51
+ "force_recompute_fp8_weight_in_bwd": false,
52
+ "precompute_float8_dynamic_scale_for_fsdp": false,
53
+ "recipe_name": null
54
+ },
55
+ "job": {
56
+ "config_file": "flame/models/fla.toml",
57
+ "description": "default job",
58
+ "dump_folder": "/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B",
59
+ "print_args": true,
60
+ "use_for_integration_test": false
61
+ },
62
+ "lr_scheduler": {
63
+ "decay_ratio": null,
64
+ "decay_type": "cosine",
65
+ "lr_min": 0.1,
66
+ "warmup_steps": 1024
67
+ },
68
+ "memory_estimation": {
69
+ "disable_fake_mode": false,
70
+ "enabled": false
71
+ },
72
+ "metrics": {
73
+ "disable_color_printing": false,
74
+ "enable_tensorboard": false,
75
+ "enable_wandb": true,
76
+ "log_freq": 1,
77
+ "save_for_all_ranks": false,
78
+ "save_tb_folder": "tb"
79
+ },
80
+ "model": {
81
+ "config": "/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/config.json",
82
+ "converters": [],
83
+ "name": "fla",
84
+ "print_after_conversion": false,
85
+ "tokenizer_path": "/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B"
86
+ },
87
+ "optimizer": {
88
+ "beta1": 0.9,
89
+ "beta2": 0.95,
90
+ "early_step_in_backward": false,
91
+ "eps": 1e-15,
92
+ "implementation": "fused",
93
+ "lr": 0.0004,
94
+ "name": "AdamW",
95
+ "weight_decay": 0.1
96
+ },
97
+ "profiling": {
98
+ "enable_memory_snapshot": false,
99
+ "enable_profiling": true,
100
+ "profile_freq": 512,
101
+ "save_memory_snapshot_folder": "memory_snapshot",
102
+ "save_traces_folder": "profile_trace"
103
+ },
104
+ "training": {
105
+ "batch_size": 2,
106
+ "compile": true,
107
+ "context_len": 2048,
108
+ "data_dir": null,
109
+ "data_files": null,
110
+ "data_parallel_replicate_degree": 1,
111
+ "data_parallel_shard_degree": 8,
112
+ "data_probs": null,
113
+ "dataset": "/mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu",
114
+ "dataset_name": "default",
115
+ "dataset_split": "train",
116
+ "deterministic": false,
117
+ "disable_loss_parallel": true,
118
+ "enable_cpu_offload": false,
119
+ "fsdp_reshard_after_forward": "default",
120
+ "gc_freq": 50,
121
+ "gradient_accumulation_steps": 16,
122
+ "max_norm": 1.0,
123
+ "mixed_precision_param": "bfloat16",
124
+ "mixed_precision_reduce": "float32",
125
+ "num_workers": 8,
126
+ "persistent_workers": false,
127
+ "pin_memory": false,
128
+ "prefetch_factor": 2,
129
+ "seed": 42,
130
+ "seq_len": 2048,
131
+ "skip_nan_inf": true,
132
+ "steps": 30720,
133
+ "streaming": true,
134
+ "tensor_parallel_degree": 1,
135
+ "varlen": false
136
+ }
137
+ }
138
+ [titan] 2026-01-02 12:21:12,073 - root - INFO - [GC] Initial GC collection. 0.00 seconds.
139
+ [titan] 2026-01-02 12:21:13,365 - root - WARNING - ENV[TORCH_NCCL_ASYNC_ERROR_HANDLING] = 1 will be overridden to 3 based on job config
140
+ [titan] 2026-01-02 12:21:13,367 - root - INFO - CUDA capacity: NVIDIA A100-SXM4-80GB with 79.25GiB memory
141
+ [titan] 2026-01-02 12:21:13,369 - root - WARNING - Error running lspci: [Errno 2] No such file or directory: 'lspci', fallback to use device_name
142
+ [titan] 2026-01-02 12:21:13,369 - root - INFO - Peak FLOPS used for computing MFU: 3.120e+14
143
+ [titan] 2026-01-02 12:21:13,369 - root - INFO - Building 1-D device mesh with ['dp_shard'], [8]
144
+ [titan] 2026-01-02 12:21:13,424 - root - INFO - Loading tokenizer...
145
+ The tokenizer you are loading from '/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.
146
+ [titan] 2026-01-02 12:21:13,765 - root - INFO - Qwen2TokenizerFast(name_or_path='/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B', vocab_size=151643, model_max_length=10000000000, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'eos_token': '<|im_end|>', 'pad_token': '<|endoftext|>', 'additional_special_tokens': ['<|im_start|>', '<|im_end|>', '<|object_ref_start|>', '<|object_ref_end|>', '<|box_start|>', '<|box_end|>', '<|quad_start|>', '<|quad_end|>', '<|vision_start|>', '<|vision_end|>', '<|vision_pad|>', '<|image_pad|>', '<|video_pad|>']}, clean_up_tokenization_spaces=False, added_tokens_decoder={
147
+ 151643: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
148
+ 151644: AddedToken("<|im_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
149
+ 151645: AddedToken("<|im_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
150
+ 151646: AddedToken("<|object_ref_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
151
+ 151647: AddedToken("<|object_ref_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
152
+ 151648: AddedToken("<|box_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
153
+ 151649: AddedToken("<|box_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
154
+ 151650: AddedToken("<|quad_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
155
+ 151651: AddedToken("<|quad_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
156
+ 151652: AddedToken("<|vision_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
157
+ 151653: AddedToken("<|vision_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
158
+ 151654: AddedToken("<|vision_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
159
+ 151655: AddedToken("<|image_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
160
+ 151656: AddedToken("<|video_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
161
+ 151657: AddedToken("<tool_call>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
162
+ 151658: AddedToken("</tool_call>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
163
+ 151659: AddedToken("<|fim_prefix|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
164
+ 151660: AddedToken("<|fim_middle|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
165
+ 151661: AddedToken("<|fim_suffix|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
166
+ 151662: AddedToken("<|fim_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
167
+ 151663: AddedToken("<|repo_name|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
168
+ 151664: AddedToken("<|file_sep|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
169
+ 151665: AddedToken("<tool_response>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
170
+ 151666: AddedToken("</tool_response>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
171
+ 151667: AddedToken("<think>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
172
+ 151668: AddedToken("</think>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
173
+ }
174
+ )
175
+ [titan] 2026-01-02 12:21:13,765 - root - INFO - Loading dataset /mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu:default
176
+ `trust_remote_code` is not supported anymore.
177
+ Please check that the Hugging Face dataset '/mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu' isn't based on a loading script and remove `trust_remote_code`.
178
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
179
+ [titan] 2026-01-02 12:21:13,765 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
180
+ Please check that the Hugging Face dataset '/mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu' isn't based on a loading script and remove `trust_remote_code`.
181
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
182
+ [titan] 2026-01-02 12:21:14,322 - root - INFO - Shuffling the dataset with seed 42
183
+ [titan] 2026-01-02 12:21:14,323 - root - INFO - IterableDataset({
184
+ features: ['text', 'id', 'dump', 'url', 'file_path', 'language', 'language_score', 'token_count', 'score', 'int_score'],
185
+ num_shards: 360
186
+ })
187
+ [titan] 2026-01-02 12:21:14,323 - root - INFO - Building dataloader...
188
+ [titan] 2026-01-02 12:21:14,325 - root - INFO - Loading model config from /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/config.json
189
+ [titan] 2026-01-02 12:21:14,326 - root - INFO - Building model from the config
190
+ GSAConfig {
191
+ "architectures": [
192
+ "GSAForCausalLM"
193
+ ],
194
+ "attn": null,
195
+ "bos_token_id": 151643,
196
+ "clamp_max": null,
197
+ "clamp_min": null,
198
+ "conv_size": 4,
199
+ "dtype": "bfloat16",
200
+ "elementwise_affine": false,
201
+ "eos_token_id": 151645,
202
+ "expand_k": 1,
203
+ "expand_v": 1,
204
+ "feature_map": "swish",
205
+ "fuse_cross_entropy": true,
206
+ "fuse_linear_cross_entropy": false,
207
+ "fuse_norm": true,
208
+ "fuse_swiglu": true,
209
+ "gate_logit_normalizer": 8,
210
+ "hidden_act": "swish",
211
+ "hidden_ratio": 4,
212
+ "hidden_size": 5120,
213
+ "initializer_range": 0.02,
214
+ "intermediate_size": 17408,
215
+ "max_position_embeddings": 40960,
216
+ "model_type": "gsa",
217
+ "norm_eps": 1e-06,
218
+ "num_heads": 40,
219
+ "num_hidden_layers": 40,
220
+ "num_kv_heads": 8,
221
+ "num_slots": 256,
222
+ "rope_theta": 1000000,
223
+ "share_conv_kernel": true,
224
+ "tie_word_embeddings": true,
225
+ "transformers_version": "4.57.3",
226
+ "use_cache": true,
227
+ "use_l2warp": false,
228
+ "use_norm": true,
229
+ "use_output_gate": true,
230
+ "use_rope": false,
231
+ "use_short_conv": false,
232
+ "vocab_size": 151936
233
+ }
234
+ 
235
+ [titan] 2026-01-02 12:21:14,479 - root - INFO - 
236
+ GSAForCausalLM(
237
+ (model): GSAModel(
238
+ (embeddings): Embedding(151936, 5120)
239
+ (layers): ModuleList(
240
+ (0-39): 40 x GSABlock(
241
+ (attn_norm): RMSNorm(5120, eps=1e-06)
242
+ (attn): GatedSlotAttention(
243
+ (feature_map): SwishFeatureMap()
244
+ (q_proj): Linear(in_features=5120, out_features=5120, bias=False)
245
+ (k_proj): Linear(in_features=5120, out_features=1024, bias=False)
246
+ (v_proj): Linear(in_features=5120, out_features=1024, bias=False)
247
+ (f_proj): Linear(in_features=5120, out_features=2048, bias=False)
248
+ (g_norm): RMSNorm(5120, elementwise_affine=False, eps=1e-06)
249
+ (o_proj): Linear(in_features=5120, out_features=5120, bias=False)
250
+ )
251
+ (mlp_norm): RMSNorm(5120, eps=1e-06)
252
+ (mlp): GatedMLP(
253
+ (gate_proj): Linear(in_features=5120, out_features=17408, bias=False)
254
+ (up_proj): Linear(in_features=5120, out_features=17408, bias=False)
255
+ (down_proj): Linear(in_features=17408, out_features=5120, bias=False)
256
+ (swiglu_linear): SwiGLULinear()
257
+ )
258
+ )
259
+ )
260
+ (norm): RMSNorm(5120, eps=1e-06)
261
+ )
262
+ (lm_head): Linear(in_features=5120, out_features=151936, bias=False)
263
+ )
264
+
265
+ [titan] 2026-01-02 12:21:14,534 - root - INFO - Compiling each block with torch.compile
266
+ [titan] 2026-01-02 12:21:14,534 - root - INFO - Compiling the embedding, norm, and lm_head layers with torch.compile
267
+ [titan] 2026-01-02 12:21:14,535 - root - INFO - Compiling the entire model with torch.compile
268
+ [titan] 2026-01-02 12:21:14,676 - root - INFO - Applied FSDP to the model
269
+ [titan] 2026-01-02 12:21:15,135 - root - INFO - CUDA memory usage for model: 3.56GiB(4.49%)
270
+ [titan] 2026-01-02 12:21:15,180 - root - INFO - Checkpointing active. Checkpoints will be loaded from and saved to /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/checkpoint
271
+ [titan] 2026-01-02 12:21:15,181 - root - INFO - Loading the checkpoint from /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/checkpoint/step-0.
272
+ [titan] 2026-01-02 12:24:11,332 - root - INFO - [GC] GC collection for checkpoint loading. 0.01 seconds.
273
+ [titan] 2026-01-02 12:24:11,332 - root - INFO - Finished loading the checkpoint in 176.15 seconds.
274
+ [titan] 2026-01-02 12:24:18,265 - root - ERROR - Failed to create WandB logger: No API key configured. Use `wandb login` to log in.
275
+ [titan] 2026-01-02 12:24:18,271 - root - INFO - CUDA capacity: NVIDIA A100-SXM4-80GB with 79.25GiB memory
276
+ [titan] 2026-01-02 12:24:18,274 - root - WARNING - Error running lspci: [Errno 2] No such file or directory: 'lspci', fallback to use device_name
277
+ [titan] 2026-01-02 12:24:18,506 - root - INFO - Mixed precision training is handled by fully_shard
278
+ [titan] 2026-01-02 12:24:18,507 - root - INFO - ***** Running training *****
279
+ [titan] 2026-01-02 12:24:18,507 - root - INFO -  Training starts at step 1
280
+ [titan] 2026-01-02 12:24:18,507 - root - INFO -  Number of tokens per sequence = 2,048
281
+ [titan] 2026-01-02 12:24:18,507 - root - INFO -  Gradient Accumulation steps = 16
282
+ [titan] 2026-01-02 12:24:18,507 - root - INFO -  Instantaneous batch size (per device) = 2
283
+ [titan] 2026-01-02 12:24:18,507 - root - INFO -  Global batch size (w. parallel, distributed & accumulation) = 256 (524,288 tokens)
284
+ [titan] 2026-01-02 12:24:18,507 - root - INFO -  Total optimization steps = 30,720 (16,106,127,360 tokens)
285
+ [titan] 2026-01-02 12:24:18,507 - root - INFO -  Warmup steps = 1,024 (536,870,912 tokens)
286
+ [titan] 2026-01-02 12:24:18,507 - root - INFO -  Number of parameters = 14,409,815,040 
287
+ [titan] 2026-01-02 12:24:18,508 - root - INFO - Profiling active. Traces will be saved at /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/profile_trace
288
+ /mnt/scratch/home/carlos/leman-labs/flame/.venv/lib/python3.13/site-packages/torch/_dynamo/variables/functions.py:1692: UserWarning: Dynamo detected a call to a `functools.lru_cache`-wrapped function. Dynamo ignores the cache wrapper and directly traces the wrapped function. Silent incorrectness is only a *potential* risk, not something we have observed. Enable TORCH_LOGS="+dynamo" for a DEBUG stack trace.
289
+ torch._dynamo.utils.warn_once(msg)
290
+ /mnt/scratch/home/carlos/leman-labs/flame/.venv/lib/python3.13/site-packages/torch/_dynamo/variables/functions.py:1598: UserWarning: Dynamo does not know how to trace the builtin `cuda_utils.get_device_properties.` This function is either a Python builtin (e.g. _warnings.warn) or a third-party C/C++ Python extension (perhaps created with pybind).
291
+ If it is a Python builtin, please file an issue on GitHub so the PyTorch team can add support for it and see the next case for a workaround.
292
+ If it is a third-party C/C++ Python extension, please either wrap it into a PyTorch-understood custom operator (see https://pytorch.org/tutorials/advanced/custom_ops_landing_page.html for more details) or, if it is traceable, use `torch.compiler.allow_in_graph`.
293
+ torch._dynamo.utils.warn_once(explanation + "\n" + "\n".join(hints))
294
+ [titan] 2026-01-02 12:31:16,003 - root - INFO - step: 1 loss: 14.3857 memory: 65.22GiB(82.29%) tps: 157 tflops: 14.35 mfu: 4.60%
295
+ [titan] 2026-01-02 12:31:16,003 - root - INFO - lr: 7.8125e-07 gnorm: 129.00 [ 0:06:57<148 days, 12:29:41]
296
+ [titan] 2026-01-02 12:31:16,003 - root - INFO - Saving the checkpoint (or staging if async is enabled).
297
+ [titan] 2026-01-02 12:31:45,774 - root - INFO - [GC] GC collection invoked by checkpointer. 0.40 seconds.
298
+ [titan] 2026-01-02 12:31:45,775 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 29.77 seconds.
299
+ [titan] 2026-01-02 12:31:45,775 - root - INFO - Synchronizing and adjusting timeout for all ProcessGroups to 0:01:40
300
+ [titan] 2026-01-02 12:32:27,287 - root - INFO - step: 2 loss: 14.3989 memory: 69.12GiB(87.21%) tps: 919 tflops: 84.11 mfu: 26.96%
301
+ [titan] 2026-01-02 12:32:27,288 - root - INFO - lr: 1.1719e-06 gnorm: 127.00 [ 0:08:09<86 days, 22:18:58]
302
+ [titan] 2026-01-02 12:33:08,758 - root - INFO - step: 3 loss: 14.3929 memory: 69.12GiB(87.21%) tps: 1,580 tflops: 144.59 mfu: 46.34%
303
+ [titan] 2026-01-02 12:33:08,758 - root - INFO - lr: 1.5625e-06 gnorm: 126.00 [ 0:08:50<62 days, 20:46:52]
304
+ [titan] 2026-01-02 12:33:50,300 - root - INFO - step: 4 loss: 14.2932 memory: 69.12GiB(87.21%) tps: 1,578 tflops: 144.34 mfu: 46.26%
305
+ [titan] 2026-01-02 12:33:50,300 - root - INFO - lr: 1.9531e-06 gnorm: 128.00 [ 0:09:32<50 days, 20:09:32]
306
+ [titan] 2026-01-02 12:34:31,898 - root - INFO - step: 5 loss: 14.2689 memory: 69.12GiB(87.21%) tps: 1,575 tflops: 144.14 mfu: 46.20%
307
+ [titan] 2026-01-02 12:34:31,898 - root - INFO - lr: 2.3438e-06 gnorm: 124.00 [ 0:10:13<43 days, 15:04:36]
308
+ [titan] 2026-01-02 12:35:13,509 - root - INFO - step: 6 loss: 13.9979 memory: 69.12GiB(87.21%) tps: 1,575 tflops: 144.10 mfu: 46.18%
309
+ [titan] 2026-01-02 12:35:13,509 - root - INFO - lr: 2.7344e-06 gnorm: 117.00 [ 0:10:55<38 days, 19:42:14]
310
+ [titan] 2026-01-02 12:35:55,155 - root - INFO - step: 7 loss: 13.8167 memory: 69.12GiB(87.21%) tps: 1,574 tflops: 143.98 mfu: 46.15%
311
+ [titan] 2026-01-02 12:35:55,155 - root - INFO - lr: 3.1250e-06 gnorm: 113.00 [ 0:11:36<35 days, 9:20:01]
312
+ [titan] 2026-01-02 12:36:36,792 - root - INFO - step: 8 loss: 13.5683 memory: 69.12GiB(87.21%) tps: 1,574 tflops: 144.01 mfu: 46.16%
313
+ [titan] 2026-01-02 12:36:36,792 - root - INFO - lr: 3.5156e-06 gnorm: 106.50 [ 0:12:18<32 days, 19:32:37]
314
+ [titan] 2026-01-02 12:37:18,455 - root - INFO - step: 9 loss: 13.3760 memory: 69.12GiB(87.21%) tps: 1,573 tflops: 143.92 mfu: 46.13%
315
+ [titan] 2026-01-02 12:37:18,456 - root - INFO - lr: 3.9063e-06 gnorm: 101.00 [ 0:13:00<30 days, 19:30:24]
316
+ [titan] 2026-01-02 12:38:00,109 - root - INFO - step: 10 loss: 13.1097 memory: 69.12GiB(87.21%) tps: 1,573 tflops: 143.95 mfu: 46.14%
317
+ [titan] 2026-01-02 12:38:00,110 - root - INFO - lr: 4.2969e-06 gnorm: 94.50 [ 0:13:41<29 days, 5:04:02]
318
+ [titan] 2026-01-02 12:38:41,790 - root - INFO - step: 11 loss: 12.5536 memory: 69.12GiB(87.21%) tps: 1,572 tflops: 143.86 mfu: 46.11%
319
+ [titan] 2026-01-02 12:38:41,790 - root - INFO - lr: 4.6875e-06 gnorm: 82.00 [ 0:14:23<27 days, 21:38:05]
320
+ [titan] 2026-01-02 12:39:23,448 - root - INFO - step: 12 loss: 12.0247 memory: 69.12GiB(87.21%) tps: 1,573 tflops: 143.93 mfu: 46.13%
321
+ [titan] 2026-01-02 12:39:23,449 - root - INFO - lr: 5.0781e-06 gnorm: 71.50 [ 0:15:05<26 days, 19:25:26]
322
+ [titan] 2026-01-02 12:40:05,132 - root - INFO - step: 13 loss: 11.6076 memory: 69.12GiB(87.21%) tps: 1,572 tflops: 143.85 mfu: 46.11%
323
+ [titan] 2026-01-02 12:40:05,132 - root - INFO - lr: 5.4687e-06 gnorm: 68.50 [ 0:15:46<25 days, 21:15:36]
324
+ [titan] 2026-01-02 12:40:46,797 - root - INFO - step: 14 loss: 11.2488 memory: 69.12GiB(87.21%) tps: 1,573 tflops: 143.91 mfu: 46.13%
325
+ [titan] 2026-01-02 12:40:46,797 - root - INFO - lr: 5.8594e-06 gnorm: 63.75 [ 0:16:28<25 days, 2:14:58]
326
+ [titan] 2026-01-02 12:41:28,477 - root - INFO - step: 15 loss: 10.9254 memory: 69.12GiB(87.21%) tps: 1,572 tflops: 143.86 mfu: 46.11%
327
+ [titan] 2026-01-02 12:41:28,477 - root - INFO - lr: 6.2500e-06 gnorm: 55.50 [ 0:17:10<24 days, 9:46:51]
328
+ [titan] 2026-01-02 12:42:10,134 - root - INFO - step: 16 loss: 10.6961 memory: 69.12GiB(87.21%) tps: 1,573 tflops: 143.94 mfu: 46.13%
329
+ [titan] 2026-01-02 12:42:10,134 - root - INFO - lr: 6.6406e-06 gnorm: 56.50 [ 0:17:51<23 days, 19:21:25]
330
+ [titan] 2026-01-02 12:42:51,815 - root - INFO - step: 17 loss: 10.3915 memory: 69.12GiB(87.21%) tps: 1,572 tflops: 143.86 mfu: 46.11%
331
+ [titan] 2026-01-02 12:42:51,815 - root - INFO - lr: 7.0313e-06 gnorm: 42.75 [ 0:18:33<23 days, 6:38:27]
332
+ [titan] 2026-01-02 12:43:33,461 - root - INFO - step: 18 loss: 10.1740 memory: 69.12GiB(87.21%) tps: 1,574 tflops: 143.98 mfu: 46.15%
333
+ [titan] 2026-01-02 12:43:33,461 - root - INFO - lr: 7.4219e-06 gnorm: 32.75 [ 0:19:15<22 days, 19:19:10]
logs/none_rci5peh0/attempt_0/1/stderr.log ADDED
@@ -0,0 +1,332 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [titan] 2026-01-02 12:21:12,075 - root - INFO - Starting job: default job
2
+ [titan] 2026-01-02 12:21:12,075 - root - INFO - {
3
+ "activation_checkpoint": {
4
+ "mode": "none",
5
+ "selective_ac_option": "2"
6
+ },
7
+ "activation_offload": {
8
+ "mode": "none"
9
+ },
10
+ "checkpoint": {
11
+ "async_mode": "disabled",
12
+ "create_seed_checkpoint": false,
13
+ "enable_checkpoint": true,
14
+ "exclude_from_loading": [],
15
+ "export_dtype": "float32",
16
+ "folder": "checkpoint",
17
+ "initial_load_model_weights_only": true,
18
+ "initial_load_path": null,
19
+ "interval": 3072,
20
+ "interval_type": "steps",
21
+ "keep_latest_k": 0,
22
+ "last_save_model_weights_only": false,
23
+ "load_step": -1,
24
+ "model_weights_only": false
25
+ },
26
+ "comm": {
27
+ "init_timeout_seconds": 300,
28
+ "trace_buf_size": 20000,
29
+ "train_timeout_seconds": 100
30
+ },
31
+ "experimental": {
32
+ "context_parallel_degree": 1,
33
+ "context_parallel_rotate_method": "allgather",
34
+ "custom_model_path": "",
35
+ "enable_async_tensor_parallel": false,
36
+ "enable_compiled_autograd": false,
37
+ "pipeline_parallel_degree": 1,
38
+ "pipeline_parallel_microbatches": null,
39
+ "pipeline_parallel_schedule": "1F1B",
40
+ "pipeline_parallel_schedule_csv": "",
41
+ "pipeline_parallel_split_points": []
42
+ },
43
+ "fault_tolerance": {
44
+ "enable": false,
45
+ "group_size": 0,
46
+ "min_replica_size": 1,
47
+ "replica_id": 0
48
+ },
49
+ "float8": {
50
+ "enable_fsdp_float8_all_gather": false,
51
+ "force_recompute_fp8_weight_in_bwd": false,
52
+ "precompute_float8_dynamic_scale_for_fsdp": false,
53
+ "recipe_name": null
54
+ },
55
+ "job": {
56
+ "config_file": "flame/models/fla.toml",
57
+ "description": "default job",
58
+ "dump_folder": "/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B",
59
+ "print_args": true,
60
+ "use_for_integration_test": false
61
+ },
62
+ "lr_scheduler": {
63
+ "decay_ratio": null,
64
+ "decay_type": "cosine",
65
+ "lr_min": 0.1,
66
+ "warmup_steps": 1024
67
+ },
68
+ "memory_estimation": {
69
+ "disable_fake_mode": false,
70
+ "enabled": false
71
+ },
72
+ "metrics": {
73
+ "disable_color_printing": false,
74
+ "enable_tensorboard": false,
75
+ "enable_wandb": true,
76
+ "log_freq": 1,
77
+ "save_for_all_ranks": false,
78
+ "save_tb_folder": "tb"
79
+ },
80
+ "model": {
81
+ "config": "/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/config.json",
82
+ "converters": [],
83
+ "name": "fla",
84
+ "print_after_conversion": false,
85
+ "tokenizer_path": "/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B"
86
+ },
87
+ "optimizer": {
88
+ "beta1": 0.9,
89
+ "beta2": 0.95,
90
+ "early_step_in_backward": false,
91
+ "eps": 1e-15,
92
+ "implementation": "fused",
93
+ "lr": 0.0004,
94
+ "name": "AdamW",
95
+ "weight_decay": 0.1
96
+ },
97
+ "profiling": {
98
+ "enable_memory_snapshot": false,
99
+ "enable_profiling": true,
100
+ "profile_freq": 512,
101
+ "save_memory_snapshot_folder": "memory_snapshot",
102
+ "save_traces_folder": "profile_trace"
103
+ },
104
+ "training": {
105
+ "batch_size": 2,
106
+ "compile": true,
107
+ "context_len": 2048,
108
+ "data_dir": null,
109
+ "data_files": null,
110
+ "data_parallel_replicate_degree": 1,
111
+ "data_parallel_shard_degree": 8,
112
+ "data_probs": null,
113
+ "dataset": "/mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu",
114
+ "dataset_name": "default",
115
+ "dataset_split": "train",
116
+ "deterministic": false,
117
+ "disable_loss_parallel": true,
118
+ "enable_cpu_offload": false,
119
+ "fsdp_reshard_after_forward": "default",
120
+ "gc_freq": 50,
121
+ "gradient_accumulation_steps": 16,
122
+ "max_norm": 1.0,
123
+ "mixed_precision_param": "bfloat16",
124
+ "mixed_precision_reduce": "float32",
125
+ "num_workers": 8,
126
+ "persistent_workers": false,
127
+ "pin_memory": false,
128
+ "prefetch_factor": 2,
129
+ "seed": 42,
130
+ "seq_len": 2048,
131
+ "skip_nan_inf": true,
132
+ "steps": 30720,
133
+ "streaming": true,
134
+ "tensor_parallel_degree": 1,
135
+ "varlen": false
136
+ }
137
+ }
138
+ [titan] 2026-01-02 12:21:12,076 - root - INFO - [GC] Initial GC collection. 0.00 seconds.
139
+ [titan] 2026-01-02 12:21:13,372 - root - WARNING - ENV[TORCH_NCCL_ASYNC_ERROR_HANDLING] = 1 will be overridden to 3 based on job config
140
+ [titan] 2026-01-02 12:21:13,376 - root - INFO - CUDA capacity: NVIDIA A100-SXM4-80GB with 79.25GiB memory
141
+ [titan] 2026-01-02 12:21:13,378 - root - WARNING - Error running lspci: [Errno 2] No such file or directory: 'lspci', fallback to use device_name
142
+ [titan] 2026-01-02 12:21:13,378 - root - INFO - Peak FLOPS used for computing MFU: 3.120e+14
143
+ [titan] 2026-01-02 12:21:13,378 - root - INFO - Building 1-D device mesh with ['dp_shard'], [8]
144
+ [titan] 2026-01-02 12:21:13,424 - root - INFO - Loading tokenizer...
145
+ The tokenizer you are loading from '/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.
146
+ [titan] 2026-01-02 12:21:13,768 - root - INFO - Qwen2TokenizerFast(name_or_path='/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B', vocab_size=151643, model_max_length=10000000000, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'eos_token': '<|im_end|>', 'pad_token': '<|endoftext|>', 'additional_special_tokens': ['<|im_start|>', '<|im_end|>', '<|object_ref_start|>', '<|object_ref_end|>', '<|box_start|>', '<|box_end|>', '<|quad_start|>', '<|quad_end|>', '<|vision_start|>', '<|vision_end|>', '<|vision_pad|>', '<|image_pad|>', '<|video_pad|>']}, clean_up_tokenization_spaces=False, added_tokens_decoder={
147
+ 151643: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
148
+ 151644: AddedToken("<|im_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
149
+ 151645: AddedToken("<|im_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
150
+ 151646: AddedToken("<|object_ref_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
151
+ 151647: AddedToken("<|object_ref_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
152
+ 151648: AddedToken("<|box_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
153
+ 151649: AddedToken("<|box_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
154
+ 151650: AddedToken("<|quad_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
155
+ 151651: AddedToken("<|quad_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
156
+ 151652: AddedToken("<|vision_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
157
+ 151653: AddedToken("<|vision_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
158
+ 151654: AddedToken("<|vision_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
159
+ 151655: AddedToken("<|image_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
160
+ 151656: AddedToken("<|video_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
161
+ 151657: AddedToken("<tool_call>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
162
+ 151658: AddedToken("</tool_call>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
163
+ 151659: AddedToken("<|fim_prefix|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
164
+ 151660: AddedToken("<|fim_middle|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
165
+ 151661: AddedToken("<|fim_suffix|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
166
+ 151662: AddedToken("<|fim_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
167
+ 151663: AddedToken("<|repo_name|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
168
+ 151664: AddedToken("<|file_sep|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
169
+ 151665: AddedToken("<tool_response>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
170
+ 151666: AddedToken("</tool_response>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
171
+ 151667: AddedToken("<think>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
172
+ 151668: AddedToken("</think>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
173
+ }
174
+ )
175
+ [titan] 2026-01-02 12:21:13,768 - root - INFO - Loading dataset /mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu:default
176
+ `trust_remote_code` is not supported anymore.
177
+ Please check that the Hugging Face dataset '/mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu' isn't based on a loading script and remove `trust_remote_code`.
178
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
179
+ [titan] 2026-01-02 12:21:13,768 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
180
+ Please check that the Hugging Face dataset '/mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu' isn't based on a loading script and remove `trust_remote_code`.
181
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
182
+ [titan] 2026-01-02 12:21:14,322 - root - INFO - Shuffling the dataset with seed 42
183
+ [titan] 2026-01-02 12:21:14,324 - root - INFO - IterableDataset({
184
+ features: ['text', 'id', 'dump', 'url', 'file_path', 'language', 'language_score', 'token_count', 'score', 'int_score'],
185
+ num_shards: 360
186
+ })
187
+ [titan] 2026-01-02 12:21:14,324 - root - INFO - Building dataloader...
188
+ [titan] 2026-01-02 12:21:14,325 - root - INFO - Loading model config from /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/config.json
189
+ [titan] 2026-01-02 12:21:14,327 - root - INFO - Building model from the config
190
+ GSAConfig {
191
+ "architectures": [
192
+ "GSAForCausalLM"
193
+ ],
194
+ "attn": null,
195
+ "bos_token_id": 151643,
196
+ "clamp_max": null,
197
+ "clamp_min": null,
198
+ "conv_size": 4,
199
+ "dtype": "bfloat16",
200
+ "elementwise_affine": false,
201
+ "eos_token_id": 151645,
202
+ "expand_k": 1,
203
+ "expand_v": 1,
204
+ "feature_map": "swish",
205
+ "fuse_cross_entropy": true,
206
+ "fuse_linear_cross_entropy": false,
207
+ "fuse_norm": true,
208
+ "fuse_swiglu": true,
209
+ "gate_logit_normalizer": 8,
210
+ "hidden_act": "swish",
211
+ "hidden_ratio": 4,
212
+ "hidden_size": 5120,
213
+ "initializer_range": 0.02,
214
+ "intermediate_size": 17408,
215
+ "max_position_embeddings": 40960,
216
+ "model_type": "gsa",
217
+ "norm_eps": 1e-06,
218
+ "num_heads": 40,
219
+ "num_hidden_layers": 40,
220
+ "num_kv_heads": 8,
221
+ "num_slots": 256,
222
+ "rope_theta": 1000000,
223
+ "share_conv_kernel": true,
224
+ "tie_word_embeddings": true,
225
+ "transformers_version": "4.57.3",
226
+ "use_cache": true,
227
+ "use_l2warp": false,
228
+ "use_norm": true,
229
+ "use_output_gate": true,
230
+ "use_rope": false,
231
+ "use_short_conv": false,
232
+ "vocab_size": 151936
233
+ }
234
+ 
235
+ [titan] 2026-01-02 12:21:14,480 - root - INFO - 
236
+ GSAForCausalLM(
237
+ (model): GSAModel(
238
+ (embeddings): Embedding(151936, 5120)
239
+ (layers): ModuleList(
240
+ (0-39): 40 x GSABlock(
241
+ (attn_norm): RMSNorm(5120, eps=1e-06)
242
+ (attn): GatedSlotAttention(
243
+ (feature_map): SwishFeatureMap()
244
+ (q_proj): Linear(in_features=5120, out_features=5120, bias=False)
245
+ (k_proj): Linear(in_features=5120, out_features=1024, bias=False)
246
+ (v_proj): Linear(in_features=5120, out_features=1024, bias=False)
247
+ (f_proj): Linear(in_features=5120, out_features=2048, bias=False)
248
+ (g_norm): RMSNorm(5120, elementwise_affine=False, eps=1e-06)
249
+ (o_proj): Linear(in_features=5120, out_features=5120, bias=False)
250
+ )
251
+ (mlp_norm): RMSNorm(5120, eps=1e-06)
252
+ (mlp): GatedMLP(
253
+ (gate_proj): Linear(in_features=5120, out_features=17408, bias=False)
254
+ (up_proj): Linear(in_features=5120, out_features=17408, bias=False)
255
+ (down_proj): Linear(in_features=17408, out_features=5120, bias=False)
256
+ (swiglu_linear): SwiGLULinear()
257
+ )
258
+ )
259
+ )
260
+ (norm): RMSNorm(5120, eps=1e-06)
261
+ )
262
+ (lm_head): Linear(in_features=5120, out_features=151936, bias=False)
263
+ )
264
+
265
+ [titan] 2026-01-02 12:21:14,537 - root - INFO - Compiling each block with torch.compile
266
+ [titan] 2026-01-02 12:21:14,537 - root - INFO - Compiling the embedding, norm, and lm_head layers with torch.compile
267
+ [titan] 2026-01-02 12:21:14,538 - root - INFO - Compiling the entire model with torch.compile
268
+ [titan] 2026-01-02 12:21:14,682 - root - INFO - Applied FSDP to the model
269
+ [titan] 2026-01-02 12:21:15,137 - root - INFO - CUDA memory usage for model: 3.56GiB(4.49%)
270
+ [titan] 2026-01-02 12:21:15,184 - root - INFO - Checkpointing active. Checkpoints will be loaded from and saved to /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/checkpoint
271
+ [titan] 2026-01-02 12:21:15,184 - root - INFO - Loading the checkpoint from /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/checkpoint/step-0.
272
+ [titan] 2026-01-02 12:24:11,331 - root - INFO - [GC] GC collection for checkpoint loading. 0.01 seconds.
273
+ [titan] 2026-01-02 12:24:11,331 - root - INFO - Finished loading the checkpoint in 176.15 seconds.
274
+ [titan] 2026-01-02 12:24:11,349 - root - INFO - CUDA capacity: NVIDIA A100-SXM4-80GB with 79.25GiB memory
275
+ [titan] 2026-01-02 12:24:11,351 - root - WARNING - Error running lspci: [Errno 2] No such file or directory: 'lspci', fallback to use device_name
276
+ [titan] 2026-01-02 12:24:11,553 - root - INFO - Mixed precision training is handled by fully_shard
277
+ [titan] 2026-01-02 12:24:11,553 - root - INFO - ***** Running training *****
278
+ [titan] 2026-01-02 12:24:11,553 - root - INFO -  Training starts at step 1
279
+ [titan] 2026-01-02 12:24:11,553 - root - INFO -  Number of tokens per sequence = 2,048
280
+ [titan] 2026-01-02 12:24:11,554 - root - INFO -  Gradient Accumulation steps = 16
281
+ [titan] 2026-01-02 12:24:11,554 - root - INFO -  Instantaneous batch size (per device) = 2
282
+ [titan] 2026-01-02 12:24:11,554 - root - INFO -  Global batch size (w. parallel, distributed & accumulation) = 256 (524,288 tokens)
283
+ [titan] 2026-01-02 12:24:11,554 - root - INFO -  Total optimization steps = 30,720 (16,106,127,360 tokens)
284
+ [titan] 2026-01-02 12:24:11,554 - root - INFO -  Warmup steps = 1,024 (536,870,912 tokens)
285
+ [titan] 2026-01-02 12:24:11,554 - root - INFO -  Number of parameters = 14,409,815,040 
286
+ [titan] 2026-01-02 12:24:11,554 - root - INFO - Profiling active. Traces will be saved at /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/profile_trace
287
+ /mnt/scratch/home/carlos/leman-labs/flame/.venv/lib/python3.13/site-packages/torch/_dynamo/variables/functions.py:1692: UserWarning: Dynamo detected a call to a `functools.lru_cache`-wrapped function. Dynamo ignores the cache wrapper and directly traces the wrapped function. Silent incorrectness is only a *potential* risk, not something we have observed. Enable TORCH_LOGS="+dynamo" for a DEBUG stack trace.
288
+ torch._dynamo.utils.warn_once(msg)
289
+ /mnt/scratch/home/carlos/leman-labs/flame/.venv/lib/python3.13/site-packages/torch/_dynamo/variables/functions.py:1598: UserWarning: Dynamo does not know how to trace the builtin `cuda_utils.get_device_properties.` This function is either a Python builtin (e.g. _warnings.warn) or a third-party C/C++ Python extension (perhaps created with pybind).
290
+ If it is a Python builtin, please file an issue on GitHub so the PyTorch team can add support for it and see the next case for a workaround.
291
+ If it is a third-party C/C++ Python extension, please either wrap it into a PyTorch-understood custom operator (see https://pytorch.org/tutorials/advanced/custom_ops_landing_page.html for more details) or, if it is traceable, use `torch.compiler.allow_in_graph`.
292
+ torch._dynamo.utils.warn_once(explanation + "\n" + "\n".join(hints))
293
+ [titan] 2026-01-02 12:31:16,003 - root - INFO - step: 1 loss: 14.3857 memory: 65.22GiB(82.29%) tps: 154 tflops: 14.12 mfu: 4.53%
294
+ [titan] 2026-01-02 12:31:16,003 - root - INFO - lr: 7.8125e-07 gnorm: 129.00 [ 0:07:04<150 days, 23:34:10]
295
+ [titan] 2026-01-02 12:31:16,003 - root - INFO - Saving the checkpoint (or staging if async is enabled).
296
+ [titan] 2026-01-02 12:31:45,818 - root - INFO - [GC] GC collection invoked by checkpointer. 0.44 seconds.
297
+ [titan] 2026-01-02 12:31:45,818 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 29.82 seconds.
298
+ [titan] 2026-01-02 12:31:45,819 - root - INFO - Synchronizing and adjusting timeout for all ProcessGroups to 0:01:40
299
+ [titan] 2026-01-02 12:32:27,287 - root - INFO - step: 2 loss: 14.3989 memory: 69.11GiB(87.20%) tps: 919 tflops: 84.11 mfu: 26.96%
300
+ [titan] 2026-01-02 12:32:27,288 - root - INFO - lr: 1.1719e-06 gnorm: 127.00 [ 0:08:15<88 days, 3:51:10]
301
+ [titan] 2026-01-02 12:33:08,758 - root - INFO - step: 3 loss: 14.3929 memory: 69.11GiB(87.20%) tps: 1,580 tflops: 144.59 mfu: 46.34%
302
+ [titan] 2026-01-02 12:33:08,759 - root - INFO - lr: 1.5625e-06 gnorm: 126.00 [ 0:08:57<63 days, 16:28:16]
303
+ [titan] 2026-01-02 12:33:50,300 - root - INFO - step: 4 loss: 14.2932 memory: 69.11GiB(87.20%) tps: 1,578 tflops: 144.34 mfu: 46.26%
304
+ [titan] 2026-01-02 12:33:50,300 - root - INFO - lr: 1.9531e-06 gnorm: 128.00 [ 0:09:38<51 days, 10:55:33]
305
+ [titan] 2026-01-02 12:34:31,898 - root - INFO - step: 5 loss: 14.2689 memory: 69.11GiB(87.20%) tps: 1,575 tflops: 144.14 mfu: 46.20%
306
+ [titan] 2026-01-02 12:34:31,898 - root - INFO - lr: 2.3438e-06 gnorm: 124.00 [ 0:10:20<44 days, 2:53:23]
307
+ [titan] 2026-01-02 12:35:13,509 - root - INFO - step: 6 loss: 13.9979 memory: 69.11GiB(87.20%) tps: 1,575 tflops: 144.10 mfu: 46.19%
308
+ [titan] 2026-01-02 12:35:13,509 - root - INFO - lr: 2.7344e-06 gnorm: 117.00 [ 0:11:02<39 days, 5:32:51]
309
+ [titan] 2026-01-02 12:35:55,155 - root - INFO - step: 7 loss: 13.8167 memory: 69.11GiB(87.20%) tps: 1,574 tflops: 143.98 mfu: 46.15%
310
+ [titan] 2026-01-02 12:35:55,155 - root - INFO - lr: 3.1250e-06 gnorm: 113.00 [ 0:11:43<35 days, 17:46:14]
311
+ [titan] 2026-01-02 12:36:36,792 - root - INFO - step: 8 loss: 13.5683 memory: 69.11GiB(87.20%) tps: 1,574 tflops: 144.01 mfu: 46.16%
312
+ [titan] 2026-01-02 12:36:36,792 - root - INFO - lr: 3.5156e-06 gnorm: 106.50 [ 0:12:25<33 days, 2:55:32]
313
+ [titan] 2026-01-02 12:37:18,455 - root - INFO - step: 9 loss: 13.3760 memory: 69.11GiB(87.20%) tps: 1,573 tflops: 143.92 mfu: 46.13%
314
+ [titan] 2026-01-02 12:37:18,455 - root - INFO - lr: 3.9063e-06 gnorm: 101.00 [ 0:13:07<31 days, 2:04:07]
315
+ [titan] 2026-01-02 12:38:00,109 - root - INFO - step: 10 loss: 13.1097 memory: 69.11GiB(87.20%) tps: 1,573 tflops: 143.95 mfu: 46.14%
316
+ [titan] 2026-01-02 12:38:00,110 - root - INFO - lr: 4.2969e-06 gnorm: 94.50 [ 0:13:48<29 days, 10:58:22]
317
+ [titan] 2026-01-02 12:38:41,790 - root - INFO - step: 11 loss: 12.5536 memory: 69.11GiB(87.20%) tps: 1,572 tflops: 143.86 mfu: 46.11%
318
+ [titan] 2026-01-02 12:38:41,790 - root - INFO - lr: 4.6875e-06 gnorm: 82.00 [ 0:14:30<28 days, 3:00:12]
319
+ [titan] 2026-01-02 12:39:23,448 - root - INFO - step: 12 loss: 12.0247 memory: 69.11GiB(87.20%) tps: 1,573 tflops: 143.93 mfu: 46.13%
320
+ [titan] 2026-01-02 12:39:23,448 - root - INFO - lr: 5.0781e-06 gnorm: 71.50 [ 0:15:12<27 days, 0:20:42]
321
+ [titan] 2026-01-02 12:40:05,132 - root - INFO - step: 13 loss: 11.6076 memory: 69.11GiB(87.20%) tps: 1,572 tflops: 143.85 mfu: 46.10%
322
+ [titan] 2026-01-02 12:40:05,132 - root - INFO - lr: 5.4687e-06 gnorm: 68.50 [ 0:15:53<26 days, 1:48:09]
323
+ [titan] 2026-01-02 12:40:46,797 - root - INFO - step: 14 loss: 11.2488 memory: 69.11GiB(87.20%) tps: 1,573 tflops: 143.91 mfu: 46.13%
324
+ [titan] 2026-01-02 12:40:46,797 - root - INFO - lr: 5.8594e-06 gnorm: 63.75 [ 0:16:35<25 days, 6:28:03]
325
+ [titan] 2026-01-02 12:41:28,477 - root - INFO - step: 15 loss: 10.9254 memory: 69.11GiB(87.20%) tps: 1,572 tflops: 143.86 mfu: 46.11%
326
+ [titan] 2026-01-02 12:41:28,477 - root - INFO - lr: 6.2500e-06 gnorm: 55.50 [ 0:17:17<24 days, 13:43:02]
327
+ [titan] 2026-01-02 12:42:10,134 - root - INFO - step: 16 loss: 10.6961 memory: 69.11GiB(87.20%) tps: 1,573 tflops: 143.94 mfu: 46.13%
328
+ [titan] 2026-01-02 12:42:10,134 - root - INFO - lr: 6.6406e-06 gnorm: 56.50 [ 0:17:58<23 days, 23:02:50]
329
+ [titan] 2026-01-02 12:42:51,815 - root - INFO - step: 17 loss: 10.3915 memory: 69.11GiB(87.20%) tps: 1,572 tflops: 143.86 mfu: 46.11%
330
+ [titan] 2026-01-02 12:42:51,815 - root - INFO - lr: 7.0313e-06 gnorm: 42.75 [ 0:18:40<23 days, 10:06:50]
331
+ [titan] 2026-01-02 12:43:33,461 - root - INFO - step: 18 loss: 10.1740 memory: 69.11GiB(87.20%) tps: 1,574 tflops: 143.98 mfu: 46.15%
332
+ [titan] 2026-01-02 12:43:33,461 - root - INFO - lr: 7.4219e-06 gnorm: 32.75 [ 0:19:22<22 days, 22:35:57]
logs/none_rci5peh0/attempt_0/1/stdout.log ADDED
File without changes
logs/none_rci5peh0/attempt_0/2/stderr.log ADDED
@@ -0,0 +1,332 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [titan] 2026-01-02 12:21:12,073 - root - INFO - Starting job: default job
2
+ [titan] 2026-01-02 12:21:12,073 - root - INFO - {
3
+ "activation_checkpoint": {
4
+ "mode": "none",
5
+ "selective_ac_option": "2"
6
+ },
7
+ "activation_offload": {
8
+ "mode": "none"
9
+ },
10
+ "checkpoint": {
11
+ "async_mode": "disabled",
12
+ "create_seed_checkpoint": false,
13
+ "enable_checkpoint": true,
14
+ "exclude_from_loading": [],
15
+ "export_dtype": "float32",
16
+ "folder": "checkpoint",
17
+ "initial_load_model_weights_only": true,
18
+ "initial_load_path": null,
19
+ "interval": 3072,
20
+ "interval_type": "steps",
21
+ "keep_latest_k": 0,
22
+ "last_save_model_weights_only": false,
23
+ "load_step": -1,
24
+ "model_weights_only": false
25
+ },
26
+ "comm": {
27
+ "init_timeout_seconds": 300,
28
+ "trace_buf_size": 20000,
29
+ "train_timeout_seconds": 100
30
+ },
31
+ "experimental": {
32
+ "context_parallel_degree": 1,
33
+ "context_parallel_rotate_method": "allgather",
34
+ "custom_model_path": "",
35
+ "enable_async_tensor_parallel": false,
36
+ "enable_compiled_autograd": false,
37
+ "pipeline_parallel_degree": 1,
38
+ "pipeline_parallel_microbatches": null,
39
+ "pipeline_parallel_schedule": "1F1B",
40
+ "pipeline_parallel_schedule_csv": "",
41
+ "pipeline_parallel_split_points": []
42
+ },
43
+ "fault_tolerance": {
44
+ "enable": false,
45
+ "group_size": 0,
46
+ "min_replica_size": 1,
47
+ "replica_id": 0
48
+ },
49
+ "float8": {
50
+ "enable_fsdp_float8_all_gather": false,
51
+ "force_recompute_fp8_weight_in_bwd": false,
52
+ "precompute_float8_dynamic_scale_for_fsdp": false,
53
+ "recipe_name": null
54
+ },
55
+ "job": {
56
+ "config_file": "flame/models/fla.toml",
57
+ "description": "default job",
58
+ "dump_folder": "/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B",
59
+ "print_args": true,
60
+ "use_for_integration_test": false
61
+ },
62
+ "lr_scheduler": {
63
+ "decay_ratio": null,
64
+ "decay_type": "cosine",
65
+ "lr_min": 0.1,
66
+ "warmup_steps": 1024
67
+ },
68
+ "memory_estimation": {
69
+ "disable_fake_mode": false,
70
+ "enabled": false
71
+ },
72
+ "metrics": {
73
+ "disable_color_printing": false,
74
+ "enable_tensorboard": false,
75
+ "enable_wandb": true,
76
+ "log_freq": 1,
77
+ "save_for_all_ranks": false,
78
+ "save_tb_folder": "tb"
79
+ },
80
+ "model": {
81
+ "config": "/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/config.json",
82
+ "converters": [],
83
+ "name": "fla",
84
+ "print_after_conversion": false,
85
+ "tokenizer_path": "/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B"
86
+ },
87
+ "optimizer": {
88
+ "beta1": 0.9,
89
+ "beta2": 0.95,
90
+ "early_step_in_backward": false,
91
+ "eps": 1e-15,
92
+ "implementation": "fused",
93
+ "lr": 0.0004,
94
+ "name": "AdamW",
95
+ "weight_decay": 0.1
96
+ },
97
+ "profiling": {
98
+ "enable_memory_snapshot": false,
99
+ "enable_profiling": true,
100
+ "profile_freq": 512,
101
+ "save_memory_snapshot_folder": "memory_snapshot",
102
+ "save_traces_folder": "profile_trace"
103
+ },
104
+ "training": {
105
+ "batch_size": 2,
106
+ "compile": true,
107
+ "context_len": 2048,
108
+ "data_dir": null,
109
+ "data_files": null,
110
+ "data_parallel_replicate_degree": 1,
111
+ "data_parallel_shard_degree": 8,
112
+ "data_probs": null,
113
+ "dataset": "/mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu",
114
+ "dataset_name": "default",
115
+ "dataset_split": "train",
116
+ "deterministic": false,
117
+ "disable_loss_parallel": true,
118
+ "enable_cpu_offload": false,
119
+ "fsdp_reshard_after_forward": "default",
120
+ "gc_freq": 50,
121
+ "gradient_accumulation_steps": 16,
122
+ "max_norm": 1.0,
123
+ "mixed_precision_param": "bfloat16",
124
+ "mixed_precision_reduce": "float32",
125
+ "num_workers": 8,
126
+ "persistent_workers": false,
127
+ "pin_memory": false,
128
+ "prefetch_factor": 2,
129
+ "seed": 42,
130
+ "seq_len": 2048,
131
+ "skip_nan_inf": true,
132
+ "steps": 30720,
133
+ "streaming": true,
134
+ "tensor_parallel_degree": 1,
135
+ "varlen": false
136
+ }
137
+ }
138
+ [titan] 2026-01-02 12:21:12,073 - root - INFO - [GC] Initial GC collection. 0.00 seconds.
139
+ [titan] 2026-01-02 12:21:13,343 - root - WARNING - ENV[TORCH_NCCL_ASYNC_ERROR_HANDLING] = 1 will be overridden to 3 based on job config
140
+ [titan] 2026-01-02 12:21:13,353 - root - INFO - CUDA capacity: NVIDIA A100-SXM4-80GB with 79.25GiB memory
141
+ [titan] 2026-01-02 12:21:13,355 - root - WARNING - Error running lspci: [Errno 2] No such file or directory: 'lspci', fallback to use device_name
142
+ [titan] 2026-01-02 12:21:13,355 - root - INFO - Peak FLOPS used for computing MFU: 3.120e+14
143
+ [titan] 2026-01-02 12:21:13,356 - root - INFO - Building 1-D device mesh with ['dp_shard'], [8]
144
+ [titan] 2026-01-02 12:21:13,419 - root - INFO - Loading tokenizer...
145
+ The tokenizer you are loading from '/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.
146
+ [titan] 2026-01-02 12:21:13,761 - root - INFO - Qwen2TokenizerFast(name_or_path='/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B', vocab_size=151643, model_max_length=10000000000, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'eos_token': '<|im_end|>', 'pad_token': '<|endoftext|>', 'additional_special_tokens': ['<|im_start|>', '<|im_end|>', '<|object_ref_start|>', '<|object_ref_end|>', '<|box_start|>', '<|box_end|>', '<|quad_start|>', '<|quad_end|>', '<|vision_start|>', '<|vision_end|>', '<|vision_pad|>', '<|image_pad|>', '<|video_pad|>']}, clean_up_tokenization_spaces=False, added_tokens_decoder={
147
+ 151643: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
148
+ 151644: AddedToken("<|im_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
149
+ 151645: AddedToken("<|im_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
150
+ 151646: AddedToken("<|object_ref_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
151
+ 151647: AddedToken("<|object_ref_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
152
+ 151648: AddedToken("<|box_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
153
+ 151649: AddedToken("<|box_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
154
+ 151650: AddedToken("<|quad_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
155
+ 151651: AddedToken("<|quad_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
156
+ 151652: AddedToken("<|vision_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
157
+ 151653: AddedToken("<|vision_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
158
+ 151654: AddedToken("<|vision_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
159
+ 151655: AddedToken("<|image_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
160
+ 151656: AddedToken("<|video_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
161
+ 151657: AddedToken("<tool_call>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
162
+ 151658: AddedToken("</tool_call>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
163
+ 151659: AddedToken("<|fim_prefix|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
164
+ 151660: AddedToken("<|fim_middle|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
165
+ 151661: AddedToken("<|fim_suffix|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
166
+ 151662: AddedToken("<|fim_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
167
+ 151663: AddedToken("<|repo_name|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
168
+ 151664: AddedToken("<|file_sep|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
169
+ 151665: AddedToken("<tool_response>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
170
+ 151666: AddedToken("</tool_response>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
171
+ 151667: AddedToken("<think>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
172
+ 151668: AddedToken("</think>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
173
+ }
174
+ )
175
+ [titan] 2026-01-02 12:21:13,761 - root - INFO - Loading dataset /mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu:default
176
+ `trust_remote_code` is not supported anymore.
177
+ Please check that the Hugging Face dataset '/mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu' isn't based on a loading script and remove `trust_remote_code`.
178
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
179
+ [titan] 2026-01-02 12:21:13,761 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
180
+ Please check that the Hugging Face dataset '/mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu' isn't based on a loading script and remove `trust_remote_code`.
181
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
182
+ [titan] 2026-01-02 12:21:14,322 - root - INFO - Shuffling the dataset with seed 42
183
+ [titan] 2026-01-02 12:21:14,324 - root - INFO - IterableDataset({
184
+ features: ['text', 'id', 'dump', 'url', 'file_path', 'language', 'language_score', 'token_count', 'score', 'int_score'],
185
+ num_shards: 360
186
+ })
187
+ [titan] 2026-01-02 12:21:14,324 - root - INFO - Building dataloader...
188
+ [titan] 2026-01-02 12:21:14,325 - root - INFO - Loading model config from /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/config.json
189
+ [titan] 2026-01-02 12:21:14,327 - root - INFO - Building model from the config
190
+ GSAConfig {
191
+ "architectures": [
192
+ "GSAForCausalLM"
193
+ ],
194
+ "attn": null,
195
+ "bos_token_id": 151643,
196
+ "clamp_max": null,
197
+ "clamp_min": null,
198
+ "conv_size": 4,
199
+ "dtype": "bfloat16",
200
+ "elementwise_affine": false,
201
+ "eos_token_id": 151645,
202
+ "expand_k": 1,
203
+ "expand_v": 1,
204
+ "feature_map": "swish",
205
+ "fuse_cross_entropy": true,
206
+ "fuse_linear_cross_entropy": false,
207
+ "fuse_norm": true,
208
+ "fuse_swiglu": true,
209
+ "gate_logit_normalizer": 8,
210
+ "hidden_act": "swish",
211
+ "hidden_ratio": 4,
212
+ "hidden_size": 5120,
213
+ "initializer_range": 0.02,
214
+ "intermediate_size": 17408,
215
+ "max_position_embeddings": 40960,
216
+ "model_type": "gsa",
217
+ "norm_eps": 1e-06,
218
+ "num_heads": 40,
219
+ "num_hidden_layers": 40,
220
+ "num_kv_heads": 8,
221
+ "num_slots": 256,
222
+ "rope_theta": 1000000,
223
+ "share_conv_kernel": true,
224
+ "tie_word_embeddings": true,
225
+ "transformers_version": "4.57.3",
226
+ "use_cache": true,
227
+ "use_l2warp": false,
228
+ "use_norm": true,
229
+ "use_output_gate": true,
230
+ "use_rope": false,
231
+ "use_short_conv": false,
232
+ "vocab_size": 151936
233
+ }
234
+ 
235
+ [titan] 2026-01-02 12:21:14,479 - root - INFO - 
236
+ GSAForCausalLM(
237
+ (model): GSAModel(
238
+ (embeddings): Embedding(151936, 5120)
239
+ (layers): ModuleList(
240
+ (0-39): 40 x GSABlock(
241
+ (attn_norm): RMSNorm(5120, eps=1e-06)
242
+ (attn): GatedSlotAttention(
243
+ (feature_map): SwishFeatureMap()
244
+ (q_proj): Linear(in_features=5120, out_features=5120, bias=False)
245
+ (k_proj): Linear(in_features=5120, out_features=1024, bias=False)
246
+ (v_proj): Linear(in_features=5120, out_features=1024, bias=False)
247
+ (f_proj): Linear(in_features=5120, out_features=2048, bias=False)
248
+ (g_norm): RMSNorm(5120, elementwise_affine=False, eps=1e-06)
249
+ (o_proj): Linear(in_features=5120, out_features=5120, bias=False)
250
+ )
251
+ (mlp_norm): RMSNorm(5120, eps=1e-06)
252
+ (mlp): GatedMLP(
253
+ (gate_proj): Linear(in_features=5120, out_features=17408, bias=False)
254
+ (up_proj): Linear(in_features=5120, out_features=17408, bias=False)
255
+ (down_proj): Linear(in_features=17408, out_features=5120, bias=False)
256
+ (swiglu_linear): SwiGLULinear()
257
+ )
258
+ )
259
+ )
260
+ (norm): RMSNorm(5120, eps=1e-06)
261
+ )
262
+ (lm_head): Linear(in_features=5120, out_features=151936, bias=False)
263
+ )
264
+
265
+ [titan] 2026-01-02 12:21:14,535 - root - INFO - Compiling each block with torch.compile
266
+ [titan] 2026-01-02 12:21:14,535 - root - INFO - Compiling the embedding, norm, and lm_head layers with torch.compile
267
+ [titan] 2026-01-02 12:21:14,536 - root - INFO - Compiling the entire model with torch.compile
268
+ [titan] 2026-01-02 12:21:14,677 - root - INFO - Applied FSDP to the model
269
+ [titan] 2026-01-02 12:21:15,133 - root - INFO - CUDA memory usage for model: 3.56GiB(4.49%)
270
+ [titan] 2026-01-02 12:21:15,180 - root - INFO - Checkpointing active. Checkpoints will be loaded from and saved to /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/checkpoint
271
+ [titan] 2026-01-02 12:21:15,181 - root - INFO - Loading the checkpoint from /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/checkpoint/step-0.
272
+ [titan] 2026-01-02 12:24:11,330 - root - INFO - [GC] GC collection for checkpoint loading. 0.01 seconds.
273
+ [titan] 2026-01-02 12:24:11,330 - root - INFO - Finished loading the checkpoint in 176.15 seconds.
274
+ [titan] 2026-01-02 12:24:11,348 - root - INFO - CUDA capacity: NVIDIA A100-SXM4-80GB with 79.25GiB memory
275
+ [titan] 2026-01-02 12:24:11,351 - root - WARNING - Error running lspci: [Errno 2] No such file or directory: 'lspci', fallback to use device_name
276
+ [titan] 2026-01-02 12:24:11,564 - root - INFO - Mixed precision training is handled by fully_shard
277
+ [titan] 2026-01-02 12:24:11,565 - root - INFO - ***** Running training *****
278
+ [titan] 2026-01-02 12:24:11,565 - root - INFO -  Training starts at step 1
279
+ [titan] 2026-01-02 12:24:11,565 - root - INFO -  Number of tokens per sequence = 2,048
280
+ [titan] 2026-01-02 12:24:11,565 - root - INFO -  Gradient Accumulation steps = 16
281
+ [titan] 2026-01-02 12:24:11,565 - root - INFO -  Instantaneous batch size (per device) = 2
282
+ [titan] 2026-01-02 12:24:11,565 - root - INFO -  Global batch size (w. parallel, distributed & accumulation) = 256 (524,288 tokens)
283
+ [titan] 2026-01-02 12:24:11,565 - root - INFO -  Total optimization steps = 30,720 (16,106,127,360 tokens)
284
+ [titan] 2026-01-02 12:24:11,565 - root - INFO -  Warmup steps = 1,024 (536,870,912 tokens)
285
+ [titan] 2026-01-02 12:24:11,565 - root - INFO -  Number of parameters = 14,409,815,040 
286
+ [titan] 2026-01-02 12:24:11,566 - root - INFO - Profiling active. Traces will be saved at /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/profile_trace
287
+ /mnt/scratch/home/carlos/leman-labs/flame/.venv/lib/python3.13/site-packages/torch/_dynamo/variables/functions.py:1692: UserWarning: Dynamo detected a call to a `functools.lru_cache`-wrapped function. Dynamo ignores the cache wrapper and directly traces the wrapped function. Silent incorrectness is only a *potential* risk, not something we have observed. Enable TORCH_LOGS="+dynamo" for a DEBUG stack trace.
288
+ torch._dynamo.utils.warn_once(msg)
289
+ /mnt/scratch/home/carlos/leman-labs/flame/.venv/lib/python3.13/site-packages/torch/_dynamo/variables/functions.py:1598: UserWarning: Dynamo does not know how to trace the builtin `cuda_utils.get_device_properties.` This function is either a Python builtin (e.g. _warnings.warn) or a third-party C/C++ Python extension (perhaps created with pybind).
290
+ If it is a Python builtin, please file an issue on GitHub so the PyTorch team can add support for it and see the next case for a workaround.
291
+ If it is a third-party C/C++ Python extension, please either wrap it into a PyTorch-understood custom operator (see https://pytorch.org/tutorials/advanced/custom_ops_landing_page.html for more details) or, if it is traceable, use `torch.compiler.allow_in_graph`.
292
+ torch._dynamo.utils.warn_once(explanation + "\n" + "\n".join(hints))
293
+ [titan] 2026-01-02 12:31:16,003 - root - INFO - step: 1 loss: 14.3857 memory: 65.22GiB(82.29%) tps: 154 tflops: 14.12 mfu: 4.53%
294
+ [titan] 2026-01-02 12:31:16,003 - root - INFO - lr: 7.8125e-07 gnorm: 129.00 [ 0:07:04<150 days, 23:34:19]
295
+ [titan] 2026-01-02 12:31:16,003 - root - INFO - Saving the checkpoint (or staging if async is enabled).
296
+ [titan] 2026-01-02 12:31:45,830 - root - INFO - [GC] GC collection invoked by checkpointer. 0.45 seconds.
297
+ [titan] 2026-01-02 12:31:45,830 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 29.83 seconds.
298
+ [titan] 2026-01-02 12:31:45,830 - root - INFO - Synchronizing and adjusting timeout for all ProcessGroups to 0:01:40
299
+ [titan] 2026-01-02 12:32:27,287 - root - INFO - step: 2 loss: 14.3989 memory: 69.11GiB(87.20%) tps: 919 tflops: 84.11 mfu: 26.96%
300
+ [titan] 2026-01-02 12:32:27,288 - root - INFO - lr: 1.1719e-06 gnorm: 127.00 [ 0:08:15<88 days, 3:51:14]
301
+ [titan] 2026-01-02 12:33:08,758 - root - INFO - step: 3 loss: 14.3929 memory: 69.11GiB(87.20%) tps: 1,580 tflops: 144.59 mfu: 46.34%
302
+ [titan] 2026-01-02 12:33:08,759 - root - INFO - lr: 1.5625e-06 gnorm: 126.00 [ 0:08:57<63 days, 16:28:19]
303
+ [titan] 2026-01-02 12:33:50,300 - root - INFO - step: 4 loss: 14.2932 memory: 69.11GiB(87.20%) tps: 1,578 tflops: 144.34 mfu: 46.26%
304
+ [titan] 2026-01-02 12:33:50,300 - root - INFO - lr: 1.9531e-06 gnorm: 128.00 [ 0:09:38<51 days, 10:55:35]
305
+ [titan] 2026-01-02 12:34:31,898 - root - INFO - step: 5 loss: 14.2689 memory: 69.11GiB(87.20%) tps: 1,575 tflops: 144.14 mfu: 46.20%
306
+ [titan] 2026-01-02 12:34:31,898 - root - INFO - lr: 2.3438e-06 gnorm: 124.00 [ 0:10:20<44 days, 2:53:25]
307
+ [titan] 2026-01-02 12:35:13,509 - root - INFO - step: 6 loss: 13.9979 memory: 69.11GiB(87.20%) tps: 1,575 tflops: 144.10 mfu: 46.19%
308
+ [titan] 2026-01-02 12:35:13,509 - root - INFO - lr: 2.7344e-06 gnorm: 117.00 [ 0:11:02<39 days, 5:32:53]
309
+ [titan] 2026-01-02 12:35:55,155 - root - INFO - step: 7 loss: 13.8167 memory: 69.11GiB(87.20%) tps: 1,574 tflops: 143.98 mfu: 46.15%
310
+ [titan] 2026-01-02 12:35:55,155 - root - INFO - lr: 3.1250e-06 gnorm: 113.00 [ 0:11:43<35 days, 17:46:17]
311
+ [titan] 2026-01-02 12:36:36,792 - root - INFO - step: 8 loss: 13.5683 memory: 69.11GiB(87.20%) tps: 1,574 tflops: 144.01 mfu: 46.16%
312
+ [titan] 2026-01-02 12:36:36,792 - root - INFO - lr: 3.5156e-06 gnorm: 106.50 [ 0:12:25<33 days, 2:55:35]
313
+ [titan] 2026-01-02 12:37:18,455 - root - INFO - step: 9 loss: 13.3760 memory: 69.11GiB(87.20%) tps: 1,573 tflops: 143.92 mfu: 46.13%
314
+ [titan] 2026-01-02 12:37:18,455 - root - INFO - lr: 3.9063e-06 gnorm: 101.00 [ 0:13:07<31 days, 2:04:10]
315
+ [titan] 2026-01-02 12:38:00,109 - root - INFO - step: 10 loss: 13.1097 memory: 69.11GiB(87.20%) tps: 1,573 tflops: 143.95 mfu: 46.14%
316
+ [titan] 2026-01-02 12:38:00,110 - root - INFO - lr: 4.2969e-06 gnorm: 94.50 [ 0:13:48<29 days, 10:58:25]
317
+ [titan] 2026-01-02 12:38:41,790 - root - INFO - step: 11 loss: 12.5536 memory: 69.11GiB(87.20%) tps: 1,572 tflops: 143.86 mfu: 46.11%
318
+ [titan] 2026-01-02 12:38:41,790 - root - INFO - lr: 4.6875e-06 gnorm: 82.00 [ 0:14:30<28 days, 3:00:16]
319
+ [titan] 2026-01-02 12:39:23,449 - root - INFO - step: 12 loss: 12.0247 memory: 69.11GiB(87.20%) tps: 1,573 tflops: 143.93 mfu: 46.13%
320
+ [titan] 2026-01-02 12:39:23,449 - root - INFO - lr: 5.0781e-06 gnorm: 71.50 [ 0:15:12<27 days, 0:20:45]
321
+ [titan] 2026-01-02 12:40:05,132 - root - INFO - step: 13 loss: 11.6076 memory: 69.11GiB(87.20%) tps: 1,572 tflops: 143.85 mfu: 46.11%
322
+ [titan] 2026-01-02 12:40:05,132 - root - INFO - lr: 5.4687e-06 gnorm: 68.50 [ 0:15:53<26 days, 1:48:11]
323
+ [titan] 2026-01-02 12:40:46,797 - root - INFO - step: 14 loss: 11.2488 memory: 69.11GiB(87.20%) tps: 1,573 tflops: 143.91 mfu: 46.13%
324
+ [titan] 2026-01-02 12:40:46,797 - root - INFO - lr: 5.8594e-06 gnorm: 63.75 [ 0:16:35<25 days, 6:28:05]
325
+ [titan] 2026-01-02 12:41:28,477 - root - INFO - step: 15 loss: 10.9254 memory: 69.11GiB(87.20%) tps: 1,572 tflops: 143.86 mfu: 46.11%
326
+ [titan] 2026-01-02 12:41:28,477 - root - INFO - lr: 6.2500e-06 gnorm: 55.50 [ 0:17:17<24 days, 13:43:04]
327
+ [titan] 2026-01-02 12:42:10,134 - root - INFO - step: 16 loss: 10.6961 memory: 69.11GiB(87.20%) tps: 1,573 tflops: 143.94 mfu: 46.13%
328
+ [titan] 2026-01-02 12:42:10,134 - root - INFO - lr: 6.6406e-06 gnorm: 56.50 [ 0:17:58<23 days, 23:02:52]
329
+ [titan] 2026-01-02 12:42:51,815 - root - INFO - step: 17 loss: 10.3915 memory: 69.11GiB(87.20%) tps: 1,572 tflops: 143.85 mfu: 46.11%
330
+ [titan] 2026-01-02 12:42:51,816 - root - INFO - lr: 7.0313e-06 gnorm: 42.75 [ 0:18:40<23 days, 10:06:52]
331
+ [titan] 2026-01-02 12:43:33,461 - root - INFO - step: 18 loss: 10.1740 memory: 69.11GiB(87.20%) tps: 1,574 tflops: 143.98 mfu: 46.15%
332
+ [titan] 2026-01-02 12:43:33,461 - root - INFO - lr: 7.4219e-06 gnorm: 32.75 [ 0:19:22<22 days, 22:36:00]
logs/none_rci5peh0/attempt_0/2/stdout.log ADDED
File without changes
logs/none_rci5peh0/attempt_0/3/stderr.log ADDED
@@ -0,0 +1,332 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [titan] 2026-01-02 12:21:12,073 - root - INFO - Starting job: default job
2
+ [titan] 2026-01-02 12:21:12,073 - root - INFO - {
3
+ "activation_checkpoint": {
4
+ "mode": "none",
5
+ "selective_ac_option": "2"
6
+ },
7
+ "activation_offload": {
8
+ "mode": "none"
9
+ },
10
+ "checkpoint": {
11
+ "async_mode": "disabled",
12
+ "create_seed_checkpoint": false,
13
+ "enable_checkpoint": true,
14
+ "exclude_from_loading": [],
15
+ "export_dtype": "float32",
16
+ "folder": "checkpoint",
17
+ "initial_load_model_weights_only": true,
18
+ "initial_load_path": null,
19
+ "interval": 3072,
20
+ "interval_type": "steps",
21
+ "keep_latest_k": 0,
22
+ "last_save_model_weights_only": false,
23
+ "load_step": -1,
24
+ "model_weights_only": false
25
+ },
26
+ "comm": {
27
+ "init_timeout_seconds": 300,
28
+ "trace_buf_size": 20000,
29
+ "train_timeout_seconds": 100
30
+ },
31
+ "experimental": {
32
+ "context_parallel_degree": 1,
33
+ "context_parallel_rotate_method": "allgather",
34
+ "custom_model_path": "",
35
+ "enable_async_tensor_parallel": false,
36
+ "enable_compiled_autograd": false,
37
+ "pipeline_parallel_degree": 1,
38
+ "pipeline_parallel_microbatches": null,
39
+ "pipeline_parallel_schedule": "1F1B",
40
+ "pipeline_parallel_schedule_csv": "",
41
+ "pipeline_parallel_split_points": []
42
+ },
43
+ "fault_tolerance": {
44
+ "enable": false,
45
+ "group_size": 0,
46
+ "min_replica_size": 1,
47
+ "replica_id": 0
48
+ },
49
+ "float8": {
50
+ "enable_fsdp_float8_all_gather": false,
51
+ "force_recompute_fp8_weight_in_bwd": false,
52
+ "precompute_float8_dynamic_scale_for_fsdp": false,
53
+ "recipe_name": null
54
+ },
55
+ "job": {
56
+ "config_file": "flame/models/fla.toml",
57
+ "description": "default job",
58
+ "dump_folder": "/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B",
59
+ "print_args": true,
60
+ "use_for_integration_test": false
61
+ },
62
+ "lr_scheduler": {
63
+ "decay_ratio": null,
64
+ "decay_type": "cosine",
65
+ "lr_min": 0.1,
66
+ "warmup_steps": 1024
67
+ },
68
+ "memory_estimation": {
69
+ "disable_fake_mode": false,
70
+ "enabled": false
71
+ },
72
+ "metrics": {
73
+ "disable_color_printing": false,
74
+ "enable_tensorboard": false,
75
+ "enable_wandb": true,
76
+ "log_freq": 1,
77
+ "save_for_all_ranks": false,
78
+ "save_tb_folder": "tb"
79
+ },
80
+ "model": {
81
+ "config": "/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/config.json",
82
+ "converters": [],
83
+ "name": "fla",
84
+ "print_after_conversion": false,
85
+ "tokenizer_path": "/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B"
86
+ },
87
+ "optimizer": {
88
+ "beta1": 0.9,
89
+ "beta2": 0.95,
90
+ "early_step_in_backward": false,
91
+ "eps": 1e-15,
92
+ "implementation": "fused",
93
+ "lr": 0.0004,
94
+ "name": "AdamW",
95
+ "weight_decay": 0.1
96
+ },
97
+ "profiling": {
98
+ "enable_memory_snapshot": false,
99
+ "enable_profiling": true,
100
+ "profile_freq": 512,
101
+ "save_memory_snapshot_folder": "memory_snapshot",
102
+ "save_traces_folder": "profile_trace"
103
+ },
104
+ "training": {
105
+ "batch_size": 2,
106
+ "compile": true,
107
+ "context_len": 2048,
108
+ "data_dir": null,
109
+ "data_files": null,
110
+ "data_parallel_replicate_degree": 1,
111
+ "data_parallel_shard_degree": 8,
112
+ "data_probs": null,
113
+ "dataset": "/mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu",
114
+ "dataset_name": "default",
115
+ "dataset_split": "train",
116
+ "deterministic": false,
117
+ "disable_loss_parallel": true,
118
+ "enable_cpu_offload": false,
119
+ "fsdp_reshard_after_forward": "default",
120
+ "gc_freq": 50,
121
+ "gradient_accumulation_steps": 16,
122
+ "max_norm": 1.0,
123
+ "mixed_precision_param": "bfloat16",
124
+ "mixed_precision_reduce": "float32",
125
+ "num_workers": 8,
126
+ "persistent_workers": false,
127
+ "pin_memory": false,
128
+ "prefetch_factor": 2,
129
+ "seed": 42,
130
+ "seq_len": 2048,
131
+ "skip_nan_inf": true,
132
+ "steps": 30720,
133
+ "streaming": true,
134
+ "tensor_parallel_degree": 1,
135
+ "varlen": false
136
+ }
137
+ }
138
+ [titan] 2026-01-02 12:21:12,073 - root - INFO - [GC] Initial GC collection. 0.00 seconds.
139
+ [titan] 2026-01-02 12:21:13,356 - root - WARNING - ENV[TORCH_NCCL_ASYNC_ERROR_HANDLING] = 1 will be overridden to 3 based on job config
140
+ [titan] 2026-01-02 12:21:13,358 - root - INFO - CUDA capacity: NVIDIA A100-SXM4-80GB with 79.25GiB memory
141
+ [titan] 2026-01-02 12:21:13,360 - root - WARNING - Error running lspci: [Errno 2] No such file or directory: 'lspci', fallback to use device_name
142
+ [titan] 2026-01-02 12:21:13,360 - root - INFO - Peak FLOPS used for computing MFU: 3.120e+14
143
+ [titan] 2026-01-02 12:21:13,360 - root - INFO - Building 1-D device mesh with ['dp_shard'], [8]
144
+ [titan] 2026-01-02 12:21:13,419 - root - INFO - Loading tokenizer...
145
+ The tokenizer you are loading from '/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.
146
+ [titan] 2026-01-02 12:21:13,765 - root - INFO - Qwen2TokenizerFast(name_or_path='/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B', vocab_size=151643, model_max_length=10000000000, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'eos_token': '<|im_end|>', 'pad_token': '<|endoftext|>', 'additional_special_tokens': ['<|im_start|>', '<|im_end|>', '<|object_ref_start|>', '<|object_ref_end|>', '<|box_start|>', '<|box_end|>', '<|quad_start|>', '<|quad_end|>', '<|vision_start|>', '<|vision_end|>', '<|vision_pad|>', '<|image_pad|>', '<|video_pad|>']}, clean_up_tokenization_spaces=False, added_tokens_decoder={
147
+ 151643: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
148
+ 151644: AddedToken("<|im_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
149
+ 151645: AddedToken("<|im_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
150
+ 151646: AddedToken("<|object_ref_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
151
+ 151647: AddedToken("<|object_ref_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
152
+ 151648: AddedToken("<|box_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
153
+ 151649: AddedToken("<|box_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
154
+ 151650: AddedToken("<|quad_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
155
+ 151651: AddedToken("<|quad_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
156
+ 151652: AddedToken("<|vision_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
157
+ 151653: AddedToken("<|vision_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
158
+ 151654: AddedToken("<|vision_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
159
+ 151655: AddedToken("<|image_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
160
+ 151656: AddedToken("<|video_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
161
+ 151657: AddedToken("<tool_call>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
162
+ 151658: AddedToken("</tool_call>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
163
+ 151659: AddedToken("<|fim_prefix|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
164
+ 151660: AddedToken("<|fim_middle|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
165
+ 151661: AddedToken("<|fim_suffix|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
166
+ 151662: AddedToken("<|fim_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
167
+ 151663: AddedToken("<|repo_name|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
168
+ 151664: AddedToken("<|file_sep|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
169
+ 151665: AddedToken("<tool_response>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
170
+ 151666: AddedToken("</tool_response>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
171
+ 151667: AddedToken("<think>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
172
+ 151668: AddedToken("</think>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
173
+ }
174
+ )
175
+ [titan] 2026-01-02 12:21:13,765 - root - INFO - Loading dataset /mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu:default
176
+ `trust_remote_code` is not supported anymore.
177
+ Please check that the Hugging Face dataset '/mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu' isn't based on a loading script and remove `trust_remote_code`.
178
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
179
+ [titan] 2026-01-02 12:21:13,765 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
180
+ Please check that the Hugging Face dataset '/mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu' isn't based on a loading script and remove `trust_remote_code`.
181
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
182
+ [titan] 2026-01-02 12:21:14,322 - root - INFO - Shuffling the dataset with seed 42
183
+ [titan] 2026-01-02 12:21:14,323 - root - INFO - IterableDataset({
184
+ features: ['text', 'id', 'dump', 'url', 'file_path', 'language', 'language_score', 'token_count', 'score', 'int_score'],
185
+ num_shards: 360
186
+ })
187
+ [titan] 2026-01-02 12:21:14,323 - root - INFO - Building dataloader...
188
+ [titan] 2026-01-02 12:21:14,325 - root - INFO - Loading model config from /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/config.json
189
+ [titan] 2026-01-02 12:21:14,326 - root - INFO - Building model from the config
190
+ GSAConfig {
191
+ "architectures": [
192
+ "GSAForCausalLM"
193
+ ],
194
+ "attn": null,
195
+ "bos_token_id": 151643,
196
+ "clamp_max": null,
197
+ "clamp_min": null,
198
+ "conv_size": 4,
199
+ "dtype": "bfloat16",
200
+ "elementwise_affine": false,
201
+ "eos_token_id": 151645,
202
+ "expand_k": 1,
203
+ "expand_v": 1,
204
+ "feature_map": "swish",
205
+ "fuse_cross_entropy": true,
206
+ "fuse_linear_cross_entropy": false,
207
+ "fuse_norm": true,
208
+ "fuse_swiglu": true,
209
+ "gate_logit_normalizer": 8,
210
+ "hidden_act": "swish",
211
+ "hidden_ratio": 4,
212
+ "hidden_size": 5120,
213
+ "initializer_range": 0.02,
214
+ "intermediate_size": 17408,
215
+ "max_position_embeddings": 40960,
216
+ "model_type": "gsa",
217
+ "norm_eps": 1e-06,
218
+ "num_heads": 40,
219
+ "num_hidden_layers": 40,
220
+ "num_kv_heads": 8,
221
+ "num_slots": 256,
222
+ "rope_theta": 1000000,
223
+ "share_conv_kernel": true,
224
+ "tie_word_embeddings": true,
225
+ "transformers_version": "4.57.3",
226
+ "use_cache": true,
227
+ "use_l2warp": false,
228
+ "use_norm": true,
229
+ "use_output_gate": true,
230
+ "use_rope": false,
231
+ "use_short_conv": false,
232
+ "vocab_size": 151936
233
+ }
234
+ 
235
+ [titan] 2026-01-02 12:21:14,479 - root - INFO - 
236
+ GSAForCausalLM(
237
+ (model): GSAModel(
238
+ (embeddings): Embedding(151936, 5120)
239
+ (layers): ModuleList(
240
+ (0-39): 40 x GSABlock(
241
+ (attn_norm): RMSNorm(5120, eps=1e-06)
242
+ (attn): GatedSlotAttention(
243
+ (feature_map): SwishFeatureMap()
244
+ (q_proj): Linear(in_features=5120, out_features=5120, bias=False)
245
+ (k_proj): Linear(in_features=5120, out_features=1024, bias=False)
246
+ (v_proj): Linear(in_features=5120, out_features=1024, bias=False)
247
+ (f_proj): Linear(in_features=5120, out_features=2048, bias=False)
248
+ (g_norm): RMSNorm(5120, elementwise_affine=False, eps=1e-06)
249
+ (o_proj): Linear(in_features=5120, out_features=5120, bias=False)
250
+ )
251
+ (mlp_norm): RMSNorm(5120, eps=1e-06)
252
+ (mlp): GatedMLP(
253
+ (gate_proj): Linear(in_features=5120, out_features=17408, bias=False)
254
+ (up_proj): Linear(in_features=5120, out_features=17408, bias=False)
255
+ (down_proj): Linear(in_features=17408, out_features=5120, bias=False)
256
+ (swiglu_linear): SwiGLULinear()
257
+ )
258
+ )
259
+ )
260
+ (norm): RMSNorm(5120, eps=1e-06)
261
+ )
262
+ (lm_head): Linear(in_features=5120, out_features=151936, bias=False)
263
+ )
264
+
265
+ [titan] 2026-01-02 12:21:14,535 - root - INFO - Compiling each block with torch.compile
266
+ [titan] 2026-01-02 12:21:14,535 - root - INFO - Compiling the embedding, norm, and lm_head layers with torch.compile
267
+ [titan] 2026-01-02 12:21:14,536 - root - INFO - Compiling the entire model with torch.compile
268
+ [titan] 2026-01-02 12:21:14,677 - root - INFO - Applied FSDP to the model
269
+ [titan] 2026-01-02 12:21:15,135 - root - INFO - CUDA memory usage for model: 3.56GiB(4.49%)
270
+ [titan] 2026-01-02 12:21:15,181 - root - INFO - Checkpointing active. Checkpoints will be loaded from and saved to /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/checkpoint
271
+ [titan] 2026-01-02 12:21:15,182 - root - INFO - Loading the checkpoint from /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/checkpoint/step-0.
272
+ [titan] 2026-01-02 12:24:11,330 - root - INFO - [GC] GC collection for checkpoint loading. 0.01 seconds.
273
+ [titan] 2026-01-02 12:24:11,330 - root - INFO - Finished loading the checkpoint in 176.15 seconds.
274
+ [titan] 2026-01-02 12:24:11,348 - root - INFO - CUDA capacity: NVIDIA A100-SXM4-80GB with 79.25GiB memory
275
+ [titan] 2026-01-02 12:24:11,351 - root - WARNING - Error running lspci: [Errno 2] No such file or directory: 'lspci', fallback to use device_name
276
+ [titan] 2026-01-02 12:24:11,564 - root - INFO - Mixed precision training is handled by fully_shard
277
+ [titan] 2026-01-02 12:24:11,565 - root - INFO - ***** Running training *****
278
+ [titan] 2026-01-02 12:24:11,565 - root - INFO -  Training starts at step 1
279
+ [titan] 2026-01-02 12:24:11,565 - root - INFO -  Number of tokens per sequence = 2,048
280
+ [titan] 2026-01-02 12:24:11,565 - root - INFO -  Gradient Accumulation steps = 16
281
+ [titan] 2026-01-02 12:24:11,565 - root - INFO -  Instantaneous batch size (per device) = 2
282
+ [titan] 2026-01-02 12:24:11,565 - root - INFO -  Global batch size (w. parallel, distributed & accumulation) = 256 (524,288 tokens)
283
+ [titan] 2026-01-02 12:24:11,565 - root - INFO -  Total optimization steps = 30,720 (16,106,127,360 tokens)
284
+ [titan] 2026-01-02 12:24:11,565 - root - INFO -  Warmup steps = 1,024 (536,870,912 tokens)
285
+ [titan] 2026-01-02 12:24:11,565 - root - INFO -  Number of parameters = 14,409,815,040 
286
+ [titan] 2026-01-02 12:24:11,565 - root - INFO - Profiling active. Traces will be saved at /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/profile_trace
287
+ /mnt/scratch/home/carlos/leman-labs/flame/.venv/lib/python3.13/site-packages/torch/_dynamo/variables/functions.py:1692: UserWarning: Dynamo detected a call to a `functools.lru_cache`-wrapped function. Dynamo ignores the cache wrapper and directly traces the wrapped function. Silent incorrectness is only a *potential* risk, not something we have observed. Enable TORCH_LOGS="+dynamo" for a DEBUG stack trace.
288
+ torch._dynamo.utils.warn_once(msg)
289
+ /mnt/scratch/home/carlos/leman-labs/flame/.venv/lib/python3.13/site-packages/torch/_dynamo/variables/functions.py:1598: UserWarning: Dynamo does not know how to trace the builtin `cuda_utils.get_device_properties.` This function is either a Python builtin (e.g. _warnings.warn) or a third-party C/C++ Python extension (perhaps created with pybind).
290
+ If it is a Python builtin, please file an issue on GitHub so the PyTorch team can add support for it and see the next case for a workaround.
291
+ If it is a third-party C/C++ Python extension, please either wrap it into a PyTorch-understood custom operator (see https://pytorch.org/tutorials/advanced/custom_ops_landing_page.html for more details) or, if it is traceable, use `torch.compiler.allow_in_graph`.
292
+ torch._dynamo.utils.warn_once(explanation + "\n" + "\n".join(hints))
293
+ [titan] 2026-01-02 12:31:16,003 - root - INFO - step: 1 loss: 14.3857 memory: 65.22GiB(82.29%) tps: 154 tflops: 14.12 mfu: 4.53%
294
+ [titan] 2026-01-02 12:31:16,003 - root - INFO - lr: 7.8125e-07 gnorm: 129.00 [ 0:07:04<150 days, 23:34:11]
295
+ [titan] 2026-01-02 12:31:16,003 - root - INFO - Saving the checkpoint (or staging if async is enabled).
296
+ [titan] 2026-01-02 12:31:45,805 - root - INFO - [GC] GC collection invoked by checkpointer. 0.42 seconds.
297
+ [titan] 2026-01-02 12:31:45,806 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 29.80 seconds.
298
+ [titan] 2026-01-02 12:31:45,806 - root - INFO - Synchronizing and adjusting timeout for all ProcessGroups to 0:01:40
299
+ [titan] 2026-01-02 12:32:27,288 - root - INFO - step: 2 loss: 14.3989 memory: 69.11GiB(87.20%) tps: 919 tflops: 84.11 mfu: 26.96%
300
+ [titan] 2026-01-02 12:32:27,288 - root - INFO - lr: 1.1719e-06 gnorm: 127.00 [ 0:08:15<88 days, 3:51:11]
301
+ [titan] 2026-01-02 12:33:08,758 - root - INFO - step: 3 loss: 14.3929 memory: 69.11GiB(87.20%) tps: 1,580 tflops: 144.59 mfu: 46.34%
302
+ [titan] 2026-01-02 12:33:08,759 - root - INFO - lr: 1.5625e-06 gnorm: 126.00 [ 0:08:57<63 days, 16:28:17]
303
+ [titan] 2026-01-02 12:33:50,300 - root - INFO - step: 4 loss: 14.2932 memory: 69.11GiB(87.20%) tps: 1,578 tflops: 144.34 mfu: 46.26%
304
+ [titan] 2026-01-02 12:33:50,300 - root - INFO - lr: 1.9531e-06 gnorm: 128.00 [ 0:09:38<51 days, 10:55:33]
305
+ [titan] 2026-01-02 12:34:31,898 - root - INFO - step: 5 loss: 14.2689 memory: 69.11GiB(87.20%) tps: 1,575 tflops: 144.14 mfu: 46.20%
306
+ [titan] 2026-01-02 12:34:31,898 - root - INFO - lr: 2.3438e-06 gnorm: 124.00 [ 0:10:20<44 days, 2:53:23]
307
+ [titan] 2026-01-02 12:35:13,509 - root - INFO - step: 6 loss: 13.9979 memory: 69.11GiB(87.20%) tps: 1,575 tflops: 144.10 mfu: 46.19%
308
+ [titan] 2026-01-02 12:35:13,509 - root - INFO - lr: 2.7344e-06 gnorm: 117.00 [ 0:11:02<39 days, 5:32:52]
309
+ [titan] 2026-01-02 12:35:55,155 - root - INFO - step: 7 loss: 13.8167 memory: 69.11GiB(87.20%) tps: 1,574 tflops: 143.98 mfu: 46.15%
310
+ [titan] 2026-01-02 12:35:55,155 - root - INFO - lr: 3.1250e-06 gnorm: 113.00 [ 0:11:43<35 days, 17:46:15]
311
+ [titan] 2026-01-02 12:36:36,792 - root - INFO - step: 8 loss: 13.5683 memory: 69.11GiB(87.20%) tps: 1,574 tflops: 144.01 mfu: 46.16%
312
+ [titan] 2026-01-02 12:36:36,792 - root - INFO - lr: 3.5156e-06 gnorm: 106.50 [ 0:12:25<33 days, 2:55:33]
313
+ [titan] 2026-01-02 12:37:18,455 - root - INFO - step: 9 loss: 13.3760 memory: 69.11GiB(87.20%) tps: 1,573 tflops: 143.92 mfu: 46.13%
314
+ [titan] 2026-01-02 12:37:18,456 - root - INFO - lr: 3.9063e-06 gnorm: 101.00 [ 0:13:07<31 days, 2:04:08]
315
+ [titan] 2026-01-02 12:38:00,109 - root - INFO - step: 10 loss: 13.1097 memory: 69.11GiB(87.20%) tps: 1,573 tflops: 143.95 mfu: 46.14%
316
+ [titan] 2026-01-02 12:38:00,110 - root - INFO - lr: 4.2969e-06 gnorm: 94.50 [ 0:13:48<29 days, 10:58:23]
317
+ [titan] 2026-01-02 12:38:41,790 - root - INFO - step: 11 loss: 12.5536 memory: 69.11GiB(87.20%) tps: 1,572 tflops: 143.86 mfu: 46.11%
318
+ [titan] 2026-01-02 12:38:41,790 - root - INFO - lr: 4.6875e-06 gnorm: 82.00 [ 0:14:30<28 days, 3:00:13]
319
+ [titan] 2026-01-02 12:39:23,448 - root - INFO - step: 12 loss: 12.0247 memory: 69.11GiB(87.20%) tps: 1,573 tflops: 143.93 mfu: 46.13%
320
+ [titan] 2026-01-02 12:39:23,449 - root - INFO - lr: 5.0781e-06 gnorm: 71.50 [ 0:15:12<27 days, 0:20:43]
321
+ [titan] 2026-01-02 12:40:05,132 - root - INFO - step: 13 loss: 11.6076 memory: 69.11GiB(87.20%) tps: 1,572 tflops: 143.85 mfu: 46.10%
322
+ [titan] 2026-01-02 12:40:05,132 - root - INFO - lr: 5.4687e-06 gnorm: 68.50 [ 0:15:53<26 days, 1:48:10]
323
+ [titan] 2026-01-02 12:40:46,797 - root - INFO - step: 14 loss: 11.2488 memory: 69.11GiB(87.20%) tps: 1,573 tflops: 143.91 mfu: 46.13%
324
+ [titan] 2026-01-02 12:40:46,797 - root - INFO - lr: 5.8594e-06 gnorm: 63.75 [ 0:16:35<25 days, 6:28:03]
325
+ [titan] 2026-01-02 12:41:28,477 - root - INFO - step: 15 loss: 10.9254 memory: 69.11GiB(87.20%) tps: 1,572 tflops: 143.86 mfu: 46.11%
326
+ [titan] 2026-01-02 12:41:28,477 - root - INFO - lr: 6.2500e-06 gnorm: 55.50 [ 0:17:17<24 days, 13:43:02]
327
+ [titan] 2026-01-02 12:42:10,134 - root - INFO - step: 16 loss: 10.6961 memory: 69.11GiB(87.20%) tps: 1,573 tflops: 143.94 mfu: 46.13%
328
+ [titan] 2026-01-02 12:42:10,134 - root - INFO - lr: 6.6406e-06 gnorm: 56.50 [ 0:17:58<23 days, 23:02:50]
329
+ [titan] 2026-01-02 12:42:51,815 - root - INFO - step: 17 loss: 10.3915 memory: 69.11GiB(87.20%) tps: 1,572 tflops: 143.86 mfu: 46.11%
330
+ [titan] 2026-01-02 12:42:51,815 - root - INFO - lr: 7.0313e-06 gnorm: 42.75 [ 0:18:40<23 days, 10:06:50]
331
+ [titan] 2026-01-02 12:43:33,461 - root - INFO - step: 18 loss: 10.1740 memory: 69.11GiB(87.20%) tps: 1,574 tflops: 143.98 mfu: 46.15%
332
+ [titan] 2026-01-02 12:43:33,461 - root - INFO - lr: 7.4219e-06 gnorm: 32.75 [ 0:19:22<22 days, 22:35:58]
logs/none_rci5peh0/attempt_0/3/stdout.log ADDED
File without changes
logs/none_rci5peh0/attempt_0/4/stderr.log ADDED
@@ -0,0 +1,332 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [titan] 2026-01-02 12:21:12,073 - root - INFO - Starting job: default job
2
+ [titan] 2026-01-02 12:21:12,073 - root - INFO - {
3
+ "activation_checkpoint": {
4
+ "mode": "none",
5
+ "selective_ac_option": "2"
6
+ },
7
+ "activation_offload": {
8
+ "mode": "none"
9
+ },
10
+ "checkpoint": {
11
+ "async_mode": "disabled",
12
+ "create_seed_checkpoint": false,
13
+ "enable_checkpoint": true,
14
+ "exclude_from_loading": [],
15
+ "export_dtype": "float32",
16
+ "folder": "checkpoint",
17
+ "initial_load_model_weights_only": true,
18
+ "initial_load_path": null,
19
+ "interval": 3072,
20
+ "interval_type": "steps",
21
+ "keep_latest_k": 0,
22
+ "last_save_model_weights_only": false,
23
+ "load_step": -1,
24
+ "model_weights_only": false
25
+ },
26
+ "comm": {
27
+ "init_timeout_seconds": 300,
28
+ "trace_buf_size": 20000,
29
+ "train_timeout_seconds": 100
30
+ },
31
+ "experimental": {
32
+ "context_parallel_degree": 1,
33
+ "context_parallel_rotate_method": "allgather",
34
+ "custom_model_path": "",
35
+ "enable_async_tensor_parallel": false,
36
+ "enable_compiled_autograd": false,
37
+ "pipeline_parallel_degree": 1,
38
+ "pipeline_parallel_microbatches": null,
39
+ "pipeline_parallel_schedule": "1F1B",
40
+ "pipeline_parallel_schedule_csv": "",
41
+ "pipeline_parallel_split_points": []
42
+ },
43
+ "fault_tolerance": {
44
+ "enable": false,
45
+ "group_size": 0,
46
+ "min_replica_size": 1,
47
+ "replica_id": 0
48
+ },
49
+ "float8": {
50
+ "enable_fsdp_float8_all_gather": false,
51
+ "force_recompute_fp8_weight_in_bwd": false,
52
+ "precompute_float8_dynamic_scale_for_fsdp": false,
53
+ "recipe_name": null
54
+ },
55
+ "job": {
56
+ "config_file": "flame/models/fla.toml",
57
+ "description": "default job",
58
+ "dump_folder": "/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B",
59
+ "print_args": true,
60
+ "use_for_integration_test": false
61
+ },
62
+ "lr_scheduler": {
63
+ "decay_ratio": null,
64
+ "decay_type": "cosine",
65
+ "lr_min": 0.1,
66
+ "warmup_steps": 1024
67
+ },
68
+ "memory_estimation": {
69
+ "disable_fake_mode": false,
70
+ "enabled": false
71
+ },
72
+ "metrics": {
73
+ "disable_color_printing": false,
74
+ "enable_tensorboard": false,
75
+ "enable_wandb": true,
76
+ "log_freq": 1,
77
+ "save_for_all_ranks": false,
78
+ "save_tb_folder": "tb"
79
+ },
80
+ "model": {
81
+ "config": "/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/config.json",
82
+ "converters": [],
83
+ "name": "fla",
84
+ "print_after_conversion": false,
85
+ "tokenizer_path": "/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B"
86
+ },
87
+ "optimizer": {
88
+ "beta1": 0.9,
89
+ "beta2": 0.95,
90
+ "early_step_in_backward": false,
91
+ "eps": 1e-15,
92
+ "implementation": "fused",
93
+ "lr": 0.0004,
94
+ "name": "AdamW",
95
+ "weight_decay": 0.1
96
+ },
97
+ "profiling": {
98
+ "enable_memory_snapshot": false,
99
+ "enable_profiling": true,
100
+ "profile_freq": 512,
101
+ "save_memory_snapshot_folder": "memory_snapshot",
102
+ "save_traces_folder": "profile_trace"
103
+ },
104
+ "training": {
105
+ "batch_size": 2,
106
+ "compile": true,
107
+ "context_len": 2048,
108
+ "data_dir": null,
109
+ "data_files": null,
110
+ "data_parallel_replicate_degree": 1,
111
+ "data_parallel_shard_degree": 8,
112
+ "data_probs": null,
113
+ "dataset": "/mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu",
114
+ "dataset_name": "default",
115
+ "dataset_split": "train",
116
+ "deterministic": false,
117
+ "disable_loss_parallel": true,
118
+ "enable_cpu_offload": false,
119
+ "fsdp_reshard_after_forward": "default",
120
+ "gc_freq": 50,
121
+ "gradient_accumulation_steps": 16,
122
+ "max_norm": 1.0,
123
+ "mixed_precision_param": "bfloat16",
124
+ "mixed_precision_reduce": "float32",
125
+ "num_workers": 8,
126
+ "persistent_workers": false,
127
+ "pin_memory": false,
128
+ "prefetch_factor": 2,
129
+ "seed": 42,
130
+ "seq_len": 2048,
131
+ "skip_nan_inf": true,
132
+ "steps": 30720,
133
+ "streaming": true,
134
+ "tensor_parallel_degree": 1,
135
+ "varlen": false
136
+ }
137
+ }
138
+ [titan] 2026-01-02 12:21:12,074 - root - INFO - [GC] Initial GC collection. 0.00 seconds.
139
+ [titan] 2026-01-02 12:21:13,345 - root - WARNING - ENV[TORCH_NCCL_ASYNC_ERROR_HANDLING] = 1 will be overridden to 3 based on job config
140
+ [titan] 2026-01-02 12:21:13,353 - root - INFO - CUDA capacity: NVIDIA A100-SXM4-80GB with 79.25GiB memory
141
+ [titan] 2026-01-02 12:21:13,355 - root - WARNING - Error running lspci: [Errno 2] No such file or directory: 'lspci', fallback to use device_name
142
+ [titan] 2026-01-02 12:21:13,355 - root - INFO - Peak FLOPS used for computing MFU: 3.120e+14
143
+ [titan] 2026-01-02 12:21:13,355 - root - INFO - Building 1-D device mesh with ['dp_shard'], [8]
144
+ [titan] 2026-01-02 12:21:13,419 - root - INFO - Loading tokenizer...
145
+ The tokenizer you are loading from '/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.
146
+ [titan] 2026-01-02 12:21:13,768 - root - INFO - Qwen2TokenizerFast(name_or_path='/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B', vocab_size=151643, model_max_length=10000000000, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'eos_token': '<|im_end|>', 'pad_token': '<|endoftext|>', 'additional_special_tokens': ['<|im_start|>', '<|im_end|>', '<|object_ref_start|>', '<|object_ref_end|>', '<|box_start|>', '<|box_end|>', '<|quad_start|>', '<|quad_end|>', '<|vision_start|>', '<|vision_end|>', '<|vision_pad|>', '<|image_pad|>', '<|video_pad|>']}, clean_up_tokenization_spaces=False, added_tokens_decoder={
147
+ 151643: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
148
+ 151644: AddedToken("<|im_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
149
+ 151645: AddedToken("<|im_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
150
+ 151646: AddedToken("<|object_ref_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
151
+ 151647: AddedToken("<|object_ref_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
152
+ 151648: AddedToken("<|box_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
153
+ 151649: AddedToken("<|box_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
154
+ 151650: AddedToken("<|quad_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
155
+ 151651: AddedToken("<|quad_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
156
+ 151652: AddedToken("<|vision_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
157
+ 151653: AddedToken("<|vision_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
158
+ 151654: AddedToken("<|vision_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
159
+ 151655: AddedToken("<|image_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
160
+ 151656: AddedToken("<|video_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
161
+ 151657: AddedToken("<tool_call>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
162
+ 151658: AddedToken("</tool_call>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
163
+ 151659: AddedToken("<|fim_prefix|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
164
+ 151660: AddedToken("<|fim_middle|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
165
+ 151661: AddedToken("<|fim_suffix|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
166
+ 151662: AddedToken("<|fim_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
167
+ 151663: AddedToken("<|repo_name|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
168
+ 151664: AddedToken("<|file_sep|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
169
+ 151665: AddedToken("<tool_response>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
170
+ 151666: AddedToken("</tool_response>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
171
+ 151667: AddedToken("<think>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
172
+ 151668: AddedToken("</think>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
173
+ }
174
+ )
175
+ [titan] 2026-01-02 12:21:13,768 - root - INFO - Loading dataset /mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu:default
176
+ `trust_remote_code` is not supported anymore.
177
+ Please check that the Hugging Face dataset '/mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu' isn't based on a loading script and remove `trust_remote_code`.
178
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
179
+ [titan] 2026-01-02 12:21:13,768 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
180
+ Please check that the Hugging Face dataset '/mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu' isn't based on a loading script and remove `trust_remote_code`.
181
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
182
+ [titan] 2026-01-02 12:21:14,322 - root - INFO - Shuffling the dataset with seed 42
183
+ [titan] 2026-01-02 12:21:14,324 - root - INFO - IterableDataset({
184
+ features: ['text', 'id', 'dump', 'url', 'file_path', 'language', 'language_score', 'token_count', 'score', 'int_score'],
185
+ num_shards: 360
186
+ })
187
+ [titan] 2026-01-02 12:21:14,324 - root - INFO - Building dataloader...
188
+ [titan] 2026-01-02 12:21:14,325 - root - INFO - Loading model config from /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/config.json
189
+ [titan] 2026-01-02 12:21:14,327 - root - INFO - Building model from the config
190
+ GSAConfig {
191
+ "architectures": [
192
+ "GSAForCausalLM"
193
+ ],
194
+ "attn": null,
195
+ "bos_token_id": 151643,
196
+ "clamp_max": null,
197
+ "clamp_min": null,
198
+ "conv_size": 4,
199
+ "dtype": "bfloat16",
200
+ "elementwise_affine": false,
201
+ "eos_token_id": 151645,
202
+ "expand_k": 1,
203
+ "expand_v": 1,
204
+ "feature_map": "swish",
205
+ "fuse_cross_entropy": true,
206
+ "fuse_linear_cross_entropy": false,
207
+ "fuse_norm": true,
208
+ "fuse_swiglu": true,
209
+ "gate_logit_normalizer": 8,
210
+ "hidden_act": "swish",
211
+ "hidden_ratio": 4,
212
+ "hidden_size": 5120,
213
+ "initializer_range": 0.02,
214
+ "intermediate_size": 17408,
215
+ "max_position_embeddings": 40960,
216
+ "model_type": "gsa",
217
+ "norm_eps": 1e-06,
218
+ "num_heads": 40,
219
+ "num_hidden_layers": 40,
220
+ "num_kv_heads": 8,
221
+ "num_slots": 256,
222
+ "rope_theta": 1000000,
223
+ "share_conv_kernel": true,
224
+ "tie_word_embeddings": true,
225
+ "transformers_version": "4.57.3",
226
+ "use_cache": true,
227
+ "use_l2warp": false,
228
+ "use_norm": true,
229
+ "use_output_gate": true,
230
+ "use_rope": false,
231
+ "use_short_conv": false,
232
+ "vocab_size": 151936
233
+ }
234
+ 
235
+ [titan] 2026-01-02 12:21:14,478 - root - INFO - 
236
+ GSAForCausalLM(
237
+ (model): GSAModel(
238
+ (embeddings): Embedding(151936, 5120)
239
+ (layers): ModuleList(
240
+ (0-39): 40 x GSABlock(
241
+ (attn_norm): RMSNorm(5120, eps=1e-06)
242
+ (attn): GatedSlotAttention(
243
+ (feature_map): SwishFeatureMap()
244
+ (q_proj): Linear(in_features=5120, out_features=5120, bias=False)
245
+ (k_proj): Linear(in_features=5120, out_features=1024, bias=False)
246
+ (v_proj): Linear(in_features=5120, out_features=1024, bias=False)
247
+ (f_proj): Linear(in_features=5120, out_features=2048, bias=False)
248
+ (g_norm): RMSNorm(5120, elementwise_affine=False, eps=1e-06)
249
+ (o_proj): Linear(in_features=5120, out_features=5120, bias=False)
250
+ )
251
+ (mlp_norm): RMSNorm(5120, eps=1e-06)
252
+ (mlp): GatedMLP(
253
+ (gate_proj): Linear(in_features=5120, out_features=17408, bias=False)
254
+ (up_proj): Linear(in_features=5120, out_features=17408, bias=False)
255
+ (down_proj): Linear(in_features=17408, out_features=5120, bias=False)
256
+ (swiglu_linear): SwiGLULinear()
257
+ )
258
+ )
259
+ )
260
+ (norm): RMSNorm(5120, eps=1e-06)
261
+ )
262
+ (lm_head): Linear(in_features=5120, out_features=151936, bias=False)
263
+ )
264
+
265
+ [titan] 2026-01-02 12:21:14,535 - root - INFO - Compiling each block with torch.compile
266
+ [titan] 2026-01-02 12:21:14,535 - root - INFO - Compiling the embedding, norm, and lm_head layers with torch.compile
267
+ [titan] 2026-01-02 12:21:14,536 - root - INFO - Compiling the entire model with torch.compile
268
+ [titan] 2026-01-02 12:21:14,677 - root - INFO - Applied FSDP to the model
269
+ [titan] 2026-01-02 12:21:15,135 - root - INFO - CUDA memory usage for model: 3.56GiB(4.49%)
270
+ [titan] 2026-01-02 12:21:15,182 - root - INFO - Checkpointing active. Checkpoints will be loaded from and saved to /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/checkpoint
271
+ [titan] 2026-01-02 12:21:15,182 - root - INFO - Loading the checkpoint from /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/checkpoint/step-0.
272
+ [titan] 2026-01-02 12:24:11,331 - root - INFO - [GC] GC collection for checkpoint loading. 0.01 seconds.
273
+ [titan] 2026-01-02 12:24:11,331 - root - INFO - Finished loading the checkpoint in 176.15 seconds.
274
+ [titan] 2026-01-02 12:24:11,349 - root - INFO - CUDA capacity: NVIDIA A100-SXM4-80GB with 79.25GiB memory
275
+ [titan] 2026-01-02 12:24:11,351 - root - WARNING - Error running lspci: [Errno 2] No such file or directory: 'lspci', fallback to use device_name
276
+ [titan] 2026-01-02 12:24:11,560 - root - INFO - Mixed precision training is handled by fully_shard
277
+ [titan] 2026-01-02 12:24:11,561 - root - INFO - ***** Running training *****
278
+ [titan] 2026-01-02 12:24:11,561 - root - INFO -  Training starts at step 1
279
+ [titan] 2026-01-02 12:24:11,561 - root - INFO -  Number of tokens per sequence = 2,048
280
+ [titan] 2026-01-02 12:24:11,561 - root - INFO -  Gradient Accumulation steps = 16
281
+ [titan] 2026-01-02 12:24:11,561 - root - INFO -  Instantaneous batch size (per device) = 2
282
+ [titan] 2026-01-02 12:24:11,561 - root - INFO -  Global batch size (w. parallel, distributed & accumulation) = 256 (524,288 tokens)
283
+ [titan] 2026-01-02 12:24:11,561 - root - INFO -  Total optimization steps = 30,720 (16,106,127,360 tokens)
284
+ [titan] 2026-01-02 12:24:11,561 - root - INFO -  Warmup steps = 1,024 (536,870,912 tokens)
285
+ [titan] 2026-01-02 12:24:11,561 - root - INFO -  Number of parameters = 14,409,815,040 
286
+ [titan] 2026-01-02 12:24:11,562 - root - INFO - Profiling active. Traces will be saved at /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/profile_trace
287
+ /mnt/scratch/home/carlos/leman-labs/flame/.venv/lib/python3.13/site-packages/torch/_dynamo/variables/functions.py:1692: UserWarning: Dynamo detected a call to a `functools.lru_cache`-wrapped function. Dynamo ignores the cache wrapper and directly traces the wrapped function. Silent incorrectness is only a *potential* risk, not something we have observed. Enable TORCH_LOGS="+dynamo" for a DEBUG stack trace.
288
+ torch._dynamo.utils.warn_once(msg)
289
+ /mnt/scratch/home/carlos/leman-labs/flame/.venv/lib/python3.13/site-packages/torch/_dynamo/variables/functions.py:1598: UserWarning: Dynamo does not know how to trace the builtin `cuda_utils.get_device_properties.` This function is either a Python builtin (e.g. _warnings.warn) or a third-party C/C++ Python extension (perhaps created with pybind).
290
+ If it is a Python builtin, please file an issue on GitHub so the PyTorch team can add support for it and see the next case for a workaround.
291
+ If it is a third-party C/C++ Python extension, please either wrap it into a PyTorch-understood custom operator (see https://pytorch.org/tutorials/advanced/custom_ops_landing_page.html for more details) or, if it is traceable, use `torch.compiler.allow_in_graph`.
292
+ torch._dynamo.utils.warn_once(explanation + "\n" + "\n".join(hints))
293
+ [titan] 2026-01-02 12:31:16,003 - root - INFO - step: 1 loss: 14.3857 memory: 65.22GiB(82.29%) tps: 154 tflops: 14.12 mfu: 4.53%
294
+ [titan] 2026-01-02 12:31:16,003 - root - INFO - lr: 7.8125e-07 gnorm: 129.00 [ 0:07:04<150 days, 23:34:08]
295
+ [titan] 2026-01-02 12:31:16,003 - root - INFO - Saving the checkpoint (or staging if async is enabled).
296
+ [titan] 2026-01-02 12:31:45,811 - root - INFO - [GC] GC collection invoked by checkpointer. 0.43 seconds.
297
+ [titan] 2026-01-02 12:31:45,811 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 29.81 seconds.
298
+ [titan] 2026-01-02 12:31:45,811 - root - INFO - Synchronizing and adjusting timeout for all ProcessGroups to 0:01:40
299
+ [titan] 2026-01-02 12:32:27,288 - root - INFO - step: 2 loss: 14.3989 memory: 69.11GiB(87.20%) tps: 919 tflops: 84.11 mfu: 26.96%
300
+ [titan] 2026-01-02 12:32:27,288 - root - INFO - lr: 1.1719e-06 gnorm: 127.00 [ 0:08:15<88 days, 3:51:08]
301
+ [titan] 2026-01-02 12:33:08,758 - root - INFO - step: 3 loss: 14.3929 memory: 69.11GiB(87.20%) tps: 1,580 tflops: 144.59 mfu: 46.34%
302
+ [titan] 2026-01-02 12:33:08,759 - root - INFO - lr: 1.5625e-06 gnorm: 126.00 [ 0:08:57<63 days, 16:28:15]
303
+ [titan] 2026-01-02 12:33:50,300 - root - INFO - step: 4 loss: 14.2932 memory: 69.11GiB(87.20%) tps: 1,578 tflops: 144.34 mfu: 46.26%
304
+ [titan] 2026-01-02 12:33:50,300 - root - INFO - lr: 1.9531e-06 gnorm: 128.00 [ 0:09:38<51 days, 10:55:32]
305
+ [titan] 2026-01-02 12:34:31,898 - root - INFO - step: 5 loss: 14.2689 memory: 69.11GiB(87.20%) tps: 1,575 tflops: 144.14 mfu: 46.20%
306
+ [titan] 2026-01-02 12:34:31,898 - root - INFO - lr: 2.3438e-06 gnorm: 124.00 [ 0:10:20<44 days, 2:53:21]
307
+ [titan] 2026-01-02 12:35:13,509 - root - INFO - step: 6 loss: 13.9979 memory: 69.11GiB(87.20%) tps: 1,575 tflops: 144.10 mfu: 46.19%
308
+ [titan] 2026-01-02 12:35:13,509 - root - INFO - lr: 2.7344e-06 gnorm: 117.00 [ 0:11:02<39 days, 5:32:50]
309
+ [titan] 2026-01-02 12:35:55,155 - root - INFO - step: 7 loss: 13.8167 memory: 69.11GiB(87.20%) tps: 1,574 tflops: 143.98 mfu: 46.15%
310
+ [titan] 2026-01-02 12:35:55,155 - root - INFO - lr: 3.1250e-06 gnorm: 113.00 [ 0:11:43<35 days, 17:46:13]
311
+ [titan] 2026-01-02 12:36:36,792 - root - INFO - step: 8 loss: 13.5683 memory: 69.11GiB(87.20%) tps: 1,574 tflops: 144.01 mfu: 46.16%
312
+ [titan] 2026-01-02 12:36:36,792 - root - INFO - lr: 3.5156e-06 gnorm: 106.50 [ 0:12:25<33 days, 2:55:31]
313
+ [titan] 2026-01-02 12:37:18,455 - root - INFO - step: 9 loss: 13.3760 memory: 69.11GiB(87.20%) tps: 1,573 tflops: 143.92 mfu: 46.13%
314
+ [titan] 2026-01-02 12:37:18,456 - root - INFO - lr: 3.9063e-06 gnorm: 101.00 [ 0:13:07<31 days, 2:04:05]
315
+ [titan] 2026-01-02 12:38:00,109 - root - INFO - step: 10 loss: 13.1097 memory: 69.11GiB(87.20%) tps: 1,573 tflops: 143.95 mfu: 46.14%
316
+ [titan] 2026-01-02 12:38:00,110 - root - INFO - lr: 4.2969e-06 gnorm: 94.50 [ 0:13:48<29 days, 10:58:21]
317
+ [titan] 2026-01-02 12:38:41,790 - root - INFO - step: 11 loss: 12.5536 memory: 69.11GiB(87.20%) tps: 1,572 tflops: 143.86 mfu: 46.11%
318
+ [titan] 2026-01-02 12:38:41,790 - root - INFO - lr: 4.6875e-06 gnorm: 82.00 [ 0:14:30<28 days, 3:00:11]
319
+ [titan] 2026-01-02 12:39:23,448 - root - INFO - step: 12 loss: 12.0247 memory: 69.11GiB(87.20%) tps: 1,573 tflops: 143.93 mfu: 46.13%
320
+ [titan] 2026-01-02 12:39:23,449 - root - INFO - lr: 5.0781e-06 gnorm: 71.50 [ 0:15:12<27 days, 0:20:40]
321
+ [titan] 2026-01-02 12:40:05,132 - root - INFO - step: 13 loss: 11.6076 memory: 69.11GiB(87.20%) tps: 1,572 tflops: 143.85 mfu: 46.11%
322
+ [titan] 2026-01-02 12:40:05,132 - root - INFO - lr: 5.4687e-06 gnorm: 68.50 [ 0:15:53<26 days, 1:48:07]
323
+ [titan] 2026-01-02 12:40:46,797 - root - INFO - step: 14 loss: 11.2488 memory: 69.11GiB(87.20%) tps: 1,573 tflops: 143.91 mfu: 46.13%
324
+ [titan] 2026-01-02 12:40:46,797 - root - INFO - lr: 5.8594e-06 gnorm: 63.75 [ 0:16:35<25 days, 6:28:00]
325
+ [titan] 2026-01-02 12:41:28,477 - root - INFO - step: 15 loss: 10.9254 memory: 69.11GiB(87.20%) tps: 1,572 tflops: 143.86 mfu: 46.11%
326
+ [titan] 2026-01-02 12:41:28,477 - root - INFO - lr: 6.2500e-06 gnorm: 55.50 [ 0:17:17<24 days, 13:43:00]
327
+ [titan] 2026-01-02 12:42:10,134 - root - INFO - step: 16 loss: 10.6961 memory: 69.11GiB(87.20%) tps: 1,573 tflops: 143.94 mfu: 46.13%
328
+ [titan] 2026-01-02 12:42:10,134 - root - INFO - lr: 6.6406e-06 gnorm: 56.50 [ 0:17:58<23 days, 23:02:48]
329
+ [titan] 2026-01-02 12:42:51,815 - root - INFO - step: 17 loss: 10.3915 memory: 69.11GiB(87.20%) tps: 1,572 tflops: 143.86 mfu: 46.11%
330
+ [titan] 2026-01-02 12:42:51,816 - root - INFO - lr: 7.0313e-06 gnorm: 42.75 [ 0:18:40<23 days, 10:06:47]
331
+ [titan] 2026-01-02 12:43:33,461 - root - INFO - step: 18 loss: 10.1740 memory: 69.11GiB(87.20%) tps: 1,574 tflops: 143.98 mfu: 46.15%
332
+ [titan] 2026-01-02 12:43:33,461 - root - INFO - lr: 7.4219e-06 gnorm: 32.75 [ 0:19:22<22 days, 22:35:55]
logs/none_rci5peh0/attempt_0/4/stdout.log ADDED
File without changes
logs/none_rci5peh0/attempt_0/5/stderr.log ADDED
@@ -0,0 +1,332 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [titan] 2026-01-02 12:21:12,074 - root - INFO - Starting job: default job
2
+ [titan] 2026-01-02 12:21:12,074 - root - INFO - {
3
+ "activation_checkpoint": {
4
+ "mode": "none",
5
+ "selective_ac_option": "2"
6
+ },
7
+ "activation_offload": {
8
+ "mode": "none"
9
+ },
10
+ "checkpoint": {
11
+ "async_mode": "disabled",
12
+ "create_seed_checkpoint": false,
13
+ "enable_checkpoint": true,
14
+ "exclude_from_loading": [],
15
+ "export_dtype": "float32",
16
+ "folder": "checkpoint",
17
+ "initial_load_model_weights_only": true,
18
+ "initial_load_path": null,
19
+ "interval": 3072,
20
+ "interval_type": "steps",
21
+ "keep_latest_k": 0,
22
+ "last_save_model_weights_only": false,
23
+ "load_step": -1,
24
+ "model_weights_only": false
25
+ },
26
+ "comm": {
27
+ "init_timeout_seconds": 300,
28
+ "trace_buf_size": 20000,
29
+ "train_timeout_seconds": 100
30
+ },
31
+ "experimental": {
32
+ "context_parallel_degree": 1,
33
+ "context_parallel_rotate_method": "allgather",
34
+ "custom_model_path": "",
35
+ "enable_async_tensor_parallel": false,
36
+ "enable_compiled_autograd": false,
37
+ "pipeline_parallel_degree": 1,
38
+ "pipeline_parallel_microbatches": null,
39
+ "pipeline_parallel_schedule": "1F1B",
40
+ "pipeline_parallel_schedule_csv": "",
41
+ "pipeline_parallel_split_points": []
42
+ },
43
+ "fault_tolerance": {
44
+ "enable": false,
45
+ "group_size": 0,
46
+ "min_replica_size": 1,
47
+ "replica_id": 0
48
+ },
49
+ "float8": {
50
+ "enable_fsdp_float8_all_gather": false,
51
+ "force_recompute_fp8_weight_in_bwd": false,
52
+ "precompute_float8_dynamic_scale_for_fsdp": false,
53
+ "recipe_name": null
54
+ },
55
+ "job": {
56
+ "config_file": "flame/models/fla.toml",
57
+ "description": "default job",
58
+ "dump_folder": "/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B",
59
+ "print_args": true,
60
+ "use_for_integration_test": false
61
+ },
62
+ "lr_scheduler": {
63
+ "decay_ratio": null,
64
+ "decay_type": "cosine",
65
+ "lr_min": 0.1,
66
+ "warmup_steps": 1024
67
+ },
68
+ "memory_estimation": {
69
+ "disable_fake_mode": false,
70
+ "enabled": false
71
+ },
72
+ "metrics": {
73
+ "disable_color_printing": false,
74
+ "enable_tensorboard": false,
75
+ "enable_wandb": true,
76
+ "log_freq": 1,
77
+ "save_for_all_ranks": false,
78
+ "save_tb_folder": "tb"
79
+ },
80
+ "model": {
81
+ "config": "/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/config.json",
82
+ "converters": [],
83
+ "name": "fla",
84
+ "print_after_conversion": false,
85
+ "tokenizer_path": "/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B"
86
+ },
87
+ "optimizer": {
88
+ "beta1": 0.9,
89
+ "beta2": 0.95,
90
+ "early_step_in_backward": false,
91
+ "eps": 1e-15,
92
+ "implementation": "fused",
93
+ "lr": 0.0004,
94
+ "name": "AdamW",
95
+ "weight_decay": 0.1
96
+ },
97
+ "profiling": {
98
+ "enable_memory_snapshot": false,
99
+ "enable_profiling": true,
100
+ "profile_freq": 512,
101
+ "save_memory_snapshot_folder": "memory_snapshot",
102
+ "save_traces_folder": "profile_trace"
103
+ },
104
+ "training": {
105
+ "batch_size": 2,
106
+ "compile": true,
107
+ "context_len": 2048,
108
+ "data_dir": null,
109
+ "data_files": null,
110
+ "data_parallel_replicate_degree": 1,
111
+ "data_parallel_shard_degree": 8,
112
+ "data_probs": null,
113
+ "dataset": "/mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu",
114
+ "dataset_name": "default",
115
+ "dataset_split": "train",
116
+ "deterministic": false,
117
+ "disable_loss_parallel": true,
118
+ "enable_cpu_offload": false,
119
+ "fsdp_reshard_after_forward": "default",
120
+ "gc_freq": 50,
121
+ "gradient_accumulation_steps": 16,
122
+ "max_norm": 1.0,
123
+ "mixed_precision_param": "bfloat16",
124
+ "mixed_precision_reduce": "float32",
125
+ "num_workers": 8,
126
+ "persistent_workers": false,
127
+ "pin_memory": false,
128
+ "prefetch_factor": 2,
129
+ "seed": 42,
130
+ "seq_len": 2048,
131
+ "skip_nan_inf": true,
132
+ "steps": 30720,
133
+ "streaming": true,
134
+ "tensor_parallel_degree": 1,
135
+ "varlen": false
136
+ }
137
+ }
138
+ [titan] 2026-01-02 12:21:12,075 - root - INFO - [GC] Initial GC collection. 0.00 seconds.
139
+ [titan] 2026-01-02 12:21:13,342 - root - WARNING - ENV[TORCH_NCCL_ASYNC_ERROR_HANDLING] = 1 will be overridden to 3 based on job config
140
+ [titan] 2026-01-02 12:21:13,353 - root - INFO - CUDA capacity: NVIDIA A100-SXM4-80GB with 79.25GiB memory
141
+ [titan] 2026-01-02 12:21:13,355 - root - WARNING - Error running lspci: [Errno 2] No such file or directory: 'lspci', fallback to use device_name
142
+ [titan] 2026-01-02 12:21:13,355 - root - INFO - Peak FLOPS used for computing MFU: 3.120e+14
143
+ [titan] 2026-01-02 12:21:13,355 - root - INFO - Building 1-D device mesh with ['dp_shard'], [8]
144
+ [titan] 2026-01-02 12:21:13,421 - root - INFO - Loading tokenizer...
145
+ The tokenizer you are loading from '/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.
146
+ [titan] 2026-01-02 12:21:13,761 - root - INFO - Qwen2TokenizerFast(name_or_path='/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B', vocab_size=151643, model_max_length=10000000000, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'eos_token': '<|im_end|>', 'pad_token': '<|endoftext|>', 'additional_special_tokens': ['<|im_start|>', '<|im_end|>', '<|object_ref_start|>', '<|object_ref_end|>', '<|box_start|>', '<|box_end|>', '<|quad_start|>', '<|quad_end|>', '<|vision_start|>', '<|vision_end|>', '<|vision_pad|>', '<|image_pad|>', '<|video_pad|>']}, clean_up_tokenization_spaces=False, added_tokens_decoder={
147
+ 151643: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
148
+ 151644: AddedToken("<|im_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
149
+ 151645: AddedToken("<|im_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
150
+ 151646: AddedToken("<|object_ref_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
151
+ 151647: AddedToken("<|object_ref_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
152
+ 151648: AddedToken("<|box_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
153
+ 151649: AddedToken("<|box_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
154
+ 151650: AddedToken("<|quad_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
155
+ 151651: AddedToken("<|quad_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
156
+ 151652: AddedToken("<|vision_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
157
+ 151653: AddedToken("<|vision_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
158
+ 151654: AddedToken("<|vision_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
159
+ 151655: AddedToken("<|image_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
160
+ 151656: AddedToken("<|video_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
161
+ 151657: AddedToken("<tool_call>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
162
+ 151658: AddedToken("</tool_call>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
163
+ 151659: AddedToken("<|fim_prefix|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
164
+ 151660: AddedToken("<|fim_middle|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
165
+ 151661: AddedToken("<|fim_suffix|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
166
+ 151662: AddedToken("<|fim_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
167
+ 151663: AddedToken("<|repo_name|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
168
+ 151664: AddedToken("<|file_sep|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
169
+ 151665: AddedToken("<tool_response>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
170
+ 151666: AddedToken("</tool_response>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
171
+ 151667: AddedToken("<think>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
172
+ 151668: AddedToken("</think>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
173
+ }
174
+ )
175
+ [titan] 2026-01-02 12:21:13,761 - root - INFO - Loading dataset /mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu:default
176
+ `trust_remote_code` is not supported anymore.
177
+ Please check that the Hugging Face dataset '/mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu' isn't based on a loading script and remove `trust_remote_code`.
178
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
179
+ [titan] 2026-01-02 12:21:13,761 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
180
+ Please check that the Hugging Face dataset '/mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu' isn't based on a loading script and remove `trust_remote_code`.
181
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
182
+ [titan] 2026-01-02 12:21:14,322 - root - INFO - Shuffling the dataset with seed 42
183
+ [titan] 2026-01-02 12:21:14,323 - root - INFO - IterableDataset({
184
+ features: ['text', 'id', 'dump', 'url', 'file_path', 'language', 'language_score', 'token_count', 'score', 'int_score'],
185
+ num_shards: 360
186
+ })
187
+ [titan] 2026-01-02 12:21:14,323 - root - INFO - Building dataloader...
188
+ [titan] 2026-01-02 12:21:14,325 - root - INFO - Loading model config from /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/config.json
189
+ [titan] 2026-01-02 12:21:14,327 - root - INFO - Building model from the config
190
+ GSAConfig {
191
+ "architectures": [
192
+ "GSAForCausalLM"
193
+ ],
194
+ "attn": null,
195
+ "bos_token_id": 151643,
196
+ "clamp_max": null,
197
+ "clamp_min": null,
198
+ "conv_size": 4,
199
+ "dtype": "bfloat16",
200
+ "elementwise_affine": false,
201
+ "eos_token_id": 151645,
202
+ "expand_k": 1,
203
+ "expand_v": 1,
204
+ "feature_map": "swish",
205
+ "fuse_cross_entropy": true,
206
+ "fuse_linear_cross_entropy": false,
207
+ "fuse_norm": true,
208
+ "fuse_swiglu": true,
209
+ "gate_logit_normalizer": 8,
210
+ "hidden_act": "swish",
211
+ "hidden_ratio": 4,
212
+ "hidden_size": 5120,
213
+ "initializer_range": 0.02,
214
+ "intermediate_size": 17408,
215
+ "max_position_embeddings": 40960,
216
+ "model_type": "gsa",
217
+ "norm_eps": 1e-06,
218
+ "num_heads": 40,
219
+ "num_hidden_layers": 40,
220
+ "num_kv_heads": 8,
221
+ "num_slots": 256,
222
+ "rope_theta": 1000000,
223
+ "share_conv_kernel": true,
224
+ "tie_word_embeddings": true,
225
+ "transformers_version": "4.57.3",
226
+ "use_cache": true,
227
+ "use_l2warp": false,
228
+ "use_norm": true,
229
+ "use_output_gate": true,
230
+ "use_rope": false,
231
+ "use_short_conv": false,
232
+ "vocab_size": 151936
233
+ }
234
+ 
235
+ [titan] 2026-01-02 12:21:14,480 - root - INFO - 
236
+ GSAForCausalLM(
237
+ (model): GSAModel(
238
+ (embeddings): Embedding(151936, 5120)
239
+ (layers): ModuleList(
240
+ (0-39): 40 x GSABlock(
241
+ (attn_norm): RMSNorm(5120, eps=1e-06)
242
+ (attn): GatedSlotAttention(
243
+ (feature_map): SwishFeatureMap()
244
+ (q_proj): Linear(in_features=5120, out_features=5120, bias=False)
245
+ (k_proj): Linear(in_features=5120, out_features=1024, bias=False)
246
+ (v_proj): Linear(in_features=5120, out_features=1024, bias=False)
247
+ (f_proj): Linear(in_features=5120, out_features=2048, bias=False)
248
+ (g_norm): RMSNorm(5120, elementwise_affine=False, eps=1e-06)
249
+ (o_proj): Linear(in_features=5120, out_features=5120, bias=False)
250
+ )
251
+ (mlp_norm): RMSNorm(5120, eps=1e-06)
252
+ (mlp): GatedMLP(
253
+ (gate_proj): Linear(in_features=5120, out_features=17408, bias=False)
254
+ (up_proj): Linear(in_features=5120, out_features=17408, bias=False)
255
+ (down_proj): Linear(in_features=17408, out_features=5120, bias=False)
256
+ (swiglu_linear): SwiGLULinear()
257
+ )
258
+ )
259
+ )
260
+ (norm): RMSNorm(5120, eps=1e-06)
261
+ )
262
+ (lm_head): Linear(in_features=5120, out_features=151936, bias=False)
263
+ )
264
+
265
+ [titan] 2026-01-02 12:21:14,540 - root - INFO - Compiling each block with torch.compile
266
+ [titan] 2026-01-02 12:21:14,540 - root - INFO - Compiling the embedding, norm, and lm_head layers with torch.compile
267
+ [titan] 2026-01-02 12:21:14,541 - root - INFO - Compiling the entire model with torch.compile
268
+ [titan] 2026-01-02 12:21:14,681 - root - INFO - Applied FSDP to the model
269
+ [titan] 2026-01-02 12:21:15,136 - root - INFO - CUDA memory usage for model: 3.56GiB(4.49%)
270
+ [titan] 2026-01-02 12:21:15,183 - root - INFO - Checkpointing active. Checkpoints will be loaded from and saved to /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/checkpoint
271
+ [titan] 2026-01-02 12:21:15,183 - root - INFO - Loading the checkpoint from /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/checkpoint/step-0.
272
+ [titan] 2026-01-02 12:24:11,330 - root - INFO - [GC] GC collection for checkpoint loading. 0.01 seconds.
273
+ [titan] 2026-01-02 12:24:11,331 - root - INFO - Finished loading the checkpoint in 176.15 seconds.
274
+ [titan] 2026-01-02 12:24:11,348 - root - INFO - CUDA capacity: NVIDIA A100-SXM4-80GB with 79.25GiB memory
275
+ [titan] 2026-01-02 12:24:11,351 - root - WARNING - Error running lspci: [Errno 2] No such file or directory: 'lspci', fallback to use device_name
276
+ [titan] 2026-01-02 12:24:11,534 - root - INFO - Mixed precision training is handled by fully_shard
277
+ [titan] 2026-01-02 12:24:11,535 - root - INFO - ***** Running training *****
278
+ [titan] 2026-01-02 12:24:11,535 - root - INFO -  Training starts at step 1
279
+ [titan] 2026-01-02 12:24:11,535 - root - INFO -  Number of tokens per sequence = 2,048
280
+ [titan] 2026-01-02 12:24:11,535 - root - INFO -  Gradient Accumulation steps = 16
281
+ [titan] 2026-01-02 12:24:11,535 - root - INFO -  Instantaneous batch size (per device) = 2
282
+ [titan] 2026-01-02 12:24:11,535 - root - INFO -  Global batch size (w. parallel, distributed & accumulation) = 256 (524,288 tokens)
283
+ [titan] 2026-01-02 12:24:11,535 - root - INFO -  Total optimization steps = 30,720 (16,106,127,360 tokens)
284
+ [titan] 2026-01-02 12:24:11,535 - root - INFO -  Warmup steps = 1,024 (536,870,912 tokens)
285
+ [titan] 2026-01-02 12:24:11,535 - root - INFO -  Number of parameters = 14,409,815,040 
286
+ [titan] 2026-01-02 12:24:11,535 - root - INFO - Profiling active. Traces will be saved at /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/profile_trace
287
+ /mnt/scratch/home/carlos/leman-labs/flame/.venv/lib/python3.13/site-packages/torch/_dynamo/variables/functions.py:1692: UserWarning: Dynamo detected a call to a `functools.lru_cache`-wrapped function. Dynamo ignores the cache wrapper and directly traces the wrapped function. Silent incorrectness is only a *potential* risk, not something we have observed. Enable TORCH_LOGS="+dynamo" for a DEBUG stack trace.
288
+ torch._dynamo.utils.warn_once(msg)
289
+ /mnt/scratch/home/carlos/leman-labs/flame/.venv/lib/python3.13/site-packages/torch/_dynamo/variables/functions.py:1598: UserWarning: Dynamo does not know how to trace the builtin `cuda_utils.get_device_properties.` This function is either a Python builtin (e.g. _warnings.warn) or a third-party C/C++ Python extension (perhaps created with pybind).
290
+ If it is a Python builtin, please file an issue on GitHub so the PyTorch team can add support for it and see the next case for a workaround.
291
+ If it is a third-party C/C++ Python extension, please either wrap it into a PyTorch-understood custom operator (see https://pytorch.org/tutorials/advanced/custom_ops_landing_page.html for more details) or, if it is traceable, use `torch.compiler.allow_in_graph`.
292
+ torch._dynamo.utils.warn_once(explanation + "\n" + "\n".join(hints))
293
+ [titan] 2026-01-02 12:31:16,003 - root - INFO - step: 1 loss: 14.3857 memory: 65.22GiB(82.29%) tps: 154 tflops: 14.12 mfu: 4.53%
294
+ [titan] 2026-01-02 12:31:16,003 - root - INFO - lr: 7.8125e-07 gnorm: 129.00 [ 0:07:04<150 days, 23:34:27]
295
+ [titan] 2026-01-02 12:31:16,003 - root - INFO - Saving the checkpoint (or staging if async is enabled).
296
+ [titan] 2026-01-02 12:31:45,842 - root - INFO - [GC] GC collection invoked by checkpointer. 0.46 seconds.
297
+ [titan] 2026-01-02 12:31:45,842 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 29.84 seconds.
298
+ [titan] 2026-01-02 12:31:45,842 - root - INFO - Synchronizing and adjusting timeout for all ProcessGroups to 0:01:40
299
+ [titan] 2026-01-02 12:32:27,287 - root - INFO - step: 2 loss: 14.3989 memory: 69.11GiB(87.20%) tps: 919 tflops: 84.11 mfu: 26.96%
300
+ [titan] 2026-01-02 12:32:27,288 - root - INFO - lr: 1.1719e-06 gnorm: 127.00 [ 0:08:15<88 days, 3:51:19]
301
+ [titan] 2026-01-02 12:33:08,758 - root - INFO - step: 3 loss: 14.3929 memory: 69.11GiB(87.20%) tps: 1,580 tflops: 144.59 mfu: 46.34%
302
+ [titan] 2026-01-02 12:33:08,759 - root - INFO - lr: 1.5625e-06 gnorm: 126.00 [ 0:08:57<63 days, 16:28:22]
303
+ [titan] 2026-01-02 12:33:50,300 - root - INFO - step: 4 loss: 14.2932 memory: 69.11GiB(87.20%) tps: 1,578 tflops: 144.34 mfu: 46.26%
304
+ [titan] 2026-01-02 12:33:50,300 - root - INFO - lr: 1.9531e-06 gnorm: 128.00 [ 0:09:38<51 days, 10:55:38]
305
+ [titan] 2026-01-02 12:34:31,898 - root - INFO - step: 5 loss: 14.2689 memory: 69.11GiB(87.20%) tps: 1,575 tflops: 144.14 mfu: 46.20%
306
+ [titan] 2026-01-02 12:34:31,898 - root - INFO - lr: 2.3438e-06 gnorm: 124.00 [ 0:10:20<44 days, 2:53:26]
307
+ [titan] 2026-01-02 12:35:13,509 - root - INFO - step: 6 loss: 13.9979 memory: 69.11GiB(87.20%) tps: 1,575 tflops: 144.10 mfu: 46.19%
308
+ [titan] 2026-01-02 12:35:13,509 - root - INFO - lr: 2.7344e-06 gnorm: 117.00 [ 0:11:02<39 days, 5:32:54]
309
+ [titan] 2026-01-02 12:35:55,155 - root - INFO - step: 7 loss: 13.8167 memory: 69.11GiB(87.20%) tps: 1,574 tflops: 143.98 mfu: 46.15%
310
+ [titan] 2026-01-02 12:35:55,155 - root - INFO - lr: 3.1250e-06 gnorm: 113.00 [ 0:11:43<35 days, 17:46:18]
311
+ [titan] 2026-01-02 12:36:36,792 - root - INFO - step: 8 loss: 13.5683 memory: 69.11GiB(87.20%) tps: 1,574 tflops: 144.01 mfu: 46.16%
312
+ [titan] 2026-01-02 12:36:36,792 - root - INFO - lr: 3.5156e-06 gnorm: 106.50 [ 0:12:25<33 days, 2:55:35]
313
+ [titan] 2026-01-02 12:37:18,455 - root - INFO - step: 9 loss: 13.3760 memory: 69.11GiB(87.20%) tps: 1,573 tflops: 143.92 mfu: 46.13%
314
+ [titan] 2026-01-02 12:37:18,456 - root - INFO - lr: 3.9063e-06 gnorm: 101.00 [ 0:13:07<31 days, 2:04:10]
315
+ [titan] 2026-01-02 12:38:00,109 - root - INFO - step: 10 loss: 13.1097 memory: 69.11GiB(87.20%) tps: 1,573 tflops: 143.95 mfu: 46.14%
316
+ [titan] 2026-01-02 12:38:00,110 - root - INFO - lr: 4.2969e-06 gnorm: 94.50 [ 0:13:48<29 days, 10:58:25]
317
+ [titan] 2026-01-02 12:38:41,790 - root - INFO - step: 11 loss: 12.5536 memory: 69.11GiB(87.20%) tps: 1,572 tflops: 143.86 mfu: 46.11%
318
+ [titan] 2026-01-02 12:38:41,790 - root - INFO - lr: 4.6875e-06 gnorm: 82.00 [ 0:14:30<28 days, 3:00:15]
319
+ [titan] 2026-01-02 12:39:23,448 - root - INFO - step: 12 loss: 12.0247 memory: 69.11GiB(87.20%) tps: 1,573 tflops: 143.93 mfu: 46.13%
320
+ [titan] 2026-01-02 12:39:23,449 - root - INFO - lr: 5.0781e-06 gnorm: 71.50 [ 0:15:12<27 days, 0:20:44]
321
+ [titan] 2026-01-02 12:40:05,132 - root - INFO - step: 13 loss: 11.6076 memory: 69.11GiB(87.20%) tps: 1,572 tflops: 143.85 mfu: 46.10%
322
+ [titan] 2026-01-02 12:40:05,132 - root - INFO - lr: 5.4687e-06 gnorm: 68.50 [ 0:15:53<26 days, 1:48:11]
323
+ [titan] 2026-01-02 12:40:46,797 - root - INFO - step: 14 loss: 11.2488 memory: 69.11GiB(87.20%) tps: 1,573 tflops: 143.91 mfu: 46.13%
324
+ [titan] 2026-01-02 12:40:46,797 - root - INFO - lr: 5.8594e-06 gnorm: 63.75 [ 0:16:35<25 days, 6:28:04]
325
+ [titan] 2026-01-02 12:41:28,477 - root - INFO - step: 15 loss: 10.9254 memory: 69.11GiB(87.20%) tps: 1,572 tflops: 143.86 mfu: 46.11%
326
+ [titan] 2026-01-02 12:41:28,477 - root - INFO - lr: 6.2500e-06 gnorm: 55.50 [ 0:17:17<24 days, 13:43:04]
327
+ [titan] 2026-01-02 12:42:10,134 - root - INFO - step: 16 loss: 10.6961 memory: 69.11GiB(87.20%) tps: 1,573 tflops: 143.94 mfu: 46.13%
328
+ [titan] 2026-01-02 12:42:10,134 - root - INFO - lr: 6.6406e-06 gnorm: 56.50 [ 0:17:58<23 days, 23:02:52]
329
+ [titan] 2026-01-02 12:42:51,815 - root - INFO - step: 17 loss: 10.3915 memory: 69.11GiB(87.20%) tps: 1,572 tflops: 143.86 mfu: 46.11%
330
+ [titan] 2026-01-02 12:42:51,815 - root - INFO - lr: 7.0313e-06 gnorm: 42.75 [ 0:18:40<23 days, 10:06:52]
331
+ [titan] 2026-01-02 12:43:33,461 - root - INFO - step: 18 loss: 10.1740 memory: 69.11GiB(87.20%) tps: 1,574 tflops: 143.98 mfu: 46.15%
332
+ [titan] 2026-01-02 12:43:33,461 - root - INFO - lr: 7.4219e-06 gnorm: 32.75 [ 0:19:22<22 days, 22:35:59]
logs/none_rci5peh0/attempt_0/5/stdout.log ADDED
File without changes
logs/none_rci5peh0/attempt_0/6/stderr.log ADDED
@@ -0,0 +1,332 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [titan] 2026-01-02 12:21:12,073 - root - INFO - Starting job: default job
2
+ [titan] 2026-01-02 12:21:12,073 - root - INFO - {
3
+ "activation_checkpoint": {
4
+ "mode": "none",
5
+ "selective_ac_option": "2"
6
+ },
7
+ "activation_offload": {
8
+ "mode": "none"
9
+ },
10
+ "checkpoint": {
11
+ "async_mode": "disabled",
12
+ "create_seed_checkpoint": false,
13
+ "enable_checkpoint": true,
14
+ "exclude_from_loading": [],
15
+ "export_dtype": "float32",
16
+ "folder": "checkpoint",
17
+ "initial_load_model_weights_only": true,
18
+ "initial_load_path": null,
19
+ "interval": 3072,
20
+ "interval_type": "steps",
21
+ "keep_latest_k": 0,
22
+ "last_save_model_weights_only": false,
23
+ "load_step": -1,
24
+ "model_weights_only": false
25
+ },
26
+ "comm": {
27
+ "init_timeout_seconds": 300,
28
+ "trace_buf_size": 20000,
29
+ "train_timeout_seconds": 100
30
+ },
31
+ "experimental": {
32
+ "context_parallel_degree": 1,
33
+ "context_parallel_rotate_method": "allgather",
34
+ "custom_model_path": "",
35
+ "enable_async_tensor_parallel": false,
36
+ "enable_compiled_autograd": false,
37
+ "pipeline_parallel_degree": 1,
38
+ "pipeline_parallel_microbatches": null,
39
+ "pipeline_parallel_schedule": "1F1B",
40
+ "pipeline_parallel_schedule_csv": "",
41
+ "pipeline_parallel_split_points": []
42
+ },
43
+ "fault_tolerance": {
44
+ "enable": false,
45
+ "group_size": 0,
46
+ "min_replica_size": 1,
47
+ "replica_id": 0
48
+ },
49
+ "float8": {
50
+ "enable_fsdp_float8_all_gather": false,
51
+ "force_recompute_fp8_weight_in_bwd": false,
52
+ "precompute_float8_dynamic_scale_for_fsdp": false,
53
+ "recipe_name": null
54
+ },
55
+ "job": {
56
+ "config_file": "flame/models/fla.toml",
57
+ "description": "default job",
58
+ "dump_folder": "/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B",
59
+ "print_args": true,
60
+ "use_for_integration_test": false
61
+ },
62
+ "lr_scheduler": {
63
+ "decay_ratio": null,
64
+ "decay_type": "cosine",
65
+ "lr_min": 0.1,
66
+ "warmup_steps": 1024
67
+ },
68
+ "memory_estimation": {
69
+ "disable_fake_mode": false,
70
+ "enabled": false
71
+ },
72
+ "metrics": {
73
+ "disable_color_printing": false,
74
+ "enable_tensorboard": false,
75
+ "enable_wandb": true,
76
+ "log_freq": 1,
77
+ "save_for_all_ranks": false,
78
+ "save_tb_folder": "tb"
79
+ },
80
+ "model": {
81
+ "config": "/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/config.json",
82
+ "converters": [],
83
+ "name": "fla",
84
+ "print_after_conversion": false,
85
+ "tokenizer_path": "/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B"
86
+ },
87
+ "optimizer": {
88
+ "beta1": 0.9,
89
+ "beta2": 0.95,
90
+ "early_step_in_backward": false,
91
+ "eps": 1e-15,
92
+ "implementation": "fused",
93
+ "lr": 0.0004,
94
+ "name": "AdamW",
95
+ "weight_decay": 0.1
96
+ },
97
+ "profiling": {
98
+ "enable_memory_snapshot": false,
99
+ "enable_profiling": true,
100
+ "profile_freq": 512,
101
+ "save_memory_snapshot_folder": "memory_snapshot",
102
+ "save_traces_folder": "profile_trace"
103
+ },
104
+ "training": {
105
+ "batch_size": 2,
106
+ "compile": true,
107
+ "context_len": 2048,
108
+ "data_dir": null,
109
+ "data_files": null,
110
+ "data_parallel_replicate_degree": 1,
111
+ "data_parallel_shard_degree": 8,
112
+ "data_probs": null,
113
+ "dataset": "/mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu",
114
+ "dataset_name": "default",
115
+ "dataset_split": "train",
116
+ "deterministic": false,
117
+ "disable_loss_parallel": true,
118
+ "enable_cpu_offload": false,
119
+ "fsdp_reshard_after_forward": "default",
120
+ "gc_freq": 50,
121
+ "gradient_accumulation_steps": 16,
122
+ "max_norm": 1.0,
123
+ "mixed_precision_param": "bfloat16",
124
+ "mixed_precision_reduce": "float32",
125
+ "num_workers": 8,
126
+ "persistent_workers": false,
127
+ "pin_memory": false,
128
+ "prefetch_factor": 2,
129
+ "seed": 42,
130
+ "seq_len": 2048,
131
+ "skip_nan_inf": true,
132
+ "steps": 30720,
133
+ "streaming": true,
134
+ "tensor_parallel_degree": 1,
135
+ "varlen": false
136
+ }
137
+ }
138
+ [titan] 2026-01-02 12:21:12,074 - root - INFO - [GC] Initial GC collection. 0.00 seconds.
139
+ [titan] 2026-01-02 12:21:13,360 - root - WARNING - ENV[TORCH_NCCL_ASYNC_ERROR_HANDLING] = 1 will be overridden to 3 based on job config
140
+ [titan] 2026-01-02 12:21:13,363 - root - INFO - CUDA capacity: NVIDIA A100-SXM4-80GB with 79.25GiB memory
141
+ [titan] 2026-01-02 12:21:13,365 - root - WARNING - Error running lspci: [Errno 2] No such file or directory: 'lspci', fallback to use device_name
142
+ [titan] 2026-01-02 12:21:13,365 - root - INFO - Peak FLOPS used for computing MFU: 3.120e+14
143
+ [titan] 2026-01-02 12:21:13,366 - root - INFO - Building 1-D device mesh with ['dp_shard'], [8]
144
+ [titan] 2026-01-02 12:21:13,423 - root - INFO - Loading tokenizer...
145
+ The tokenizer you are loading from '/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.
146
+ [titan] 2026-01-02 12:21:13,777 - root - INFO - Qwen2TokenizerFast(name_or_path='/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B', vocab_size=151643, model_max_length=10000000000, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'eos_token': '<|im_end|>', 'pad_token': '<|endoftext|>', 'additional_special_tokens': ['<|im_start|>', '<|im_end|>', '<|object_ref_start|>', '<|object_ref_end|>', '<|box_start|>', '<|box_end|>', '<|quad_start|>', '<|quad_end|>', '<|vision_start|>', '<|vision_end|>', '<|vision_pad|>', '<|image_pad|>', '<|video_pad|>']}, clean_up_tokenization_spaces=False, added_tokens_decoder={
147
+ 151643: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
148
+ 151644: AddedToken("<|im_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
149
+ 151645: AddedToken("<|im_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
150
+ 151646: AddedToken("<|object_ref_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
151
+ 151647: AddedToken("<|object_ref_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
152
+ 151648: AddedToken("<|box_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
153
+ 151649: AddedToken("<|box_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
154
+ 151650: AddedToken("<|quad_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
155
+ 151651: AddedToken("<|quad_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
156
+ 151652: AddedToken("<|vision_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
157
+ 151653: AddedToken("<|vision_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
158
+ 151654: AddedToken("<|vision_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
159
+ 151655: AddedToken("<|image_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
160
+ 151656: AddedToken("<|video_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
161
+ 151657: AddedToken("<tool_call>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
162
+ 151658: AddedToken("</tool_call>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
163
+ 151659: AddedToken("<|fim_prefix|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
164
+ 151660: AddedToken("<|fim_middle|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
165
+ 151661: AddedToken("<|fim_suffix|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
166
+ 151662: AddedToken("<|fim_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
167
+ 151663: AddedToken("<|repo_name|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
168
+ 151664: AddedToken("<|file_sep|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
169
+ 151665: AddedToken("<tool_response>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
170
+ 151666: AddedToken("</tool_response>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
171
+ 151667: AddedToken("<think>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
172
+ 151668: AddedToken("</think>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
173
+ }
174
+ )
175
+ [titan] 2026-01-02 12:21:13,777 - root - INFO - Loading dataset /mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu:default
176
+ `trust_remote_code` is not supported anymore.
177
+ Please check that the Hugging Face dataset '/mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu' isn't based on a loading script and remove `trust_remote_code`.
178
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
179
+ [titan] 2026-01-02 12:21:13,777 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
180
+ Please check that the Hugging Face dataset '/mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu' isn't based on a loading script and remove `trust_remote_code`.
181
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
182
+ [titan] 2026-01-02 12:21:14,322 - root - INFO - Shuffling the dataset with seed 42
183
+ [titan] 2026-01-02 12:21:14,323 - root - INFO - IterableDataset({
184
+ features: ['text', 'id', 'dump', 'url', 'file_path', 'language', 'language_score', 'token_count', 'score', 'int_score'],
185
+ num_shards: 360
186
+ })
187
+ [titan] 2026-01-02 12:21:14,323 - root - INFO - Building dataloader...
188
+ [titan] 2026-01-02 12:21:14,325 - root - INFO - Loading model config from /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/config.json
189
+ [titan] 2026-01-02 12:21:14,326 - root - INFO - Building model from the config
190
+ GSAConfig {
191
+ "architectures": [
192
+ "GSAForCausalLM"
193
+ ],
194
+ "attn": null,
195
+ "bos_token_id": 151643,
196
+ "clamp_max": null,
197
+ "clamp_min": null,
198
+ "conv_size": 4,
199
+ "dtype": "bfloat16",
200
+ "elementwise_affine": false,
201
+ "eos_token_id": 151645,
202
+ "expand_k": 1,
203
+ "expand_v": 1,
204
+ "feature_map": "swish",
205
+ "fuse_cross_entropy": true,
206
+ "fuse_linear_cross_entropy": false,
207
+ "fuse_norm": true,
208
+ "fuse_swiglu": true,
209
+ "gate_logit_normalizer": 8,
210
+ "hidden_act": "swish",
211
+ "hidden_ratio": 4,
212
+ "hidden_size": 5120,
213
+ "initializer_range": 0.02,
214
+ "intermediate_size": 17408,
215
+ "max_position_embeddings": 40960,
216
+ "model_type": "gsa",
217
+ "norm_eps": 1e-06,
218
+ "num_heads": 40,
219
+ "num_hidden_layers": 40,
220
+ "num_kv_heads": 8,
221
+ "num_slots": 256,
222
+ "rope_theta": 1000000,
223
+ "share_conv_kernel": true,
224
+ "tie_word_embeddings": true,
225
+ "transformers_version": "4.57.3",
226
+ "use_cache": true,
227
+ "use_l2warp": false,
228
+ "use_norm": true,
229
+ "use_output_gate": true,
230
+ "use_rope": false,
231
+ "use_short_conv": false,
232
+ "vocab_size": 151936
233
+ }
234
+ 
235
+ [titan] 2026-01-02 12:21:14,481 - root - INFO - 
236
+ GSAForCausalLM(
237
+ (model): GSAModel(
238
+ (embeddings): Embedding(151936, 5120)
239
+ (layers): ModuleList(
240
+ (0-39): 40 x GSABlock(
241
+ (attn_norm): RMSNorm(5120, eps=1e-06)
242
+ (attn): GatedSlotAttention(
243
+ (feature_map): SwishFeatureMap()
244
+ (q_proj): Linear(in_features=5120, out_features=5120, bias=False)
245
+ (k_proj): Linear(in_features=5120, out_features=1024, bias=False)
246
+ (v_proj): Linear(in_features=5120, out_features=1024, bias=False)
247
+ (f_proj): Linear(in_features=5120, out_features=2048, bias=False)
248
+ (g_norm): RMSNorm(5120, elementwise_affine=False, eps=1e-06)
249
+ (o_proj): Linear(in_features=5120, out_features=5120, bias=False)
250
+ )
251
+ (mlp_norm): RMSNorm(5120, eps=1e-06)
252
+ (mlp): GatedMLP(
253
+ (gate_proj): Linear(in_features=5120, out_features=17408, bias=False)
254
+ (up_proj): Linear(in_features=5120, out_features=17408, bias=False)
255
+ (down_proj): Linear(in_features=17408, out_features=5120, bias=False)
256
+ (swiglu_linear): SwiGLULinear()
257
+ )
258
+ )
259
+ )
260
+ (norm): RMSNorm(5120, eps=1e-06)
261
+ )
262
+ (lm_head): Linear(in_features=5120, out_features=151936, bias=False)
263
+ )
264
+
265
+ [titan] 2026-01-02 12:21:14,538 - root - INFO - Compiling each block with torch.compile
266
+ [titan] 2026-01-02 12:21:14,538 - root - INFO - Compiling the embedding, norm, and lm_head layers with torch.compile
267
+ [titan] 2026-01-02 12:21:14,539 - root - INFO - Compiling the entire model with torch.compile
268
+ [titan] 2026-01-02 12:21:14,678 - root - INFO - Applied FSDP to the model
269
+ [titan] 2026-01-02 12:21:15,136 - root - INFO - CUDA memory usage for model: 3.56GiB(4.49%)
270
+ [titan] 2026-01-02 12:21:15,182 - root - INFO - Checkpointing active. Checkpoints will be loaded from and saved to /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/checkpoint
271
+ [titan] 2026-01-02 12:21:15,183 - root - INFO - Loading the checkpoint from /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/checkpoint/step-0.
272
+ [titan] 2026-01-02 12:24:11,330 - root - INFO - [GC] GC collection for checkpoint loading. 0.01 seconds.
273
+ [titan] 2026-01-02 12:24:11,331 - root - INFO - Finished loading the checkpoint in 176.15 seconds.
274
+ [titan] 2026-01-02 12:24:11,349 - root - INFO - CUDA capacity: NVIDIA A100-SXM4-80GB with 79.25GiB memory
275
+ [titan] 2026-01-02 12:24:11,351 - root - WARNING - Error running lspci: [Errno 2] No such file or directory: 'lspci', fallback to use device_name
276
+ [titan] 2026-01-02 12:24:11,619 - root - INFO - Mixed precision training is handled by fully_shard
277
+ [titan] 2026-01-02 12:24:11,620 - root - INFO - ***** Running training *****
278
+ [titan] 2026-01-02 12:24:11,620 - root - INFO -  Training starts at step 1
279
+ [titan] 2026-01-02 12:24:11,620 - root - INFO -  Number of tokens per sequence = 2,048
280
+ [titan] 2026-01-02 12:24:11,620 - root - INFO -  Gradient Accumulation steps = 16
281
+ [titan] 2026-01-02 12:24:11,620 - root - INFO -  Instantaneous batch size (per device) = 2
282
+ [titan] 2026-01-02 12:24:11,620 - root - INFO -  Global batch size (w. parallel, distributed & accumulation) = 256 (524,288 tokens)
283
+ [titan] 2026-01-02 12:24:11,620 - root - INFO -  Total optimization steps = 30,720 (16,106,127,360 tokens)
284
+ [titan] 2026-01-02 12:24:11,620 - root - INFO -  Warmup steps = 1,024 (536,870,912 tokens)
285
+ [titan] 2026-01-02 12:24:11,620 - root - INFO -  Number of parameters = 14,409,815,040 
286
+ [titan] 2026-01-02 12:24:11,620 - root - INFO - Profiling active. Traces will be saved at /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/profile_trace
287
+ /mnt/scratch/home/carlos/leman-labs/flame/.venv/lib/python3.13/site-packages/torch/_dynamo/variables/functions.py:1692: UserWarning: Dynamo detected a call to a `functools.lru_cache`-wrapped function. Dynamo ignores the cache wrapper and directly traces the wrapped function. Silent incorrectness is only a *potential* risk, not something we have observed. Enable TORCH_LOGS="+dynamo" for a DEBUG stack trace.
288
+ torch._dynamo.utils.warn_once(msg)
289
+ /mnt/scratch/home/carlos/leman-labs/flame/.venv/lib/python3.13/site-packages/torch/_dynamo/variables/functions.py:1598: UserWarning: Dynamo does not know how to trace the builtin `cuda_utils.get_device_properties.` This function is either a Python builtin (e.g. _warnings.warn) or a third-party C/C++ Python extension (perhaps created with pybind).
290
+ If it is a Python builtin, please file an issue on GitHub so the PyTorch team can add support for it and see the next case for a workaround.
291
+ If it is a third-party C/C++ Python extension, please either wrap it into a PyTorch-understood custom operator (see https://pytorch.org/tutorials/advanced/custom_ops_landing_page.html for more details) or, if it is traceable, use `torch.compiler.allow_in_graph`.
292
+ torch._dynamo.utils.warn_once(explanation + "\n" + "\n".join(hints))
293
+ [titan] 2026-01-02 12:31:16,002 - root - INFO - step: 1 loss: 14.3857 memory: 65.22GiB(82.29%) tps: 154 tflops: 14.12 mfu: 4.53%
294
+ [titan] 2026-01-02 12:31:16,003 - root - INFO - lr: 7.8125e-07 gnorm: 129.00 [ 0:07:04<150 days, 23:34:08]
295
+ [titan] 2026-01-02 12:31:16,003 - root - INFO - Saving the checkpoint (or staging if async is enabled).
296
+ [titan] 2026-01-02 12:31:46,019 - root - INFO - [GC] GC collection invoked by checkpointer. 0.63 seconds.
297
+ [titan] 2026-01-02 12:31:46,019 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 30.02 seconds.
298
+ [titan] 2026-01-02 12:31:46,020 - root - INFO - Synchronizing and adjusting timeout for all ProcessGroups to 0:01:40
299
+ [titan] 2026-01-02 12:32:27,288 - root - INFO - step: 2 loss: 14.3989 memory: 69.11GiB(87.20%) tps: 919 tflops: 84.11 mfu: 26.96%
300
+ [titan] 2026-01-02 12:32:27,288 - root - INFO - lr: 1.1719e-06 gnorm: 127.00 [ 0:08:15<88 days, 3:51:12]
301
+ [titan] 2026-01-02 12:33:08,759 - root - INFO - step: 3 loss: 14.3929 memory: 69.11GiB(87.20%) tps: 1,580 tflops: 144.59 mfu: 46.34%
302
+ [titan] 2026-01-02 12:33:08,759 - root - INFO - lr: 1.5625e-06 gnorm: 126.00 [ 0:08:57<63 days, 16:28:18]
303
+ [titan] 2026-01-02 12:33:50,300 - root - INFO - step: 4 loss: 14.2932 memory: 69.11GiB(87.20%) tps: 1,578 tflops: 144.34 mfu: 46.26%
304
+ [titan] 2026-01-02 12:33:50,300 - root - INFO - lr: 1.9531e-06 gnorm: 128.00 [ 0:09:38<51 days, 10:55:33]
305
+ [titan] 2026-01-02 12:34:31,898 - root - INFO - step: 5 loss: 14.2689 memory: 69.11GiB(87.20%) tps: 1,575 tflops: 144.14 mfu: 46.20%
306
+ [titan] 2026-01-02 12:34:31,898 - root - INFO - lr: 2.3438e-06 gnorm: 124.00 [ 0:10:20<44 days, 2:53:22]
307
+ [titan] 2026-01-02 12:35:13,509 - root - INFO - step: 6 loss: 13.9979 memory: 69.11GiB(87.20%) tps: 1,575 tflops: 144.10 mfu: 46.19%
308
+ [titan] 2026-01-02 12:35:13,509 - root - INFO - lr: 2.7344e-06 gnorm: 117.00 [ 0:11:02<39 days, 5:32:51]
309
+ [titan] 2026-01-02 12:35:55,155 - root - INFO - step: 7 loss: 13.8167 memory: 69.11GiB(87.20%) tps: 1,574 tflops: 143.98 mfu: 46.15%
310
+ [titan] 2026-01-02 12:35:55,155 - root - INFO - lr: 3.1250e-06 gnorm: 113.00 [ 0:11:43<35 days, 17:46:15]
311
+ [titan] 2026-01-02 12:36:36,792 - root - INFO - step: 8 loss: 13.5683 memory: 69.11GiB(87.20%) tps: 1,574 tflops: 144.01 mfu: 46.16%
312
+ [titan] 2026-01-02 12:36:36,792 - root - INFO - lr: 3.5156e-06 gnorm: 106.50 [ 0:12:25<33 days, 2:55:33]
313
+ [titan] 2026-01-02 12:37:18,455 - root - INFO - step: 9 loss: 13.3760 memory: 69.11GiB(87.20%) tps: 1,573 tflops: 143.92 mfu: 46.13%
314
+ [titan] 2026-01-02 12:37:18,455 - root - INFO - lr: 3.9063e-06 gnorm: 101.00 [ 0:13:07<31 days, 2:04:08]
315
+ [titan] 2026-01-02 12:38:00,109 - root - INFO - step: 10 loss: 13.1097 memory: 69.11GiB(87.20%) tps: 1,573 tflops: 143.95 mfu: 46.14%
316
+ [titan] 2026-01-02 12:38:00,110 - root - INFO - lr: 4.2969e-06 gnorm: 94.50 [ 0:13:48<29 days, 10:58:23]
317
+ [titan] 2026-01-02 12:38:41,790 - root - INFO - step: 11 loss: 12.5536 memory: 69.11GiB(87.20%) tps: 1,572 tflops: 143.86 mfu: 46.11%
318
+ [titan] 2026-01-02 12:38:41,790 - root - INFO - lr: 4.6875e-06 gnorm: 82.00 [ 0:14:30<28 days, 3:00:14]
319
+ [titan] 2026-01-02 12:39:23,448 - root - INFO - step: 12 loss: 12.0247 memory: 69.11GiB(87.20%) tps: 1,573 tflops: 143.93 mfu: 46.13%
320
+ [titan] 2026-01-02 12:39:23,449 - root - INFO - lr: 5.0781e-06 gnorm: 71.50 [ 0:15:12<27 days, 0:20:44]
321
+ [titan] 2026-01-02 12:40:05,132 - root - INFO - step: 13 loss: 11.6076 memory: 69.11GiB(87.20%) tps: 1,572 tflops: 143.85 mfu: 46.11%
322
+ [titan] 2026-01-02 12:40:05,132 - root - INFO - lr: 5.4687e-06 gnorm: 68.50 [ 0:15:53<26 days, 1:48:10]
323
+ [titan] 2026-01-02 12:40:46,797 - root - INFO - step: 14 loss: 11.2488 memory: 69.11GiB(87.20%) tps: 1,573 tflops: 143.91 mfu: 46.13%
324
+ [titan] 2026-01-02 12:40:46,797 - root - INFO - lr: 5.8594e-06 gnorm: 63.75 [ 0:16:35<25 days, 6:28:04]
325
+ [titan] 2026-01-02 12:41:28,477 - root - INFO - step: 15 loss: 10.9254 memory: 69.11GiB(87.20%) tps: 1,572 tflops: 143.86 mfu: 46.11%
326
+ [titan] 2026-01-02 12:41:28,477 - root - INFO - lr: 6.2500e-06 gnorm: 55.50 [ 0:17:17<24 days, 13:43:03]
327
+ [titan] 2026-01-02 12:42:10,134 - root - INFO - step: 16 loss: 10.6961 memory: 69.11GiB(87.20%) tps: 1,573 tflops: 143.94 mfu: 46.13%
328
+ [titan] 2026-01-02 12:42:10,134 - root - INFO - lr: 6.6406e-06 gnorm: 56.50 [ 0:17:58<23 days, 23:02:51]
329
+ [titan] 2026-01-02 12:42:51,815 - root - INFO - step: 17 loss: 10.3915 memory: 69.11GiB(87.20%) tps: 1,572 tflops: 143.86 mfu: 46.11%
330
+ [titan] 2026-01-02 12:42:51,815 - root - INFO - lr: 7.0313e-06 gnorm: 42.75 [ 0:18:40<23 days, 10:06:51]
331
+ [titan] 2026-01-02 12:43:33,461 - root - INFO - step: 18 loss: 10.1740 memory: 69.11GiB(87.20%) tps: 1,574 tflops: 143.98 mfu: 46.15%
332
+ [titan] 2026-01-02 12:43:33,461 - root - INFO - lr: 7.4219e-06 gnorm: 32.75 [ 0:19:22<22 days, 22:35:58]
logs/none_rci5peh0/attempt_0/6/stdout.log ADDED
File without changes
logs/none_rci5peh0/attempt_0/7/stderr.log ADDED
@@ -0,0 +1,332 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [titan] 2026-01-02 12:21:12,073 - root - INFO - Starting job: default job
2
+ [titan] 2026-01-02 12:21:12,073 - root - INFO - {
3
+ "activation_checkpoint": {
4
+ "mode": "none",
5
+ "selective_ac_option": "2"
6
+ },
7
+ "activation_offload": {
8
+ "mode": "none"
9
+ },
10
+ "checkpoint": {
11
+ "async_mode": "disabled",
12
+ "create_seed_checkpoint": false,
13
+ "enable_checkpoint": true,
14
+ "exclude_from_loading": [],
15
+ "export_dtype": "float32",
16
+ "folder": "checkpoint",
17
+ "initial_load_model_weights_only": true,
18
+ "initial_load_path": null,
19
+ "interval": 3072,
20
+ "interval_type": "steps",
21
+ "keep_latest_k": 0,
22
+ "last_save_model_weights_only": false,
23
+ "load_step": -1,
24
+ "model_weights_only": false
25
+ },
26
+ "comm": {
27
+ "init_timeout_seconds": 300,
28
+ "trace_buf_size": 20000,
29
+ "train_timeout_seconds": 100
30
+ },
31
+ "experimental": {
32
+ "context_parallel_degree": 1,
33
+ "context_parallel_rotate_method": "allgather",
34
+ "custom_model_path": "",
35
+ "enable_async_tensor_parallel": false,
36
+ "enable_compiled_autograd": false,
37
+ "pipeline_parallel_degree": 1,
38
+ "pipeline_parallel_microbatches": null,
39
+ "pipeline_parallel_schedule": "1F1B",
40
+ "pipeline_parallel_schedule_csv": "",
41
+ "pipeline_parallel_split_points": []
42
+ },
43
+ "fault_tolerance": {
44
+ "enable": false,
45
+ "group_size": 0,
46
+ "min_replica_size": 1,
47
+ "replica_id": 0
48
+ },
49
+ "float8": {
50
+ "enable_fsdp_float8_all_gather": false,
51
+ "force_recompute_fp8_weight_in_bwd": false,
52
+ "precompute_float8_dynamic_scale_for_fsdp": false,
53
+ "recipe_name": null
54
+ },
55
+ "job": {
56
+ "config_file": "flame/models/fla.toml",
57
+ "description": "default job",
58
+ "dump_folder": "/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B",
59
+ "print_args": true,
60
+ "use_for_integration_test": false
61
+ },
62
+ "lr_scheduler": {
63
+ "decay_ratio": null,
64
+ "decay_type": "cosine",
65
+ "lr_min": 0.1,
66
+ "warmup_steps": 1024
67
+ },
68
+ "memory_estimation": {
69
+ "disable_fake_mode": false,
70
+ "enabled": false
71
+ },
72
+ "metrics": {
73
+ "disable_color_printing": false,
74
+ "enable_tensorboard": false,
75
+ "enable_wandb": true,
76
+ "log_freq": 1,
77
+ "save_for_all_ranks": false,
78
+ "save_tb_folder": "tb"
79
+ },
80
+ "model": {
81
+ "config": "/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/config.json",
82
+ "converters": [],
83
+ "name": "fla",
84
+ "print_after_conversion": false,
85
+ "tokenizer_path": "/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B"
86
+ },
87
+ "optimizer": {
88
+ "beta1": 0.9,
89
+ "beta2": 0.95,
90
+ "early_step_in_backward": false,
91
+ "eps": 1e-15,
92
+ "implementation": "fused",
93
+ "lr": 0.0004,
94
+ "name": "AdamW",
95
+ "weight_decay": 0.1
96
+ },
97
+ "profiling": {
98
+ "enable_memory_snapshot": false,
99
+ "enable_profiling": true,
100
+ "profile_freq": 512,
101
+ "save_memory_snapshot_folder": "memory_snapshot",
102
+ "save_traces_folder": "profile_trace"
103
+ },
104
+ "training": {
105
+ "batch_size": 2,
106
+ "compile": true,
107
+ "context_len": 2048,
108
+ "data_dir": null,
109
+ "data_files": null,
110
+ "data_parallel_replicate_degree": 1,
111
+ "data_parallel_shard_degree": 8,
112
+ "data_probs": null,
113
+ "dataset": "/mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu",
114
+ "dataset_name": "default",
115
+ "dataset_split": "train",
116
+ "deterministic": false,
117
+ "disable_loss_parallel": true,
118
+ "enable_cpu_offload": false,
119
+ "fsdp_reshard_after_forward": "default",
120
+ "gc_freq": 50,
121
+ "gradient_accumulation_steps": 16,
122
+ "max_norm": 1.0,
123
+ "mixed_precision_param": "bfloat16",
124
+ "mixed_precision_reduce": "float32",
125
+ "num_workers": 8,
126
+ "persistent_workers": false,
127
+ "pin_memory": false,
128
+ "prefetch_factor": 2,
129
+ "seed": 42,
130
+ "seq_len": 2048,
131
+ "skip_nan_inf": true,
132
+ "steps": 30720,
133
+ "streaming": true,
134
+ "tensor_parallel_degree": 1,
135
+ "varlen": false
136
+ }
137
+ }
138
+ [titan] 2026-01-02 12:21:12,074 - root - INFO - [GC] Initial GC collection. 0.00 seconds.
139
+ [titan] 2026-01-02 12:21:13,346 - root - WARNING - ENV[TORCH_NCCL_ASYNC_ERROR_HANDLING] = 1 will be overridden to 3 based on job config
140
+ [titan] 2026-01-02 12:21:13,353 - root - INFO - CUDA capacity: NVIDIA A100-SXM4-80GB with 79.25GiB memory
141
+ [titan] 2026-01-02 12:21:13,355 - root - WARNING - Error running lspci: [Errno 2] No such file or directory: 'lspci', fallback to use device_name
142
+ [titan] 2026-01-02 12:21:13,355 - root - INFO - Peak FLOPS used for computing MFU: 3.120e+14
143
+ [titan] 2026-01-02 12:21:13,355 - root - INFO - Building 1-D device mesh with ['dp_shard'], [8]
144
+ [titan] 2026-01-02 12:21:13,423 - root - INFO - Loading tokenizer...
145
+ The tokenizer you are loading from '/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.
146
+ [titan] 2026-01-02 12:21:13,766 - root - INFO - Qwen2TokenizerFast(name_or_path='/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B', vocab_size=151643, model_max_length=10000000000, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'eos_token': '<|im_end|>', 'pad_token': '<|endoftext|>', 'additional_special_tokens': ['<|im_start|>', '<|im_end|>', '<|object_ref_start|>', '<|object_ref_end|>', '<|box_start|>', '<|box_end|>', '<|quad_start|>', '<|quad_end|>', '<|vision_start|>', '<|vision_end|>', '<|vision_pad|>', '<|image_pad|>', '<|video_pad|>']}, clean_up_tokenization_spaces=False, added_tokens_decoder={
147
+ 151643: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
148
+ 151644: AddedToken("<|im_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
149
+ 151645: AddedToken("<|im_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
150
+ 151646: AddedToken("<|object_ref_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
151
+ 151647: AddedToken("<|object_ref_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
152
+ 151648: AddedToken("<|box_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
153
+ 151649: AddedToken("<|box_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
154
+ 151650: AddedToken("<|quad_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
155
+ 151651: AddedToken("<|quad_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
156
+ 151652: AddedToken("<|vision_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
157
+ 151653: AddedToken("<|vision_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
158
+ 151654: AddedToken("<|vision_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
159
+ 151655: AddedToken("<|image_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
160
+ 151656: AddedToken("<|video_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
161
+ 151657: AddedToken("<tool_call>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
162
+ 151658: AddedToken("</tool_call>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
163
+ 151659: AddedToken("<|fim_prefix|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
164
+ 151660: AddedToken("<|fim_middle|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
165
+ 151661: AddedToken("<|fim_suffix|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
166
+ 151662: AddedToken("<|fim_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
167
+ 151663: AddedToken("<|repo_name|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
168
+ 151664: AddedToken("<|file_sep|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
169
+ 151665: AddedToken("<tool_response>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
170
+ 151666: AddedToken("</tool_response>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
171
+ 151667: AddedToken("<think>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
172
+ 151668: AddedToken("</think>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
173
+ }
174
+ )
175
+ [titan] 2026-01-02 12:21:13,766 - root - INFO - Loading dataset /mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu:default
176
+ `trust_remote_code` is not supported anymore.
177
+ Please check that the Hugging Face dataset '/mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu' isn't based on a loading script and remove `trust_remote_code`.
178
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
179
+ [titan] 2026-01-02 12:21:13,766 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
180
+ Please check that the Hugging Face dataset '/mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu' isn't based on a loading script and remove `trust_remote_code`.
181
+ If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
182
+ [titan] 2026-01-02 12:21:14,324 - root - INFO - Shuffling the dataset with seed 42
183
+ [titan] 2026-01-02 12:21:14,326 - root - INFO - IterableDataset({
184
+ features: ['text', 'id', 'dump', 'url', 'file_path', 'language', 'language_score', 'token_count', 'score', 'int_score'],
185
+ num_shards: 360
186
+ })
187
+ [titan] 2026-01-02 12:21:14,326 - root - INFO - Building dataloader...
188
+ [titan] 2026-01-02 12:21:14,328 - root - INFO - Loading model config from /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/config.json
189
+ [titan] 2026-01-02 12:21:14,329 - root - INFO - Building model from the config
190
+ GSAConfig {
191
+ "architectures": [
192
+ "GSAForCausalLM"
193
+ ],
194
+ "attn": null,
195
+ "bos_token_id": 151643,
196
+ "clamp_max": null,
197
+ "clamp_min": null,
198
+ "conv_size": 4,
199
+ "dtype": "bfloat16",
200
+ "elementwise_affine": false,
201
+ "eos_token_id": 151645,
202
+ "expand_k": 1,
203
+ "expand_v": 1,
204
+ "feature_map": "swish",
205
+ "fuse_cross_entropy": true,
206
+ "fuse_linear_cross_entropy": false,
207
+ "fuse_norm": true,
208
+ "fuse_swiglu": true,
209
+ "gate_logit_normalizer": 8,
210
+ "hidden_act": "swish",
211
+ "hidden_ratio": 4,
212
+ "hidden_size": 5120,
213
+ "initializer_range": 0.02,
214
+ "intermediate_size": 17408,
215
+ "max_position_embeddings": 40960,
216
+ "model_type": "gsa",
217
+ "norm_eps": 1e-06,
218
+ "num_heads": 40,
219
+ "num_hidden_layers": 40,
220
+ "num_kv_heads": 8,
221
+ "num_slots": 256,
222
+ "rope_theta": 1000000,
223
+ "share_conv_kernel": true,
224
+ "tie_word_embeddings": true,
225
+ "transformers_version": "4.57.3",
226
+ "use_cache": true,
227
+ "use_l2warp": false,
228
+ "use_norm": true,
229
+ "use_output_gate": true,
230
+ "use_rope": false,
231
+ "use_short_conv": false,
232
+ "vocab_size": 151936
233
+ }
234
+ 
235
+ [titan] 2026-01-02 12:21:14,481 - root - INFO - 
236
+ GSAForCausalLM(
237
+ (model): GSAModel(
238
+ (embeddings): Embedding(151936, 5120)
239
+ (layers): ModuleList(
240
+ (0-39): 40 x GSABlock(
241
+ (attn_norm): RMSNorm(5120, eps=1e-06)
242
+ (attn): GatedSlotAttention(
243
+ (feature_map): SwishFeatureMap()
244
+ (q_proj): Linear(in_features=5120, out_features=5120, bias=False)
245
+ (k_proj): Linear(in_features=5120, out_features=1024, bias=False)
246
+ (v_proj): Linear(in_features=5120, out_features=1024, bias=False)
247
+ (f_proj): Linear(in_features=5120, out_features=2048, bias=False)
248
+ (g_norm): RMSNorm(5120, elementwise_affine=False, eps=1e-06)
249
+ (o_proj): Linear(in_features=5120, out_features=5120, bias=False)
250
+ )
251
+ (mlp_norm): RMSNorm(5120, eps=1e-06)
252
+ (mlp): GatedMLP(
253
+ (gate_proj): Linear(in_features=5120, out_features=17408, bias=False)
254
+ (up_proj): Linear(in_features=5120, out_features=17408, bias=False)
255
+ (down_proj): Linear(in_features=17408, out_features=5120, bias=False)
256
+ (swiglu_linear): SwiGLULinear()
257
+ )
258
+ )
259
+ )
260
+ (norm): RMSNorm(5120, eps=1e-06)
261
+ )
262
+ (lm_head): Linear(in_features=5120, out_features=151936, bias=False)
263
+ )
264
+
265
+ [titan] 2026-01-02 12:21:14,539 - root - INFO - Compiling each block with torch.compile
266
+ [titan] 2026-01-02 12:21:14,539 - root - INFO - Compiling the embedding, norm, and lm_head layers with torch.compile
267
+ [titan] 2026-01-02 12:21:14,540 - root - INFO - Compiling the entire model with torch.compile
268
+ [titan] 2026-01-02 12:21:14,677 - root - INFO - Applied FSDP to the model
269
+ [titan] 2026-01-02 12:21:15,135 - root - INFO - CUDA memory usage for model: 3.56GiB(4.49%)
270
+ [titan] 2026-01-02 12:21:15,182 - root - INFO - Checkpointing active. Checkpoints will be loaded from and saved to /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/checkpoint
271
+ [titan] 2026-01-02 12:21:15,182 - root - INFO - Loading the checkpoint from /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/checkpoint/step-0.
272
+ [titan] 2026-01-02 12:24:11,329 - root - INFO - [GC] GC collection for checkpoint loading. 0.01 seconds.
273
+ [titan] 2026-01-02 12:24:11,330 - root - INFO - Finished loading the checkpoint in 176.15 seconds.
274
+ [titan] 2026-01-02 12:24:11,346 - root - INFO - CUDA capacity: NVIDIA A100-SXM4-80GB with 79.25GiB memory
275
+ [titan] 2026-01-02 12:24:11,348 - root - WARNING - Error running lspci: [Errno 2] No such file or directory: 'lspci', fallback to use device_name
276
+ [titan] 2026-01-02 12:24:11,536 - root - INFO - Mixed precision training is handled by fully_shard
277
+ [titan] 2026-01-02 12:24:11,537 - root - INFO - ***** Running training *****
278
+ [titan] 2026-01-02 12:24:11,537 - root - INFO -  Training starts at step 1
279
+ [titan] 2026-01-02 12:24:11,537 - root - INFO -  Number of tokens per sequence = 2,048
280
+ [titan] 2026-01-02 12:24:11,537 - root - INFO -  Gradient Accumulation steps = 16
281
+ [titan] 2026-01-02 12:24:11,537 - root - INFO -  Instantaneous batch size (per device) = 2
282
+ [titan] 2026-01-02 12:24:11,537 - root - INFO -  Global batch size (w. parallel, distributed & accumulation) = 256 (524,288 tokens)
283
+ [titan] 2026-01-02 12:24:11,537 - root - INFO -  Total optimization steps = 30,720 (16,106,127,360 tokens)
284
+ [titan] 2026-01-02 12:24:11,537 - root - INFO -  Warmup steps = 1,024 (536,870,912 tokens)
285
+ [titan] 2026-01-02 12:24:11,537 - root - INFO -  Number of parameters = 14,409,815,040 
286
+ [titan] 2026-01-02 12:24:11,537 - root - INFO - Profiling active. Traces will be saved at /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/profile_trace
287
+ /mnt/scratch/home/carlos/leman-labs/flame/.venv/lib/python3.13/site-packages/torch/_dynamo/variables/functions.py:1692: UserWarning: Dynamo detected a call to a `functools.lru_cache`-wrapped function. Dynamo ignores the cache wrapper and directly traces the wrapped function. Silent incorrectness is only a *potential* risk, not something we have observed. Enable TORCH_LOGS="+dynamo" for a DEBUG stack trace.
288
+ torch._dynamo.utils.warn_once(msg)
289
+ /mnt/scratch/home/carlos/leman-labs/flame/.venv/lib/python3.13/site-packages/torch/_dynamo/variables/functions.py:1598: UserWarning: Dynamo does not know how to trace the builtin `cuda_utils.get_device_properties.` This function is either a Python builtin (e.g. _warnings.warn) or a third-party C/C++ Python extension (perhaps created with pybind).
290
+ If it is a Python builtin, please file an issue on GitHub so the PyTorch team can add support for it and see the next case for a workaround.
291
+ If it is a third-party C/C++ Python extension, please either wrap it into a PyTorch-understood custom operator (see https://pytorch.org/tutorials/advanced/custom_ops_landing_page.html for more details) or, if it is traceable, use `torch.compiler.allow_in_graph`.
292
+ torch._dynamo.utils.warn_once(explanation + "\n" + "\n".join(hints))
293
+ [titan] 2026-01-02 12:31:16,002 - root - INFO - step: 1 loss: 14.3857 memory: 65.22GiB(82.29%) tps: 154 tflops: 14.12 mfu: 4.53%
294
+ [titan] 2026-01-02 12:31:16,003 - root - INFO - lr: 7.8125e-07 gnorm: 129.00 [ 0:07:04<150 days, 23:35:39]
295
+ [titan] 2026-01-02 12:31:16,003 - root - INFO - Saving the checkpoint (or staging if async is enabled).
296
+ [titan] 2026-01-02 12:31:45,855 - root - INFO - [GC] GC collection invoked by checkpointer. 0.48 seconds.
297
+ [titan] 2026-01-02 12:31:45,855 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 29.85 seconds.
298
+ [titan] 2026-01-02 12:31:45,855 - root - INFO - Synchronizing and adjusting timeout for all ProcessGroups to 0:01:40
299
+ [titan] 2026-01-02 12:32:27,287 - root - INFO - step: 2 loss: 14.3989 memory: 69.11GiB(87.20%) tps: 919 tflops: 84.11 mfu: 26.96%
300
+ [titan] 2026-01-02 12:32:27,288 - root - INFO - lr: 1.1719e-06 gnorm: 127.00 [ 0:08:15<88 days, 3:51:56]
301
+ [titan] 2026-01-02 12:33:08,758 - root - INFO - step: 3 loss: 14.3929 memory: 69.11GiB(87.20%) tps: 1,580 tflops: 144.59 mfu: 46.34%
302
+ [titan] 2026-01-02 12:33:08,759 - root - INFO - lr: 1.5625e-06 gnorm: 126.00 [ 0:08:57<63 days, 16:28:48]
303
+ [titan] 2026-01-02 12:33:50,300 - root - INFO - step: 4 loss: 14.2932 memory: 69.11GiB(87.20%) tps: 1,578 tflops: 144.34 mfu: 46.26%
304
+ [titan] 2026-01-02 12:33:50,300 - root - INFO - lr: 1.9531e-06 gnorm: 128.00 [ 0:09:38<51 days, 10:55:57]
305
+ [titan] 2026-01-02 12:34:31,898 - root - INFO - step: 5 loss: 14.2689 memory: 69.11GiB(87.20%) tps: 1,575 tflops: 144.14 mfu: 46.20%
306
+ [titan] 2026-01-02 12:34:31,898 - root - INFO - lr: 2.3438e-06 gnorm: 124.00 [ 0:10:20<44 days, 2:53:42]
307
+ [titan] 2026-01-02 12:35:13,509 - root - INFO - step: 6 loss: 13.9979 memory: 69.11GiB(87.20%) tps: 1,575 tflops: 144.10 mfu: 46.19%
308
+ [titan] 2026-01-02 12:35:13,509 - root - INFO - lr: 2.7344e-06 gnorm: 117.00 [ 0:11:02<39 days, 5:33:08]
309
+ [titan] 2026-01-02 12:35:55,155 - root - INFO - step: 7 loss: 13.8167 memory: 69.11GiB(87.20%) tps: 1,574 tflops: 143.98 mfu: 46.15%
310
+ [titan] 2026-01-02 12:35:55,155 - root - INFO - lr: 3.1250e-06 gnorm: 113.00 [ 0:11:43<35 days, 17:46:29]
311
+ [titan] 2026-01-02 12:36:36,792 - root - INFO - step: 8 loss: 13.5683 memory: 69.11GiB(87.20%) tps: 1,574 tflops: 144.01 mfu: 46.16%
312
+ [titan] 2026-01-02 12:36:36,792 - root - INFO - lr: 3.5156e-06 gnorm: 106.50 [ 0:12:25<33 days, 2:55:45]
313
+ [titan] 2026-01-02 12:37:18,455 - root - INFO - step: 9 loss: 13.3760 memory: 69.11GiB(87.20%) tps: 1,573 tflops: 143.92 mfu: 46.13%
314
+ [titan] 2026-01-02 12:37:18,455 - root - INFO - lr: 3.9063e-06 gnorm: 101.00 [ 0:13:07<31 days, 2:04:18]
315
+ [titan] 2026-01-02 12:38:00,109 - root - INFO - step: 10 loss: 13.1097 memory: 69.11GiB(87.20%) tps: 1,573 tflops: 143.95 mfu: 46.14%
316
+ [titan] 2026-01-02 12:38:00,109 - root - INFO - lr: 4.2969e-06 gnorm: 94.50 [ 0:13:48<29 days, 10:58:33]
317
+ [titan] 2026-01-02 12:38:41,790 - root - INFO - step: 11 loss: 12.5536 memory: 69.11GiB(87.20%) tps: 1,572 tflops: 143.86 mfu: 46.11%
318
+ [titan] 2026-01-02 12:38:41,790 - root - INFO - lr: 4.6875e-06 gnorm: 82.00 [ 0:14:30<28 days, 3:00:22]
319
+ [titan] 2026-01-02 12:39:23,448 - root - INFO - step: 12 loss: 12.0247 memory: 69.11GiB(87.20%) tps: 1,573 tflops: 143.93 mfu: 46.13%
320
+ [titan] 2026-01-02 12:39:23,448 - root - INFO - lr: 5.0781e-06 gnorm: 71.50 [ 0:15:12<27 days, 0:20:52]
321
+ [titan] 2026-01-02 12:40:05,132 - root - INFO - step: 13 loss: 11.6076 memory: 69.11GiB(87.20%) tps: 1,572 tflops: 143.85 mfu: 46.10%
322
+ [titan] 2026-01-02 12:40:05,132 - root - INFO - lr: 5.4687e-06 gnorm: 68.50 [ 0:15:53<26 days, 1:48:18]
323
+ [titan] 2026-01-02 12:40:46,797 - root - INFO - step: 14 loss: 11.2488 memory: 69.11GiB(87.20%) tps: 1,573 tflops: 143.91 mfu: 46.13%
324
+ [titan] 2026-01-02 12:40:46,797 - root - INFO - lr: 5.8594e-06 gnorm: 63.75 [ 0:16:35<25 days, 6:28:11]
325
+ [titan] 2026-01-02 12:41:28,477 - root - INFO - step: 15 loss: 10.9254 memory: 69.11GiB(87.20%) tps: 1,572 tflops: 143.86 mfu: 46.11%
326
+ [titan] 2026-01-02 12:41:28,477 - root - INFO - lr: 6.2500e-06 gnorm: 55.50 [ 0:17:17<24 days, 13:43:10]
327
+ [titan] 2026-01-02 12:42:10,134 - root - INFO - step: 16 loss: 10.6961 memory: 69.11GiB(87.20%) tps: 1,573 tflops: 143.94 mfu: 46.13%
328
+ [titan] 2026-01-02 12:42:10,134 - root - INFO - lr: 6.6406e-06 gnorm: 56.50 [ 0:17:58<23 days, 23:02:58]
329
+ [titan] 2026-01-02 12:42:51,815 - root - INFO - step: 17 loss: 10.3915 memory: 69.11GiB(87.20%) tps: 1,572 tflops: 143.86 mfu: 46.11%
330
+ [titan] 2026-01-02 12:42:51,815 - root - INFO - lr: 7.0313e-06 gnorm: 42.75 [ 0:18:40<23 days, 10:06:57]
331
+ [titan] 2026-01-02 12:43:33,461 - root - INFO - step: 18 loss: 10.1740 memory: 69.11GiB(87.20%) tps: 1,574 tflops: 143.98 mfu: 46.15%
332
+ [titan] 2026-01-02 12:43:33,461 - root - INFO - lr: 7.4219e-06 gnorm: 32.75 [ 0:19:22<22 days, 22:36:05]
logs/none_rci5peh0/attempt_0/7/stdout.log ADDED
File without changes