Add files using upload-large-folder tool
Browse files- .gitattributes +1 -0
- checkpoint/step-0/.metadata +0 -0
- checkpoint/step-144/__5_0.distcp +3 -0
- logs/none_4cvjdbqa/attempt_0/0/stderr.log +621 -0
- logs/none_4cvjdbqa/attempt_0/0/stdout.log +0 -0
- logs/none_4cvjdbqa/attempt_0/1/stderr.log +620 -0
- logs/none_4cvjdbqa/attempt_0/1/stdout.log +0 -0
- logs/none_4cvjdbqa/attempt_0/2/stderr.log +620 -0
- logs/none_4cvjdbqa/attempt_0/2/stdout.log +0 -0
- logs/none_4cvjdbqa/attempt_0/3/stderr.log +620 -0
- logs/none_4cvjdbqa/attempt_0/3/stdout.log +0 -0
- logs/none_4cvjdbqa/attempt_0/4/stderr.log +620 -0
- logs/none_4cvjdbqa/attempt_0/4/stdout.log +0 -0
- logs/none_4cvjdbqa/attempt_0/5/stderr.log +620 -0
- logs/none_4cvjdbqa/attempt_0/5/stdout.log +0 -0
- logs/none_4cvjdbqa/attempt_0/6/stderr.log +620 -0
- logs/none_4cvjdbqa/attempt_0/6/stdout.log +0 -0
- logs/none_4cvjdbqa/attempt_0/7/stderr.log +620 -0
- logs/none_4cvjdbqa/attempt_0/7/stdout.log +0 -0
- logs/none_rci5peh0/attempt_0/0/stderr.log +333 -0
- logs/none_rci5peh0/attempt_0/1/stderr.log +332 -0
- logs/none_rci5peh0/attempt_0/1/stdout.log +0 -0
- logs/none_rci5peh0/attempt_0/2/stderr.log +332 -0
- logs/none_rci5peh0/attempt_0/2/stdout.log +0 -0
- logs/none_rci5peh0/attempt_0/3/stderr.log +332 -0
- logs/none_rci5peh0/attempt_0/3/stdout.log +0 -0
- logs/none_rci5peh0/attempt_0/4/stderr.log +332 -0
- logs/none_rci5peh0/attempt_0/4/stdout.log +0 -0
- logs/none_rci5peh0/attempt_0/5/stderr.log +332 -0
- logs/none_rci5peh0/attempt_0/5/stdout.log +0 -0
- logs/none_rci5peh0/attempt_0/6/stderr.log +332 -0
- logs/none_rci5peh0/attempt_0/6/stdout.log +0 -0
- logs/none_rci5peh0/attempt_0/7/stderr.log +332 -0
- logs/none_rci5peh0/attempt_0/7/stdout.log +0 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
checkpoint/step-144/__5_0.distcp filter=lfs diff=lfs merge=lfs -text
|
checkpoint/step-0/.metadata
ADDED
|
Binary file (92.5 kB). View file
|
|
|
checkpoint/step-144/__5_0.distcp
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:827419ab5b73e2042a48d1d0919abe6b8333a781c34797c15ad4cdff61a7e322
|
| 3 |
+
size 11004728080
|
logs/none_4cvjdbqa/attempt_0/0/stderr.log
ADDED
|
@@ -0,0 +1,621 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[titan] 2026-01-06 20:23:28,613 - root - INFO - Starting job: default job
|
| 2 |
+
[titan] 2026-01-06 20:23:28,614 - root - INFO - [32m{
|
| 3 |
+
"activation_checkpoint": {
|
| 4 |
+
"mode": "none",
|
| 5 |
+
"selective_ac_option": "2"
|
| 6 |
+
},
|
| 7 |
+
"activation_offload": {
|
| 8 |
+
"mode": "none"
|
| 9 |
+
},
|
| 10 |
+
"checkpoint": {
|
| 11 |
+
"async_mode": "disabled",
|
| 12 |
+
"create_seed_checkpoint": false,
|
| 13 |
+
"enable_checkpoint": true,
|
| 14 |
+
"exclude_from_loading": [],
|
| 15 |
+
"export_dtype": "float32",
|
| 16 |
+
"folder": "checkpoint",
|
| 17 |
+
"initial_load_model_weights_only": true,
|
| 18 |
+
"initial_load_path": null,
|
| 19 |
+
"interval": 16,
|
| 20 |
+
"interval_type": "steps",
|
| 21 |
+
"keep_latest_k": 0,
|
| 22 |
+
"last_save_model_weights_only": false,
|
| 23 |
+
"load_step": -1,
|
| 24 |
+
"model_weights_only": false
|
| 25 |
+
},
|
| 26 |
+
"comm": {
|
| 27 |
+
"init_timeout_seconds": 300,
|
| 28 |
+
"trace_buf_size": 20000,
|
| 29 |
+
"train_timeout_seconds": 100
|
| 30 |
+
},
|
| 31 |
+
"experimental": {
|
| 32 |
+
"context_parallel_degree": 1,
|
| 33 |
+
"context_parallel_rotate_method": "allgather",
|
| 34 |
+
"custom_model_path": "",
|
| 35 |
+
"enable_async_tensor_parallel": false,
|
| 36 |
+
"enable_compiled_autograd": false,
|
| 37 |
+
"pipeline_parallel_degree": 1,
|
| 38 |
+
"pipeline_parallel_microbatches": null,
|
| 39 |
+
"pipeline_parallel_schedule": "1F1B",
|
| 40 |
+
"pipeline_parallel_schedule_csv": "",
|
| 41 |
+
"pipeline_parallel_split_points": []
|
| 42 |
+
},
|
| 43 |
+
"fault_tolerance": {
|
| 44 |
+
"enable": false,
|
| 45 |
+
"group_size": 0,
|
| 46 |
+
"min_replica_size": 1,
|
| 47 |
+
"replica_id": 0
|
| 48 |
+
},
|
| 49 |
+
"float8": {
|
| 50 |
+
"enable_fsdp_float8_all_gather": false,
|
| 51 |
+
"force_recompute_fp8_weight_in_bwd": false,
|
| 52 |
+
"precompute_float8_dynamic_scale_for_fsdp": false,
|
| 53 |
+
"recipe_name": null
|
| 54 |
+
},
|
| 55 |
+
"job": {
|
| 56 |
+
"config_file": "flame/models/fla.toml",
|
| 57 |
+
"description": "default job",
|
| 58 |
+
"dump_folder": "/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B",
|
| 59 |
+
"print_args": true,
|
| 60 |
+
"use_for_integration_test": false
|
| 61 |
+
},
|
| 62 |
+
"lr_scheduler": {
|
| 63 |
+
"decay_ratio": null,
|
| 64 |
+
"decay_type": "cosine",
|
| 65 |
+
"lr_min": 0.1,
|
| 66 |
+
"warmup_steps": 1024
|
| 67 |
+
},
|
| 68 |
+
"memory_estimation": {
|
| 69 |
+
"disable_fake_mode": false,
|
| 70 |
+
"enabled": false
|
| 71 |
+
},
|
| 72 |
+
"metrics": {
|
| 73 |
+
"disable_color_printing": false,
|
| 74 |
+
"enable_tensorboard": false,
|
| 75 |
+
"enable_wandb": true,
|
| 76 |
+
"log_freq": 1,
|
| 77 |
+
"save_for_all_ranks": false,
|
| 78 |
+
"save_tb_folder": "tb"
|
| 79 |
+
},
|
| 80 |
+
"model": {
|
| 81 |
+
"config": "/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/config.json",
|
| 82 |
+
"converters": [],
|
| 83 |
+
"name": "fla",
|
| 84 |
+
"print_after_conversion": false,
|
| 85 |
+
"tokenizer_path": "/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B"
|
| 86 |
+
},
|
| 87 |
+
"optimizer": {
|
| 88 |
+
"beta1": 0.9,
|
| 89 |
+
"beta2": 0.95,
|
| 90 |
+
"early_step_in_backward": false,
|
| 91 |
+
"eps": 1e-15,
|
| 92 |
+
"implementation": "fused",
|
| 93 |
+
"lr": 0.0004,
|
| 94 |
+
"name": "AdamW",
|
| 95 |
+
"weight_decay": 0.1
|
| 96 |
+
},
|
| 97 |
+
"profiling": {
|
| 98 |
+
"enable_memory_snapshot": false,
|
| 99 |
+
"enable_profiling": true,
|
| 100 |
+
"profile_freq": 512,
|
| 101 |
+
"save_memory_snapshot_folder": "memory_snapshot",
|
| 102 |
+
"save_traces_folder": "profile_trace"
|
| 103 |
+
},
|
| 104 |
+
"training": {
|
| 105 |
+
"batch_size": 2,
|
| 106 |
+
"compile": true,
|
| 107 |
+
"context_len": 2048,
|
| 108 |
+
"data_dir": null,
|
| 109 |
+
"data_files": null,
|
| 110 |
+
"data_parallel_replicate_degree": 1,
|
| 111 |
+
"data_parallel_shard_degree": 8,
|
| 112 |
+
"data_probs": null,
|
| 113 |
+
"dataset": "/mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu",
|
| 114 |
+
"dataset_name": "default",
|
| 115 |
+
"dataset_split": "train",
|
| 116 |
+
"deterministic": false,
|
| 117 |
+
"disable_loss_parallel": true,
|
| 118 |
+
"enable_cpu_offload": false,
|
| 119 |
+
"fsdp_reshard_after_forward": "default",
|
| 120 |
+
"gc_freq": 50,
|
| 121 |
+
"gradient_accumulation_steps": 16,
|
| 122 |
+
"max_norm": 1.0,
|
| 123 |
+
"mixed_precision_param": "bfloat16",
|
| 124 |
+
"mixed_precision_reduce": "float32",
|
| 125 |
+
"num_workers": 8,
|
| 126 |
+
"persistent_workers": false,
|
| 127 |
+
"pin_memory": false,
|
| 128 |
+
"prefetch_factor": 2,
|
| 129 |
+
"seed": 42,
|
| 130 |
+
"seq_len": 2048,
|
| 131 |
+
"skip_nan_inf": true,
|
| 132 |
+
"steps": 3072,
|
| 133 |
+
"streaming": true,
|
| 134 |
+
"tensor_parallel_degree": 1,
|
| 135 |
+
"varlen": false
|
| 136 |
+
}
|
| 137 |
+
}[39m
|
| 138 |
+
[titan] 2026-01-06 20:23:28,615 - root - INFO - [GC] Initial GC collection. 0.00 seconds.
|
| 139 |
+
[titan] 2026-01-06 20:23:30,017 - root - WARNING - ENV[TORCH_NCCL_ASYNC_ERROR_HANDLING] = 1 will be overridden to 3 based on job config
|
| 140 |
+
[titan] 2026-01-06 20:23:30,025 - root - INFO - CUDA capacity: NVIDIA A100-SXM4-80GB with 79.25GiB memory
|
| 141 |
+
[titan] 2026-01-06 20:23:30,028 - root - WARNING - Error running lspci: [Errno 2] No such file or directory: 'lspci', fallback to use device_name
|
| 142 |
+
[titan] 2026-01-06 20:23:30,029 - root - INFO - Peak FLOPS used for computing MFU: 3.120e+14
|
| 143 |
+
[titan] 2026-01-06 20:23:30,029 - root - INFO - Building 1-D device mesh with ['dp_shard'], [8]
|
| 144 |
+
[titan] 2026-01-06 20:23:30,053 - root - INFO - Loading tokenizer...
|
| 145 |
+
The tokenizer you are loading from '/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.
|
| 146 |
+
[titan] 2026-01-06 20:23:30,412 - root - INFO - Qwen2TokenizerFast(name_or_path='/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B', vocab_size=151643, model_max_length=10000000000, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'eos_token': '<|im_end|>', 'pad_token': '<|endoftext|>', 'additional_special_tokens': ['<|im_start|>', '<|im_end|>', '<|object_ref_start|>', '<|object_ref_end|>', '<|box_start|>', '<|box_end|>', '<|quad_start|>', '<|quad_end|>', '<|vision_start|>', '<|vision_end|>', '<|vision_pad|>', '<|image_pad|>', '<|video_pad|>']}, clean_up_tokenization_spaces=False, added_tokens_decoder={
|
| 147 |
+
151643: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 148 |
+
151644: AddedToken("<|im_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 149 |
+
151645: AddedToken("<|im_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 150 |
+
151646: AddedToken("<|object_ref_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 151 |
+
151647: AddedToken("<|object_ref_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 152 |
+
151648: AddedToken("<|box_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 153 |
+
151649: AddedToken("<|box_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 154 |
+
151650: AddedToken("<|quad_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 155 |
+
151651: AddedToken("<|quad_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 156 |
+
151652: AddedToken("<|vision_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 157 |
+
151653: AddedToken("<|vision_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 158 |
+
151654: AddedToken("<|vision_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 159 |
+
151655: AddedToken("<|image_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 160 |
+
151656: AddedToken("<|video_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 161 |
+
151657: AddedToken("<tool_call>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 162 |
+
151658: AddedToken("</tool_call>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 163 |
+
151659: AddedToken("<|fim_prefix|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 164 |
+
151660: AddedToken("<|fim_middle|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 165 |
+
151661: AddedToken("<|fim_suffix|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 166 |
+
151662: AddedToken("<|fim_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 167 |
+
151663: AddedToken("<|repo_name|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 168 |
+
151664: AddedToken("<|file_sep|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 169 |
+
151665: AddedToken("<tool_response>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 170 |
+
151666: AddedToken("</tool_response>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 171 |
+
151667: AddedToken("<think>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 172 |
+
151668: AddedToken("</think>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 173 |
+
}
|
| 174 |
+
)
|
| 175 |
+
[titan] 2026-01-06 20:23:30,412 - root - INFO - Loading dataset /mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu:default
|
| 176 |
+
`trust_remote_code` is not supported anymore.
|
| 177 |
+
Please check that the Hugging Face dataset '/mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu' isn't based on a loading script and remove `trust_remote_code`.
|
| 178 |
+
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
|
| 179 |
+
[titan] 2026-01-06 20:23:30,412 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
|
| 180 |
+
Please check that the Hugging Face dataset '/mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu' isn't based on a loading script and remove `trust_remote_code`.
|
| 181 |
+
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
|
| 182 |
+
[titan] 2026-01-06 20:23:31,008 - root - INFO - Shuffling the dataset with seed 42
|
| 183 |
+
[titan] 2026-01-06 20:23:31,010 - root - INFO - IterableDataset({
|
| 184 |
+
features: ['text', 'id', 'dump', 'url', 'file_path', 'language', 'language_score', 'token_count', 'score', 'int_score'],
|
| 185 |
+
num_shards: 360
|
| 186 |
+
})
|
| 187 |
+
[titan] 2026-01-06 20:23:31,010 - root - INFO - Building dataloader...
|
| 188 |
+
[titan] 2026-01-06 20:23:31,012 - root - INFO - Loading model config from /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/config.json
|
| 189 |
+
[titan] 2026-01-06 20:23:31,014 - root - INFO - Building model from the config
|
| 190 |
+
[32mGSAConfig {
|
| 191 |
+
"architectures": [
|
| 192 |
+
"GSAForCausalLM"
|
| 193 |
+
],
|
| 194 |
+
"attn": null,
|
| 195 |
+
"bos_token_id": 151643,
|
| 196 |
+
"clamp_max": null,
|
| 197 |
+
"clamp_min": null,
|
| 198 |
+
"conv_size": 4,
|
| 199 |
+
"dtype": "bfloat16",
|
| 200 |
+
"elementwise_affine": false,
|
| 201 |
+
"eos_token_id": 151645,
|
| 202 |
+
"expand_k": 1,
|
| 203 |
+
"expand_v": 1,
|
| 204 |
+
"feature_map": "swish",
|
| 205 |
+
"fuse_cross_entropy": true,
|
| 206 |
+
"fuse_linear_cross_entropy": false,
|
| 207 |
+
"fuse_norm": true,
|
| 208 |
+
"fuse_swiglu": true,
|
| 209 |
+
"gate_logit_normalizer": 8,
|
| 210 |
+
"hidden_act": "swish",
|
| 211 |
+
"hidden_ratio": 4,
|
| 212 |
+
"hidden_size": 5120,
|
| 213 |
+
"initializer_range": 0.02,
|
| 214 |
+
"intermediate_size": 17408,
|
| 215 |
+
"max_position_embeddings": 40960,
|
| 216 |
+
"model_type": "gsa",
|
| 217 |
+
"norm_eps": 1e-06,
|
| 218 |
+
"num_heads": 40,
|
| 219 |
+
"num_hidden_layers": 40,
|
| 220 |
+
"num_kv_heads": 8,
|
| 221 |
+
"num_slots": 256,
|
| 222 |
+
"rope_theta": 1000000,
|
| 223 |
+
"share_conv_kernel": true,
|
| 224 |
+
"tie_word_embeddings": true,
|
| 225 |
+
"transformers_version": "4.57.3",
|
| 226 |
+
"use_cache": true,
|
| 227 |
+
"use_l2warp": false,
|
| 228 |
+
"use_norm": true,
|
| 229 |
+
"use_output_gate": true,
|
| 230 |
+
"use_rope": false,
|
| 231 |
+
"use_short_conv": false,
|
| 232 |
+
"vocab_size": 151936
|
| 233 |
+
}
|
| 234 |
+
[39m
|
| 235 |
+
[titan] 2026-01-06 20:23:31,167 - root - INFO - [34m
|
| 236 |
+
GSAForCausalLM(
|
| 237 |
+
(model): GSAModel(
|
| 238 |
+
(embeddings): Embedding(151936, 5120)
|
| 239 |
+
(layers): ModuleList(
|
| 240 |
+
(0-39): 40 x GSABlock(
|
| 241 |
+
(attn_norm): RMSNorm(5120, eps=1e-06)
|
| 242 |
+
(attn): GatedSlotAttention(
|
| 243 |
+
(feature_map): SwishFeatureMap()
|
| 244 |
+
(q_proj): Linear(in_features=5120, out_features=5120, bias=False)
|
| 245 |
+
(k_proj): Linear(in_features=5120, out_features=1024, bias=False)
|
| 246 |
+
(v_proj): Linear(in_features=5120, out_features=1024, bias=False)
|
| 247 |
+
(f_proj): Linear(in_features=5120, out_features=2048, bias=False)
|
| 248 |
+
(g_norm): RMSNorm(5120, elementwise_affine=False, eps=1e-06)
|
| 249 |
+
(o_proj): Linear(in_features=5120, out_features=5120, bias=False)
|
| 250 |
+
)
|
| 251 |
+
(mlp_norm): RMSNorm(5120, eps=1e-06)
|
| 252 |
+
(mlp): GatedMLP(
|
| 253 |
+
(gate_proj): Linear(in_features=5120, out_features=17408, bias=False)
|
| 254 |
+
(up_proj): Linear(in_features=5120, out_features=17408, bias=False)
|
| 255 |
+
(down_proj): Linear(in_features=17408, out_features=5120, bias=False)
|
| 256 |
+
(swiglu_linear): SwiGLULinear()
|
| 257 |
+
)
|
| 258 |
+
)
|
| 259 |
+
)
|
| 260 |
+
(norm): RMSNorm(5120, eps=1e-06)
|
| 261 |
+
)
|
| 262 |
+
(lm_head): Linear(in_features=5120, out_features=151936, bias=False)
|
| 263 |
+
)[39m
|
| 264 |
+
|
| 265 |
+
[titan] 2026-01-06 20:23:31,229 - root - INFO - Compiling each block with torch.compile
|
| 266 |
+
[titan] 2026-01-06 20:23:31,229 - root - INFO - Compiling the embedding, norm, and lm_head layers with torch.compile
|
| 267 |
+
[titan] 2026-01-06 20:23:31,230 - root - INFO - Compiling the entire model with torch.compile
|
| 268 |
+
[titan] 2026-01-06 20:23:31,359 - root - INFO - Applied FSDP to the model
|
| 269 |
+
[titan] 2026-01-06 20:23:31,736 - root - INFO - CUDA memory usage for model: 3.56GiB(4.49%)
|
| 270 |
+
[titan] 2026-01-06 20:23:31,786 - root - INFO - Checkpointing active. Checkpoints will be loaded from and saved to /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/checkpoint
|
| 271 |
+
[titan] 2026-01-06 20:23:31,786 - root - INFO - Loading the checkpoint from /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/checkpoint/step-1.
|
| 272 |
+
[titan] 2026-01-06 20:24:20,481 - root - INFO - [GC] GC collection for checkpoint loading. 0.03 seconds.
|
| 273 |
+
[titan] 2026-01-06 20:24:20,481 - root - INFO - Finished loading the checkpoint in 48.69 seconds.
|
| 274 |
+
[titan] 2026-01-06 20:24:25,312 - root - ERROR - Failed to create WandB logger: No API key configured. Use `wandb login` to log in.
|
| 275 |
+
[titan] 2026-01-06 20:24:25,344 - root - INFO - CUDA capacity: NVIDIA A100-SXM4-80GB with 79.25GiB memory
|
| 276 |
+
[titan] 2026-01-06 20:24:25,346 - root - WARNING - Error running lspci: [Errno 2] No such file or directory: 'lspci', fallback to use device_name
|
| 277 |
+
[titan] 2026-01-06 20:24:27,453 - root - INFO - Mixed precision training is handled by fully_shard
|
| 278 |
+
[titan] 2026-01-06 20:24:27,454 - root - INFO - [31m***** Running training *****[39m
|
| 279 |
+
[titan] 2026-01-06 20:24:27,454 - root - INFO - [32m Training starts at step 2
|
| 280 |
+
[titan] 2026-01-06 20:24:27,454 - root - INFO - [32m Number of tokens per sequence = 2,048
|
| 281 |
+
[titan] 2026-01-06 20:24:27,454 - root - INFO - [32m Gradient Accumulation steps = 16
|
| 282 |
+
[titan] 2026-01-06 20:24:27,454 - root - INFO - [32m Instantaneous batch size (per device) = 2
|
| 283 |
+
[titan] 2026-01-06 20:24:27,454 - root - INFO - [32m Global batch size (w. parallel, distributed & accumulation) = 256 (524,288 tokens)
|
| 284 |
+
[titan] 2026-01-06 20:24:27,454 - root - INFO - [32m Total optimization steps = 3,072 (1,610,612,736 tokens)
|
| 285 |
+
[titan] 2026-01-06 20:24:27,454 - root - INFO - [32m Warmup steps = 1,024 (536,870,912 tokens)
|
| 286 |
+
[titan] 2026-01-06 20:24:27,454 - root - INFO - [32m Number of parameters = 14,409,815,040 [39m
|
| 287 |
+
[titan] 2026-01-06 20:24:27,454 - root - INFO - Profiling active. Traces will be saved at /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/profile_trace
|
| 288 |
+
/mnt/scratch/home/carlos/leman-labs/flame/.venv/lib/python3.13/site-packages/torch/_dynamo/variables/functions.py:1692: UserWarning: Dynamo detected a call to a `functools.lru_cache`-wrapped function. Dynamo ignores the cache wrapper and directly traces the wrapped function. Silent incorrectness is only a *potential* risk, not something we have observed. Enable TORCH_LOGS="+dynamo" for a DEBUG stack trace.
|
| 289 |
+
torch._dynamo.utils.warn_once(msg)
|
| 290 |
+
/mnt/scratch/home/carlos/leman-labs/flame/.venv/lib/python3.13/site-packages/torch/_dynamo/variables/functions.py:1598: UserWarning: Dynamo does not know how to trace the builtin `cuda_utils.get_device_properties.` This function is either a Python builtin (e.g. _warnings.warn) or a third-party C/C++ Python extension (perhaps created with pybind).
|
| 291 |
+
If it is a Python builtin, please file an issue on GitHub so the PyTorch team can add support for it and see the next case for a workaround.
|
| 292 |
+
If it is a third-party C/C++ Python extension, please either wrap it into a PyTorch-understood custom operator (see https://pytorch.org/tutorials/advanced/custom_ops_landing_page.html for more details) or, if it is traceable, use `torch.compiler.allow_in_graph`.
|
| 293 |
+
torch._dynamo.utils.warn_once(explanation + "\n" + "\n".join(hints))
|
| 294 |
+
[titan] 2026-01-06 20:31:17,558 - root - INFO - [31mstep: 2 [32mloss: 14.3989 [33mmemory: 71.94GiB(90.77%) [34mtps: 159 [36mtflops: 14.55 [35mmfu: 4.66%[39m
|
| 295 |
+
[titan] 2026-01-06 20:31:17,558 - root - INFO - [34mlr: 1.1719e-06 gnorm: 127.00 [35m[ 0:13:56<14 days, 20:49:44][39m
|
| 296 |
+
[titan] 2026-01-06 20:31:58,854 - root - INFO - [31mstep: 3 [32mloss: 14.3925 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,587 [36mtflops: 145.20 [35mmfu: 46.54%[39m
|
| 297 |
+
[titan] 2026-01-06 20:31:58,854 - root - INFO - [34mlr: 1.5625e-06 gnorm: 126.00 [35m[ 0:14:38<10 days, 9:32:35][39m
|
| 298 |
+
[titan] 2026-01-06 20:32:40,204 - root - INFO - [31mstep: 4 [32mloss: 14.2932 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,585 [36mtflops: 145.01 [35mmfu: 46.48%[39m
|
| 299 |
+
[titan] 2026-01-06 20:32:40,204 - root - INFO - [34mlr: 1.9531e-06 gnorm: 125.50 [35m[ 0:15:19<8 days, 3:54:22][39m
|
| 300 |
+
[titan] 2026-01-06 20:33:21,589 - root - INFO - [31mstep: 5 [32mloss: 14.2679 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,584 [36mtflops: 144.88 [35mmfu: 46.44%[39m
|
| 301 |
+
[titan] 2026-01-06 20:33:21,589 - root - INFO - [34mlr: 2.3438e-06 gnorm: 123.50 [35m[ 0:16:00<6 days, 19:43:31][39m
|
| 302 |
+
[titan] 2026-01-06 20:34:03,035 - root - INFO - [31mstep: 6 [32mloss: 13.9921 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,581 [36mtflops: 144.67 [35mmfu: 46.37%[39m
|
| 303 |
+
[titan] 2026-01-06 20:34:03,035 - root - INFO - [34mlr: 2.7344e-06 gnorm: 117.50 [35m[ 0:16:42<5 days, 22:16:34][39m
|
| 304 |
+
[titan] 2026-01-06 20:34:44,524 - root - INFO - [31mstep: 7 [32mloss: 13.8102 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,580 [36mtflops: 144.52 [35mmfu: 46.32%[39m
|
| 305 |
+
[titan] 2026-01-06 20:34:44,524 - root - INFO - [34mlr: 3.1250e-06 gnorm: 112.50 [35m[ 0:17:23<5 days, 6:57:26][39m
|
| 306 |
+
[titan] 2026-01-06 20:35:25,989 - root - INFO - [31mstep: 8 [32mloss: 13.5609 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,581 [36mtflops: 144.60 [35mmfu: 46.35%[39m
|
| 307 |
+
[titan] 2026-01-06 20:35:25,990 - root - INFO - [34mlr: 3.5156e-06 gnorm: 106.50 [35m[ 0:18:05<4 days, 19:27:46][39m
|
| 308 |
+
[titan] 2026-01-06 20:36:07,480 - root - INFO - [31mstep: 9 [32mloss: 13.3683 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,580 [36mtflops: 144.52 [35mmfu: 46.32%[39m
|
| 309 |
+
[titan] 2026-01-06 20:36:07,480 - root - INFO - [34mlr: 3.9063e-06 gnorm: 101.00 [35m[ 0:18:46<4 days, 10:31:21][39m
|
| 310 |
+
[titan] 2026-01-06 20:36:48,975 - root - INFO - [31mstep: 10 [32mloss: 13.1018 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.50 [35mmfu: 46.32%[39m
|
| 311 |
+
[titan] 2026-01-06 20:36:48,975 - root - INFO - [34mlr: 4.2969e-06 gnorm: 94.00 [35m[ 0:19:28<4 days, 3:22:06][39m
|
| 312 |
+
[titan] 2026-01-06 20:37:30,471 - root - INFO - [31mstep: 11 [32mloss: 12.5407 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.50 [35mmfu: 46.31%[39m
|
| 313 |
+
[titan] 2026-01-06 20:37:30,471 - root - INFO - [34mlr: 4.6875e-06 gnorm: 82.00 [35m[ 0:20:09<3 days, 21:30:46][39m
|
| 314 |
+
[titan] 2026-01-06 20:38:11,960 - root - INFO - [31mstep: 12 [32mloss: 12.0106 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,580 [36mtflops: 144.52 [35mmfu: 46.32%[39m
|
| 315 |
+
[titan] 2026-01-06 20:38:11,961 - root - INFO - [34mlr: 5.0781e-06 gnorm: 71.50 [35m[ 0:20:51<3 days, 16:37:51][39m
|
| 316 |
+
[titan] 2026-01-06 20:38:53,462 - root - INFO - [31mstep: 13 [32mloss: 11.5957 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.48 [35mmfu: 46.31%[39m
|
| 317 |
+
[titan] 2026-01-06 20:38:53,463 - root - INFO - [34mlr: 5.4687e-06 gnorm: 68.00 [35m[ 0:21:32<3 days, 12:29:56][39m
|
| 318 |
+
[titan] 2026-01-06 20:39:34,955 - root - INFO - [31mstep: 14 [32mloss: 11.2380 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.51 [35mmfu: 46.32%[39m
|
| 319 |
+
[titan] 2026-01-06 20:39:34,955 - root - INFO - [34mlr: 5.8594e-06 gnorm: 63.25 [35m[ 0:22:14<3 days, 8:57:19][39m
|
| 320 |
+
[titan] 2026-01-06 20:40:16,457 - root - INFO - [31mstep: 15 [32mloss: 10.9153 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.48 [35mmfu: 46.31%[39m
|
| 321 |
+
[titan] 2026-01-06 20:40:16,457 - root - INFO - [34mlr: 6.2500e-06 gnorm: 55.50 [35m[ 0:22:55<3 days, 5:52:59][39m
|
| 322 |
+
[titan] 2026-01-06 20:40:57,974 - root - INFO - [31mstep: 16 [32mloss: 10.6864 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.42 [35mmfu: 46.29%[39m
|
| 323 |
+
[titan] 2026-01-06 20:40:57,974 - root - INFO - [34mlr: 6.6406e-06 gnorm: 57.00 [35m[ 0:23:37<3 days, 3:11:38][39m
|
| 324 |
+
[titan] 2026-01-06 20:40:57,974 - root - INFO - Saving the checkpoint (or staging if async is enabled).
|
| 325 |
+
[titan] 2026-01-06 20:41:27,039 - root - INFO - [GC] GC collection invoked by checkpointer. 0.59 seconds.
|
| 326 |
+
[titan] 2026-01-06 20:41:27,039 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 29.07 seconds.
|
| 327 |
+
[titan] 2026-01-06 20:42:08,985 - root - INFO - [31mstep: 17 [32mloss: 10.3828 [33mmemory: 71.95GiB(90.78%) [34mtps: 923 [36mtflops: 84.44 [35mmfu: 27.06%[39m
|
| 328 |
+
[titan] 2026-01-06 20:42:08,986 - root - INFO - [34mlr: 7.0313e-06 gnorm: 42.50 [35m[ 0:24:48<3 days, 2:17:33][39m
|
| 329 |
+
[titan] 2026-01-06 20:42:50,422 - root - INFO - [31mstep: 18 [32mloss: 10.1659 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,582 [36mtflops: 144.71 [35mmfu: 46.38%[39m
|
| 330 |
+
[titan] 2026-01-06 20:42:50,422 - root - INFO - [34mlr: 7.4219e-06 gnorm: 32.50 [35m[ 0:25:29<3 days, 0:05:42][39m
|
| 331 |
+
[titan] 2026-01-06 20:43:31,924 - root - INFO - [31mstep: 19 [32mloss: 9.9749 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,579 [36mtflops: 144.48 [35mmfu: 46.31%[39m
|
| 332 |
+
[titan] 2026-01-06 20:43:31,925 - root - INFO - [34mlr: 7.8125e-06 gnorm: 26.88 [35m[ 0:26:11<2 days, 22:07:50][39m
|
| 333 |
+
[titan] 2026-01-06 20:44:13,451 - root - INFO - [31mstep: 20 [32mloss: 9.8084 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,578 [36mtflops: 144.39 [35mmfu: 46.28%[39m
|
| 334 |
+
[titan] 2026-01-06 20:44:13,451 - root - INFO - [34mlr: 8.2031e-06 gnorm: 25.62 [35m[ 0:26:52<2 days, 20:21:45][39m
|
| 335 |
+
[titan] 2026-01-06 20:44:54,968 - root - INFO - [31mstep: 21 [32mloss: 9.6201 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,579 [36mtflops: 144.43 [35mmfu: 46.29%[39m
|
| 336 |
+
[titan] 2026-01-06 20:44:54,968 - root - INFO - [34mlr: 8.5938e-06 gnorm: 26.88 [35m[ 0:27:34<2 days, 18:45:40][39m
|
| 337 |
+
[titan] 2026-01-06 20:45:36,491 - root - INFO - [31mstep: 22 [32mloss: 9.4905 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,578 [36mtflops: 144.40 [35mmfu: 46.28%[39m
|
| 338 |
+
[titan] 2026-01-06 20:45:36,491 - root - INFO - [34mlr: 8.9844e-06 gnorm: 25.50 [35m[ 0:28:15<2 days, 17:18:17][39m
|
| 339 |
+
[titan] 2026-01-06 20:46:18,035 - root - INFO - [31mstep: 23 [32mloss: 9.2526 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,578 [36mtflops: 144.33 [35mmfu: 46.26%[39m
|
| 340 |
+
[titan] 2026-01-06 20:46:18,035 - root - INFO - [34mlr: 9.3750e-06 gnorm: 19.12 [35m[ 0:28:57<2 days, 15:58:29][39m
|
| 341 |
+
[titan] 2026-01-06 20:46:59,563 - root - INFO - [31mstep: 24 [32mloss: 9.0528 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,578 [36mtflops: 144.39 [35mmfu: 46.28%[39m
|
| 342 |
+
[titan] 2026-01-06 20:46:59,563 - root - INFO - [34mlr: 9.7656e-06 gnorm: 17.00 [35m[ 0:29:38<2 days, 14:45:14][39m
|
| 343 |
+
[titan] 2026-01-06 20:47:41,099 - root - INFO - [31mstep: 25 [32mloss: 8.8601 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,578 [36mtflops: 144.36 [35mmfu: 46.27%[39m
|
| 344 |
+
[titan] 2026-01-06 20:47:41,099 - root - INFO - [34mlr: 1.0156e-05 gnorm: 14.06 [35m[ 0:30:20<2 days, 13:37:49][39m
|
| 345 |
+
[titan] 2026-01-06 20:48:22,630 - root - INFO - [31mstep: 26 [32mloss: 8.7360 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,578 [36mtflops: 144.38 [35mmfu: 46.27%[39m
|
| 346 |
+
[titan] 2026-01-06 20:48:22,630 - root - INFO - [34mlr: 1.0547e-05 gnorm: 15.44 [35m[ 0:31:01<2 days, 12:35:31][39m
|
| 347 |
+
[titan] 2026-01-06 20:49:04,178 - root - INFO - [31mstep: 27 [32mloss: 8.6182 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,577 [36mtflops: 144.31 [35mmfu: 46.25%[39m
|
| 348 |
+
[titan] 2026-01-06 20:49:04,179 - root - INFO - [34mlr: 1.0937e-05 gnorm: 10.25 [35m[ 0:31:43<2 days, 11:37:49][39m
|
| 349 |
+
[titan] 2026-01-06 20:49:45,725 - root - INFO - [31mstep: 28 [32mloss: 8.5142 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,577 [36mtflops: 144.32 [35mmfu: 46.26%[39m
|
| 350 |
+
[titan] 2026-01-06 20:49:45,725 - root - INFO - [34mlr: 1.1328e-05 gnorm: 9.00 [35m[ 0:32:25<2 days, 10:44:11][39m
|
| 351 |
+
[titan] 2026-01-06 20:50:27,274 - root - INFO - [31mstep: 29 [32mloss: 8.4770 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,577 [36mtflops: 144.31 [35mmfu: 46.25%[39m
|
| 352 |
+
[titan] 2026-01-06 20:50:27,275 - root - INFO - [34mlr: 1.1719e-05 gnorm: 9.44 [35m[ 0:33:06<2 days, 9:54:12][39m
|
| 353 |
+
[titan] 2026-01-06 20:51:08,813 - root - INFO - [31mstep: 30 [32mloss: 8.3888 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,578 [36mtflops: 144.35 [35mmfu: 46.27%[39m
|
| 354 |
+
[titan] 2026-01-06 20:51:08,813 - root - INFO - [34mlr: 1.2109e-05 gnorm: 7.06 [35m[ 0:33:48<2 days, 9:07:29][39m
|
| 355 |
+
[titan] 2026-01-06 20:51:50,370 - root - INFO - [31mstep: 31 [32mloss: 8.3098 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,577 [36mtflops: 144.29 [35mmfu: 46.25%[39m
|
| 356 |
+
[titan] 2026-01-06 20:51:50,370 - root - INFO - [34mlr: 1.2500e-05 gnorm: 5.38 [35m[ 0:34:29<2 days, 8:23:47][39m
|
| 357 |
+
[titan] 2026-01-06 20:52:31,910 - root - INFO - [31mstep: 32 [32mloss: 8.2507 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,578 [36mtflops: 144.35 [35mmfu: 46.26%[39m
|
| 358 |
+
[titan] 2026-01-06 20:52:31,910 - root - INFO - [34mlr: 1.2891e-05 gnorm: 6.97 [35m[ 0:35:11<2 days, 7:42:43][39m
|
| 359 |
+
[titan] 2026-01-06 20:52:31,910 - root - INFO - Saving the checkpoint (or staging if async is enabled).
|
| 360 |
+
[titan] 2026-01-06 20:52:52,181 - root - INFO - [GC] GC collection invoked by checkpointer. 0.19 seconds.
|
| 361 |
+
[titan] 2026-01-06 20:52:52,181 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 20.27 seconds.
|
| 362 |
+
[titan] 2026-01-06 20:53:33,590 - root - INFO - [31mstep: 33 [32mloss: 8.1782 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,063 [36mtflops: 97.21 [35mmfu: 31.16%[39m
|
| 363 |
+
[titan] 2026-01-06 20:53:33,591 - root - INFO - [34mlr: 1.3281e-05 gnorm: 4.94 [35m[ 0:36:12<2 days, 7:35:02][39m
|
| 364 |
+
[titan] 2026-01-06 20:54:15,059 - root - INFO - [31mstep: 34 [32mloss: 8.1399 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,580 [36mtflops: 144.59 [35mmfu: 46.34%[39m
|
| 365 |
+
[titan] 2026-01-06 20:54:15,059 - root - INFO - [34mlr: 1.3672e-05 gnorm: 4.62 [35m[ 0:36:54<2 days, 6:57:38][39m
|
| 366 |
+
[titan] 2026-01-06 20:54:56,546 - root - INFO - [31mstep: 35 [32mloss: 8.1046 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,580 [36mtflops: 144.53 [35mmfu: 46.32%[39m
|
| 367 |
+
[titan] 2026-01-06 20:54:56,546 - root - INFO - [34mlr: 1.4063e-05 gnorm: 4.69 [35m[ 0:37:35<2 days, 6:22:22][39m
|
| 368 |
+
[titan] 2026-01-06 20:55:38,070 - root - INFO - [31mstep: 36 [32mloss: 8.0122 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,578 [36mtflops: 144.40 [35mmfu: 46.28%[39m
|
| 369 |
+
[titan] 2026-01-06 20:55:38,070 - root - INFO - [34mlr: 1.4453e-05 gnorm: 2.75 [35m[ 0:38:17<2 days, 5:49:03][39m
|
| 370 |
+
[titan] 2026-01-06 20:56:19,603 - root - INFO - [31mstep: 37 [32mloss: 8.0874 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 371 |
+
[titan] 2026-01-06 20:56:19,603 - root - INFO - [34mlr: 1.4844e-05 gnorm: 4.84 [35m[ 0:38:58<2 days, 5:17:32][39m
|
| 372 |
+
[titan] 2026-01-06 20:57:01,137 - root - INFO - [31mstep: 38 [32mloss: 8.0173 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 373 |
+
[titan] 2026-01-06 20:57:01,137 - root - INFO - [34mlr: 1.5234e-05 gnorm: 3.98 [35m[ 0:39:40<2 days, 4:47:38][39m
|
| 374 |
+
[titan] 2026-01-06 20:57:42,670 - root - INFO - [31mstep: 39 [32mloss: 8.0002 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 375 |
+
[titan] 2026-01-06 20:57:42,671 - root - INFO - [34mlr: 1.5625e-05 gnorm: 3.81 [35m[ 0:40:21<2 days, 4:19:13][39m
|
| 376 |
+
[titan] 2026-01-06 20:58:24,204 - root - INFO - [31mstep: 40 [32mloss: 7.9606 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 377 |
+
[titan] 2026-01-06 20:58:24,204 - root - INFO - [34mlr: 1.6016e-05 gnorm: 2.86 [35m[ 0:41:03<2 days, 3:52:12][39m
|
| 378 |
+
[titan] 2026-01-06 20:59:05,739 - root - INFO - [31mstep: 41 [32mloss: 7.9773 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,578 [36mtflops: 144.36 [35mmfu: 46.27%[39m
|
| 379 |
+
[titan] 2026-01-06 20:59:05,739 - root - INFO - [34mlr: 1.6406e-05 gnorm: 3.56 [35m[ 0:41:45<2 days, 3:26:28][39m
|
| 380 |
+
[titan] 2026-01-06 20:59:47,255 - root - INFO - [31mstep: 42 [32mloss: 7.9890 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,579 [36mtflops: 144.43 [35mmfu: 46.29%[39m
|
| 381 |
+
[titan] 2026-01-06 20:59:47,256 - root - INFO - [34mlr: 1.6797e-05 gnorm: 4.75 [35m[ 0:42:26<2 days, 3:01:54][39m
|
| 382 |
+
[titan] 2026-01-06 21:00:28,788 - root - INFO - [31mstep: 43 [32mloss: 7.9018 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 383 |
+
[titan] 2026-01-06 21:00:28,789 - root - INFO - [34mlr: 1.7188e-05 gnorm: 3.48 [35m[ 0:43:08<2 days, 2:38:28][39m
|
| 384 |
+
[titan] 2026-01-06 21:01:10,328 - root - INFO - [31mstep: 44 [32mloss: 7.8441 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,578 [36mtflops: 144.35 [35mmfu: 46.26%[39m
|
| 385 |
+
[titan] 2026-01-06 21:01:10,328 - root - INFO - [34mlr: 1.7578e-05 gnorm: 3.89 [35m[ 0:43:49<2 days, 2:16:05][39m
|
| 386 |
+
[titan] 2026-01-06 21:01:51,869 - root - INFO - [31mstep: 45 [32mloss: 7.8679 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,578 [36mtflops: 144.34 [35mmfu: 46.26%[39m
|
| 387 |
+
[titan] 2026-01-06 21:01:51,869 - root - INFO - [34mlr: 1.7969e-05 gnorm: 6.41 [35m[ 0:44:31<2 days, 1:54:39][39m
|
| 388 |
+
[titan] 2026-01-06 21:02:33,408 - root - INFO - [31mstep: 46 [32mloss: 7.7830 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,578 [36mtflops: 144.35 [35mmfu: 46.26%[39m
|
| 389 |
+
[titan] 2026-01-06 21:02:33,409 - root - INFO - [34mlr: 1.8359e-05 gnorm: 3.52 [35m[ 0:45:12<2 days, 1:34:07][39m
|
| 390 |
+
[titan] 2026-01-06 21:03:14,961 - root - INFO - [31mstep: 47 [32mloss: 7.8372 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,577 [36mtflops: 144.30 [35mmfu: 46.25%[39m
|
| 391 |
+
[titan] 2026-01-06 21:03:14,962 - root - INFO - [34mlr: 1.8750e-05 gnorm: 2.22 [35m[ 0:45:54<2 days, 1:14:27][39m
|
| 392 |
+
[titan] 2026-01-06 21:03:56,497 - root - INFO - [31mstep: 48 [32mloss: 7.8147 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,578 [36mtflops: 144.36 [35mmfu: 46.27%[39m
|
| 393 |
+
[titan] 2026-01-06 21:03:56,497 - root - INFO - [34mlr: 1.9141e-05 gnorm: 3.70 [35m[ 0:46:35<2 days, 0:55:34][39m
|
| 394 |
+
[titan] 2026-01-06 21:03:56,497 - root - INFO - Saving the checkpoint (or staging if async is enabled).
|
| 395 |
+
[titan] 2026-01-06 21:04:16,545 - root - INFO - [GC] GC collection invoked by checkpointer. 0.16 seconds.
|
| 396 |
+
[titan] 2026-01-06 21:04:16,545 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 20.05 seconds.
|
| 397 |
+
[titan] 2026-01-06 21:04:57,970 - root - INFO - [31mstep: 49 [32mloss: 7.6970 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,066 [36mtflops: 97.54 [35mmfu: 31.26%[39m
|
| 398 |
+
[titan] 2026-01-06 21:04:57,970 - root - INFO - [34mlr: 1.9531e-05 gnorm: 5.28 [35m[ 0:47:37<2 days, 0:57:54][39m
|
| 399 |
+
[titan] 2026-01-06 21:04:57,982 - root - INFO - [GC] Peforming periodical GC collection. 0.01 seconds.
|
| 400 |
+
[titan] 2026-01-06 21:05:39,421 - root - INFO - [31mstep: 50 [32mloss: 7.7536 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,581 [36mtflops: 144.65 [35mmfu: 46.36%[39m
|
| 401 |
+
[titan] 2026-01-06 21:05:39,422 - root - INFO - [34mlr: 1.9922e-05 gnorm: 4.06 [35m[ 0:48:18<2 days, 0:39:57][39m
|
| 402 |
+
[titan] 2026-01-06 21:06:20,891 - root - INFO - [31mstep: 51 [32mloss: 7.7578 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,580 [36mtflops: 144.59 [35mmfu: 46.34%[39m
|
| 403 |
+
[titan] 2026-01-06 21:06:20,891 - root - INFO - [34mlr: 2.0313e-05 gnorm: 5.03 [35m[ 0:49:00<2 days, 0:22:41][39m
|
| 404 |
+
[titan] 2026-01-06 21:07:02,402 - root - INFO - [31mstep: 52 [32mloss: 7.7586 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,579 [36mtflops: 144.45 [35mmfu: 46.30%[39m
|
| 405 |
+
[titan] 2026-01-06 21:07:02,402 - root - INFO - [34mlr: 2.0703e-05 gnorm: 2.52 [35m[ 0:49:41<2 days, 0:06:06][39m
|
| 406 |
+
[titan] 2026-01-06 21:07:43,930 - root - INFO - [31mstep: 53 [32mloss: 7.7823 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,578 [36mtflops: 144.38 [35mmfu: 46.28%[39m
|
| 407 |
+
[titan] 2026-01-06 21:07:43,931 - root - INFO - [34mlr: 2.1094e-05 gnorm: 11.69 [35m[ 0:50:23<1 day, 23:50:08][39m
|
| 408 |
+
[titan] 2026-01-06 21:08:25,460 - root - INFO - [31mstep: 54 [32mloss: 7.7454 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,578 [36mtflops: 144.38 [35mmfu: 46.28%[39m
|
| 409 |
+
[titan] 2026-01-06 21:08:25,460 - root - INFO - [34mlr: 2.1484e-05 gnorm: 10.25 [35m[ 0:51:04<1 day, 23:34:44][39m
|
| 410 |
+
[titan] 2026-01-06 21:09:07,002 - root - INFO - [31mstep: 55 [32mloss: 7.6959 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,578 [36mtflops: 144.34 [35mmfu: 46.26%[39m
|
| 411 |
+
[titan] 2026-01-06 21:09:07,002 - root - INFO - [34mlr: 2.1875e-05 gnorm: 3.77 [35m[ 0:51:46<1 day, 23:19:53][39m
|
| 412 |
+
[titan] 2026-01-06 21:09:48,535 - root - INFO - [31mstep: 56 [32mloss: 7.7100 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 413 |
+
[titan] 2026-01-06 21:09:48,536 - root - INFO - [34mlr: 2.2266e-05 gnorm: 5.50 [35m[ 0:52:27<1 day, 23:05:32][39m
|
| 414 |
+
[titan] 2026-01-06 21:10:30,084 - root - INFO - [31mstep: 57 [32mloss: 7.6427 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,577 [36mtflops: 144.32 [35mmfu: 46.26%[39m
|
| 415 |
+
[titan] 2026-01-06 21:10:30,084 - root - INFO - [34mlr: 2.2656e-05 gnorm: 3.45 [35m[ 0:53:09<1 day, 22:51:40][39m
|
| 416 |
+
[titan] 2026-01-06 21:11:11,627 - root - INFO - [31mstep: 58 [32mloss: 7.7081 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,578 [36mtflops: 144.33 [35mmfu: 46.26%[39m
|
| 417 |
+
[titan] 2026-01-06 21:11:11,628 - root - INFO - [34mlr: 2.3047e-05 gnorm: 7.88 [35m[ 0:53:50<1 day, 22:38:15][39m
|
| 418 |
+
[titan] 2026-01-06 21:11:53,169 - root - INFO - [31mstep: 59 [32mloss: 7.6955 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,578 [36mtflops: 144.34 [35mmfu: 46.26%[39m
|
| 419 |
+
[titan] 2026-01-06 21:11:53,170 - root - INFO - [34mlr: 2.3438e-05 gnorm: 7.16 [35m[ 0:54:32<1 day, 22:25:16][39m
|
| 420 |
+
[titan] 2026-01-06 21:12:34,708 - root - INFO - [31mstep: 60 [32mloss: 7.6458 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,578 [36mtflops: 144.35 [35mmfu: 46.27%[39m
|
| 421 |
+
[titan] 2026-01-06 21:12:34,709 - root - INFO - [34mlr: 2.3828e-05 gnorm: 3.22 [35m[ 0:55:13<1 day, 22:12:41][39m
|
| 422 |
+
[titan] 2026-01-06 21:13:16,244 - root - INFO - [31mstep: 61 [32mloss: 7.6709 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,578 [36mtflops: 144.36 [35mmfu: 46.27%[39m
|
| 423 |
+
[titan] 2026-01-06 21:13:16,244 - root - INFO - [34mlr: 2.4219e-05 gnorm: 7.56 [35m[ 0:55:55<1 day, 22:00:30][39m
|
| 424 |
+
[titan] 2026-01-06 21:13:57,793 - root - INFO - [31mstep: 62 [32mloss: 7.6777 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,577 [36mtflops: 144.31 [35mmfu: 46.25%[39m
|
| 425 |
+
[titan] 2026-01-06 21:13:57,793 - root - INFO - [34mlr: 2.4609e-05 gnorm: 5.00 [35m[ 0:56:37<1 day, 21:48:42][39m
|
| 426 |
+
[titan] 2026-01-06 21:14:39,339 - root - INFO - [31mstep: 63 [32mloss: 7.6421 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,577 [36mtflops: 144.32 [35mmfu: 46.26%[39m
|
| 427 |
+
[titan] 2026-01-06 21:14:39,340 - root - INFO - [34mlr: 2.5000e-05 gnorm: 6.81 [35m[ 0:57:18<1 day, 21:37:14][39m
|
| 428 |
+
[titan] 2026-01-06 21:15:20,872 - root - INFO - [31mstep: 64 [32mloss: 7.6401 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 429 |
+
[titan] 2026-01-06 21:15:20,872 - root - INFO - [34mlr: 2.5391e-05 gnorm: 6.72 [35m[ 0:58:00<1 day, 21:26:06][39m
|
| 430 |
+
[titan] 2026-01-06 21:15:20,872 - root - INFO - Saving the checkpoint (or staging if async is enabled).
|
| 431 |
+
[titan] 2026-01-06 21:15:41,914 - root - INFO - [GC] GC collection invoked by checkpointer. 0.15 seconds.
|
| 432 |
+
[titan] 2026-01-06 21:15:41,915 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 21.04 seconds.
|
| 433 |
+
[titan] 2026-01-06 21:16:23,249 - root - INFO - [31mstep: 65 [32mloss: 7.6475 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,051 [36mtflops: 96.13 [35mmfu: 30.81%[39m
|
| 434 |
+
[titan] 2026-01-06 21:16:23,249 - root - INFO - [34mlr: 2.5781e-05 gnorm: 5.00 [35m[ 0:59:02<1 day, 21:31:22][39m
|
| 435 |
+
[titan] 2026-01-06 21:17:04,689 - root - INFO - [31mstep: 66 [32mloss: 7.7008 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,581 [36mtflops: 144.69 [35mmfu: 46.38%[39m
|
| 436 |
+
[titan] 2026-01-06 21:17:04,689 - root - INFO - [34mlr: 2.6172e-05 gnorm: 9.69 [35m[ 0:59:43<1 day, 21:20:33][39m
|
| 437 |
+
[titan] 2026-01-06 21:17:46,152 - root - INFO - [31mstep: 67 [32mloss: 7.6772 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,581 [36mtflops: 144.61 [35mmfu: 46.35%[39m
|
| 438 |
+
[titan] 2026-01-06 21:17:46,153 - root - INFO - [34mlr: 2.6563e-05 gnorm: 8.06 [35m[ 1:00:25<1 day, 21:10:02][39m
|
| 439 |
+
[titan] 2026-01-06 21:18:27,650 - root - INFO - [31mstep: 68 [32mloss: 7.6251 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,579 [36mtflops: 144.49 [35mmfu: 46.31%[39m
|
| 440 |
+
[titan] 2026-01-06 21:18:27,651 - root - INFO - [34mlr: 2.6953e-05 gnorm: 7.88 [35m[ 1:01:06<1 day, 20:59:51][39m
|
| 441 |
+
[titan] 2026-01-06 21:19:09,166 - root - INFO - [31mstep: 69 [32mloss: 7.6183 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,579 [36mtflops: 144.43 [35mmfu: 46.29%[39m
|
| 442 |
+
[titan] 2026-01-06 21:19:09,166 - root - INFO - [34mlr: 2.7344e-05 gnorm: 4.00 [35m[ 1:01:48<1 day, 20:49:57][39m
|
| 443 |
+
[titan] 2026-01-06 21:19:50,686 - root - INFO - [31mstep: 70 [32mloss: 7.6535 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,578 [36mtflops: 144.41 [35mmfu: 46.29%[39m
|
| 444 |
+
[titan] 2026-01-06 21:19:50,686 - root - INFO - [34mlr: 2.7734e-05 gnorm: 17.75 [35m[ 1:02:29<1 day, 20:40:19][39m
|
| 445 |
+
[titan] 2026-01-06 21:20:32,220 - root - INFO - [31mstep: 71 [32mloss: 7.6713 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 446 |
+
[titan] 2026-01-06 21:20:32,221 - root - INFO - [34mlr: 2.8125e-05 gnorm: 15.69 [35m[ 1:03:11<1 day, 20:30:57][39m
|
| 447 |
+
[titan] 2026-01-06 21:21:13,759 - root - INFO - [31mstep: 72 [32mloss: 7.5969 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,578 [36mtflops: 144.35 [35mmfu: 46.27%[39m
|
| 448 |
+
[titan] 2026-01-06 21:21:13,759 - root - INFO - [34mlr: 2.8516e-05 gnorm: 5.00 [35m[ 1:03:53<1 day, 20:21:49][39m
|
| 449 |
+
[titan] 2026-01-06 21:21:55,296 - root - INFO - [31mstep: 73 [32mloss: 7.6514 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,578 [36mtflops: 144.35 [35mmfu: 46.27%[39m
|
| 450 |
+
[titan] 2026-01-06 21:21:55,296 - root - INFO - [34mlr: 2.8906e-05 gnorm: 7.84 [35m[ 1:04:34<1 day, 20:12:55][39m
|
| 451 |
+
[titan] 2026-01-06 21:22:36,834 - root - INFO - [31mstep: 74 [32mloss: 7.6118 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,578 [36mtflops: 144.35 [35mmfu: 46.27%[39m
|
| 452 |
+
[titan] 2026-01-06 21:22:36,834 - root - INFO - [34mlr: 2.9297e-05 gnorm: 5.53 [35m[ 1:05:16<1 day, 20:04:15][39m
|
| 453 |
+
[titan] 2026-01-06 21:23:18,373 - root - INFO - [31mstep: 75 [32mloss: 7.6545 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,578 [36mtflops: 144.35 [35mmfu: 46.26%[39m
|
| 454 |
+
[titan] 2026-01-06 21:23:18,374 - root - INFO - [34mlr: 2.9687e-05 gnorm: 14.88 [35m[ 1:05:57<1 day, 19:55:47][39m
|
| 455 |
+
[titan] 2026-01-06 21:23:59,909 - root - INFO - [31mstep: 76 [32mloss: 7.6091 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,578 [36mtflops: 144.36 [35mmfu: 46.27%[39m
|
| 456 |
+
[titan] 2026-01-06 21:23:59,909 - root - INFO - [34mlr: 3.0078e-05 gnorm: 15.25 [35m[ 1:06:39<1 day, 19:47:31][39m
|
| 457 |
+
[titan] 2026-01-06 21:24:41,441 - root - INFO - [31mstep: 77 [32mloss: 7.5815 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 458 |
+
[titan] 2026-01-06 21:24:41,442 - root - INFO - [34mlr: 3.0469e-05 gnorm: 4.84 [35m[ 1:07:20<1 day, 19:39:27][39m
|
| 459 |
+
[titan] 2026-01-06 21:25:22,982 - root - INFO - [31mstep: 78 [32mloss: 7.6119 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,578 [36mtflops: 144.34 [35mmfu: 46.26%[39m
|
| 460 |
+
[titan] 2026-01-06 21:25:22,983 - root - INFO - [34mlr: 3.0859e-05 gnorm: 9.06 [35m[ 1:08:02<1 day, 19:31:35][39m
|
| 461 |
+
[titan] 2026-01-06 21:26:04,516 - root - INFO - [31mstep: 79 [32mloss: 7.6418 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 462 |
+
[titan] 2026-01-06 21:26:04,516 - root - INFO - [34mlr: 3.1250e-05 gnorm: 8.25 [35m[ 1:08:43<1 day, 19:23:53][39m
|
| 463 |
+
[titan] 2026-01-06 21:26:46,049 - root - INFO - [31mstep: 80 [32mloss: 7.5575 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 464 |
+
[titan] 2026-01-06 21:26:46,050 - root - INFO - [34mlr: 3.1641e-05 gnorm: 6.97 [35m[ 1:09:25<1 day, 19:16:22][39m
|
| 465 |
+
[titan] 2026-01-06 21:26:46,050 - root - INFO - Saving the checkpoint (or staging if async is enabled).
|
| 466 |
+
[titan] 2026-01-06 21:27:08,297 - root - INFO - [GC] GC collection invoked by checkpointer. 0.17 seconds.
|
| 467 |
+
[titan] 2026-01-06 21:27:08,298 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 22.25 seconds.
|
| 468 |
+
[titan] 2026-01-06 21:27:49,686 - root - INFO - [31mstep: 81 [32mloss: 7.6005 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,030 [36mtflops: 94.22 [35mmfu: 30.20%[39m
|
| 469 |
+
[titan] 2026-01-06 21:27:49,686 - root - INFO - [34mlr: 3.2031e-05 gnorm: 7.19 [35m[ 1:10:28<1 day, 19:22:37][39m
|
| 470 |
+
[titan] 2026-01-06 21:28:31,108 - root - INFO - [31mstep: 82 [32mloss: 7.5774 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,582 [36mtflops: 144.76 [35mmfu: 46.40%[39m
|
| 471 |
+
[titan] 2026-01-06 21:28:31,108 - root - INFO - [34mlr: 3.2422e-05 gnorm: 5.62 [35m[ 1:11:10<1 day, 19:15:12][39m
|
| 472 |
+
[titan] 2026-01-06 21:29:12,555 - root - INFO - [31mstep: 83 [32mloss: 7.6207 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,581 [36mtflops: 144.67 [35mmfu: 46.37%[39m
|
| 473 |
+
[titan] 2026-01-06 21:29:12,555 - root - INFO - [34mlr: 3.2813e-05 gnorm: 4.69 [35m[ 1:11:51<1 day, 19:07:57][39m
|
| 474 |
+
[titan] 2026-01-06 21:29:54,023 - root - INFO - [31mstep: 84 [32mloss: 7.5734 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,580 [36mtflops: 144.59 [35mmfu: 46.34%[39m
|
| 475 |
+
[titan] 2026-01-06 21:29:54,024 - root - INFO - [34mlr: 3.3203e-05 gnorm: 10.75 [35m[ 1:12:33<1 day, 19:00:52][39m
|
| 476 |
+
[titan] 2026-01-06 21:30:35,519 - root - INFO - [31mstep: 85 [32mloss: 7.5241 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,579 [36mtflops: 144.50 [35mmfu: 46.31%[39m
|
| 477 |
+
[titan] 2026-01-06 21:30:35,520 - root - INFO - [34mlr: 3.3594e-05 gnorm: 8.69 [35m[ 1:13:14<1 day, 18:53:57][39m
|
| 478 |
+
[titan] 2026-01-06 21:31:17,030 - root - INFO - [31mstep: 86 [32mloss: 7.5827 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,579 [36mtflops: 144.45 [35mmfu: 46.30%[39m
|
| 479 |
+
[titan] 2026-01-06 21:31:17,030 - root - INFO - [34mlr: 3.3984e-05 gnorm: 7.22 [35m[ 1:13:56<1 day, 18:47:12][39m
|
| 480 |
+
[titan] 2026-01-06 21:31:58,543 - root - INFO - [31mstep: 87 [32mloss: 7.5505 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,579 [36mtflops: 144.44 [35mmfu: 46.29%[39m
|
| 481 |
+
[titan] 2026-01-06 21:31:58,543 - root - INFO - [34mlr: 3.4375e-05 gnorm: 7.91 [35m[ 1:14:37<1 day, 18:40:35][39m
|
| 482 |
+
[titan] 2026-01-06 21:32:40,071 - root - INFO - [31mstep: 88 [32mloss: 7.5143 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,578 [36mtflops: 144.39 [35mmfu: 46.28%[39m
|
| 483 |
+
[titan] 2026-01-06 21:32:40,071 - root - INFO - [34mlr: 3.4766e-05 gnorm: 8.00 [35m[ 1:15:19<1 day, 18:34:06][39m
|
| 484 |
+
[titan] 2026-01-06 21:33:21,599 - root - INFO - [31mstep: 89 [32mloss: 7.5199 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,578 [36mtflops: 144.39 [35mmfu: 46.28%[39m
|
| 485 |
+
[titan] 2026-01-06 21:33:21,599 - root - INFO - [34mlr: 3.5156e-05 gnorm: 8.62 [35m[ 1:16:00<1 day, 18:27:45][39m
|
| 486 |
+
[titan] 2026-01-06 21:34:03,122 - root - INFO - [31mstep: 90 [32mloss: 7.4785 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,578 [36mtflops: 144.40 [35mmfu: 46.28%[39m
|
| 487 |
+
[titan] 2026-01-06 21:34:03,123 - root - INFO - [34mlr: 3.5547e-05 gnorm: 8.12 [35m[ 1:16:42<1 day, 18:21:32][39m
|
| 488 |
+
[titan] 2026-01-06 21:34:44,655 - root - INFO - [31mstep: 91 [32mloss: 7.5003 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 489 |
+
[titan] 2026-01-06 21:34:44,655 - root - INFO - [34mlr: 3.5937e-05 gnorm: 6.97 [35m[ 1:17:23<1 day, 18:15:26][39m
|
| 490 |
+
[titan] 2026-01-06 21:35:26,183 - root - INFO - [31mstep: 92 [32mloss: 7.5113 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,578 [36mtflops: 144.39 [35mmfu: 46.28%[39m
|
| 491 |
+
[titan] 2026-01-06 21:35:26,183 - root - INFO - [34mlr: 3.6328e-05 gnorm: 10.19 [35m[ 1:18:05<1 day, 18:09:27][39m
|
| 492 |
+
[titan] 2026-01-06 21:36:07,712 - root - INFO - [31mstep: 93 [32mloss: 7.4875 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,578 [36mtflops: 144.38 [35mmfu: 46.28%[39m
|
| 493 |
+
[titan] 2026-01-06 21:36:07,712 - root - INFO - [34mlr: 3.6719e-05 gnorm: 4.59 [35m[ 1:18:46<1 day, 18:03:35][39m
|
| 494 |
+
[titan] 2026-01-06 21:36:49,202 - root - INFO - [31mstep: 94 [32mloss: 7.8691 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,580 [36mtflops: 144.52 [35mmfu: 46.32%[39m
|
| 495 |
+
[titan] 2026-01-06 21:36:49,202 - root - INFO - [34mlr: 3.7109e-05 gnorm: 86.50 [35m[ 1:19:28<1 day, 17:57:48][39m
|
| 496 |
+
[titan] 2026-01-06 21:37:30,710 - root - INFO - [31mstep: 95 [32mloss: 7.7993 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,579 [36mtflops: 144.45 [35mmfu: 46.30%[39m
|
| 497 |
+
[titan] 2026-01-06 21:37:30,710 - root - INFO - [34mlr: 3.7500e-05 gnorm: 62.50 [35m[ 1:20:09<1 day, 17:52:09][39m
|
| 498 |
+
[titan] 2026-01-06 21:38:12,247 - root - INFO - [31mstep: 96 [32mloss: 7.6230 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,578 [36mtflops: 144.35 [35mmfu: 46.27%[39m
|
| 499 |
+
[titan] 2026-01-06 21:38:12,248 - root - INFO - [34mlr: 3.7891e-05 gnorm: 17.38 [35m[ 1:20:51<1 day, 17:46:36][39m
|
| 500 |
+
[titan] 2026-01-06 21:38:12,248 - root - INFO - Saving the checkpoint (or staging if async is enabled).
|
| 501 |
+
[titan] 2026-01-06 21:38:32,910 - root - INFO - [GC] GC collection invoked by checkpointer. 0.16 seconds.
|
| 502 |
+
[titan] 2026-01-06 21:38:32,910 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 20.66 seconds.
|
| 503 |
+
[titan] 2026-01-06 21:39:14,269 - root - INFO - [31mstep: 97 [32mloss: 7.5778 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,057 [36mtflops: 96.68 [35mmfu: 30.99%[39m
|
| 504 |
+
[titan] 2026-01-06 21:39:14,269 - root - INFO - [34mlr: 3.8281e-05 gnorm: 17.75 [35m[ 1:21:53<1 day, 17:51:38][39m
|
| 505 |
+
[titan] 2026-01-06 21:39:55,690 - root - INFO - [31mstep: 98 [32mloss: 7.5438 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,582 [36mtflops: 144.76 [35mmfu: 46.40%[39m
|
| 506 |
+
[titan] 2026-01-06 21:39:55,690 - root - INFO - [34mlr: 3.8672e-05 gnorm: 11.75 [35m[ 1:22:34<1 day, 17:46:07][39m
|
| 507 |
+
[titan] 2026-01-06 21:40:37,179 - root - INFO - [31mstep: 99 [32mloss: 7.5091 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,580 [36mtflops: 144.52 [35mmfu: 46.32%[39m
|
| 508 |
+
[titan] 2026-01-06 21:40:37,180 - root - INFO - [34mlr: 3.9063e-05 gnorm: 7.81 [35m[ 1:23:16<1 day, 17:40:44][39m
|
| 509 |
+
[titan] 2026-01-06 21:40:37,205 - root - INFO - [GC] Peforming periodical GC collection. 0.03 seconds.
|
| 510 |
+
[titan] 2026-01-06 21:41:18,706 - root - INFO - [31mstep: 100 [32mloss: 7.4961 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,578 [36mtflops: 144.39 [35mmfu: 46.28%[39m
|
| 511 |
+
[titan] 2026-01-06 21:41:18,706 - root - INFO - [34mlr: 3.9453e-05 gnorm: 7.59 [35m[ 1:23:57<1 day, 17:35:28][39m
|
| 512 |
+
[titan] 2026-01-06 21:42:00,228 - root - INFO - [31mstep: 101 [32mloss: 7.4848 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,578 [36mtflops: 144.41 [35mmfu: 46.28%[39m
|
| 513 |
+
[titan] 2026-01-06 21:42:00,228 - root - INFO - [34mlr: 3.9844e-05 gnorm: 5.97 [35m[ 1:24:39<1 day, 17:30:17][39m
|
| 514 |
+
[titan] 2026-01-06 21:42:41,739 - root - INFO - [31mstep: 102 [32mloss: 7.5118 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,579 [36mtflops: 144.45 [35mmfu: 46.30%[39m
|
| 515 |
+
[titan] 2026-01-06 21:42:41,739 - root - INFO - [34mlr: 4.0234e-05 gnorm: 8.06 [35m[ 1:25:20<1 day, 17:25:11][39m
|
| 516 |
+
[titan] 2026-01-06 21:43:23,264 - root - INFO - [31mstep: 103 [32mloss: 7.4788 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,578 [36mtflops: 144.40 [35mmfu: 46.28%[39m
|
| 517 |
+
[titan] 2026-01-06 21:43:23,265 - root - INFO - [34mlr: 4.0625e-05 gnorm: 10.06 [35m[ 1:26:02<1 day, 17:20:10][39m
|
| 518 |
+
[titan] 2026-01-06 21:44:04,785 - root - INFO - [31mstep: 104 [32mloss: 7.4560 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,578 [36mtflops: 144.41 [35mmfu: 46.29%[39m
|
| 519 |
+
[titan] 2026-01-06 21:44:04,786 - root - INFO - [34mlr: 4.1016e-05 gnorm: 9.50 [35m[ 1:26:44<1 day, 17:15:15][39m
|
| 520 |
+
[titan] 2026-01-06 21:44:46,319 - root - INFO - [31mstep: 105 [32mloss: 7.4534 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 521 |
+
[titan] 2026-01-06 21:44:46,319 - root - INFO - [34mlr: 4.1406e-05 gnorm: 8.44 [35m[ 1:27:25<1 day, 17:10:24][39m
|
| 522 |
+
[titan] 2026-01-06 21:45:27,837 - root - INFO - [31mstep: 106 [32mloss: 7.4770 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,578 [36mtflops: 144.42 [35mmfu: 46.29%[39m
|
| 523 |
+
[titan] 2026-01-06 21:45:27,838 - root - INFO - [34mlr: 4.1797e-05 gnorm: 10.56 [35m[ 1:28:07<1 day, 17:05:38][39m
|
| 524 |
+
[titan] 2026-01-06 21:46:09,374 - root - INFO - [31mstep: 107 [32mloss: 7.4382 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,578 [36mtflops: 144.36 [35mmfu: 46.27%[39m
|
| 525 |
+
[titan] 2026-01-06 21:46:09,374 - root - INFO - [34mlr: 4.2188e-05 gnorm: 13.69 [35m[ 1:28:48<1 day, 17:00:57][39m
|
| 526 |
+
[titan] 2026-01-06 21:46:50,902 - root - INFO - [31mstep: 108 [32mloss: 7.4561 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,578 [36mtflops: 144.39 [35mmfu: 46.28%[39m
|
| 527 |
+
[titan] 2026-01-06 21:46:50,902 - root - INFO - [34mlr: 4.2578e-05 gnorm: 8.69 [35m[ 1:29:30<1 day, 16:56:20][39m
|
| 528 |
+
[titan] 2026-01-06 21:47:32,443 - root - INFO - [31mstep: 109 [32mloss: 7.3967 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,578 [36mtflops: 144.34 [35mmfu: 46.26%[39m
|
| 529 |
+
[titan] 2026-01-06 21:47:32,443 - root - INFO - [34mlr: 4.2969e-05 gnorm: 7.31 [35m[ 1:30:11<1 day, 16:51:48][39m
|
| 530 |
+
[titan] 2026-01-06 21:48:13,976 - root - INFO - [31mstep: 110 [32mloss: 7.4334 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 531 |
+
[titan] 2026-01-06 21:48:13,976 - root - INFO - [34mlr: 4.3359e-05 gnorm: 25.38 [35m[ 1:30:53<1 day, 16:47:20][39m
|
| 532 |
+
[titan] 2026-01-06 21:48:55,511 - root - INFO - [31mstep: 111 [32mloss: 7.4360 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,578 [36mtflops: 144.36 [35mmfu: 46.27%[39m
|
| 533 |
+
[titan] 2026-01-06 21:48:55,511 - root - INFO - [34mlr: 4.3750e-05 gnorm: 10.44 [35m[ 1:31:34<1 day, 16:42:56][39m
|
| 534 |
+
[titan] 2026-01-06 21:49:37,059 - root - INFO - [31mstep: 112 [32mloss: 7.5123 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,577 [36mtflops: 144.32 [35mmfu: 46.26%[39m
|
| 535 |
+
[titan] 2026-01-06 21:49:37,059 - root - INFO - [34mlr: 4.4141e-05 gnorm: 16.88 [35m[ 1:32:16<1 day, 16:38:36][39m
|
| 536 |
+
[titan] 2026-01-06 21:49:37,059 - root - INFO - Saving the checkpoint (or staging if async is enabled).
|
| 537 |
+
[titan] 2026-01-06 21:49:59,574 - root - INFO - [GC] GC collection invoked by checkpointer. 0.14 seconds.
|
| 538 |
+
[titan] 2026-01-06 21:49:59,574 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 22.51 seconds.
|
| 539 |
+
[titan] 2026-01-06 21:50:40,891 - root - INFO - [31mstep: 113 [32mloss: 7.4803 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,027 [36mtflops: 93.93 [35mmfu: 30.11%[39m
|
| 540 |
+
[titan] 2026-01-06 21:50:40,892 - root - INFO - [34mlr: 4.4531e-05 gnorm: 13.06 [35m[ 1:33:20<1 day, 16:44:04][39m
|
| 541 |
+
[titan] 2026-01-06 21:51:22,305 - root - INFO - [31mstep: 114 [32mloss: 7.4859 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,582 [36mtflops: 144.79 [35mmfu: 46.41%[39m
|
| 542 |
+
[titan] 2026-01-06 21:51:22,305 - root - INFO - [34mlr: 4.4922e-05 gnorm: 16.50 [35m[ 1:34:01<1 day, 16:39:43][39m
|
| 543 |
+
[titan] 2026-01-06 21:52:03,747 - root - INFO - [31mstep: 115 [32mloss: 7.4151 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,581 [36mtflops: 144.68 [35mmfu: 46.37%[39m
|
| 544 |
+
[titan] 2026-01-06 21:52:03,748 - root - INFO - [34mlr: 4.5313e-05 gnorm: 13.94 [35m[ 1:34:42<1 day, 16:35:27][39m
|
| 545 |
+
[titan] 2026-01-06 21:52:45,252 - root - INFO - [31mstep: 116 [32mloss: 7.3814 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,579 [36mtflops: 144.47 [35mmfu: 46.30%[39m
|
| 546 |
+
[titan] 2026-01-06 21:52:45,252 - root - INFO - [34mlr: 4.5703e-05 gnorm: 11.69 [35m[ 1:35:24<1 day, 16:31:16][39m
|
| 547 |
+
[titan] 2026-01-06 21:53:26,759 - root - INFO - [31mstep: 117 [32mloss: 7.4033 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,579 [36mtflops: 144.46 [35mmfu: 46.30%[39m
|
| 548 |
+
[titan] 2026-01-06 21:53:26,760 - root - INFO - [34mlr: 4.6094e-05 gnorm: 9.31 [35m[ 1:36:06<1 day, 16:27:08][39m
|
| 549 |
+
[titan] 2026-01-06 21:54:08,279 - root - INFO - [31mstep: 118 [32mloss: 7.4721 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,578 [36mtflops: 144.42 [35mmfu: 46.29%[39m
|
| 550 |
+
[titan] 2026-01-06 21:54:08,279 - root - INFO - [34mlr: 4.6484e-05 gnorm: 20.88 [35m[ 1:36:47<1 day, 16:23:05][39m
|
| 551 |
+
[titan] 2026-01-06 21:54:49,813 - root - INFO - [31mstep: 119 [32mloss: 7.4258 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,578 [36mtflops: 144.36 [35mmfu: 46.27%[39m
|
| 552 |
+
[titan] 2026-01-06 21:54:49,813 - root - INFO - [34mlr: 4.6875e-05 gnorm: 16.62 [35m[ 1:37:29<1 day, 16:19:05][39m
|
| 553 |
+
[titan] 2026-01-06 21:55:31,360 - root - INFO - [31mstep: 120 [32mloss: 7.3951 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,577 [36mtflops: 144.32 [35mmfu: 46.26%[39m
|
| 554 |
+
[titan] 2026-01-06 21:55:31,360 - root - INFO - [34mlr: 4.7266e-05 gnorm: 11.38 [35m[ 1:38:10<1 day, 16:15:08][39m
|
| 555 |
+
[titan] 2026-01-06 21:56:12,903 - root - INFO - [31mstep: 121 [32mloss: 7.3984 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,578 [36mtflops: 144.33 [35mmfu: 46.26%[39m
|
| 556 |
+
[titan] 2026-01-06 21:56:12,904 - root - INFO - [34mlr: 4.7656e-05 gnorm: 10.19 [35m[ 1:38:52<1 day, 16:11:15][39m
|
| 557 |
+
[titan] 2026-01-06 21:56:54,444 - root - INFO - [31mstep: 122 [32mloss: 7.5098 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,578 [36mtflops: 144.34 [35mmfu: 46.26%[39m
|
| 558 |
+
[titan] 2026-01-06 21:56:54,444 - root - INFO - [34mlr: 4.8047e-05 gnorm: 19.38 [35m[ 1:39:33<1 day, 16:07:25][39m
|
| 559 |
+
[titan] 2026-01-06 21:57:35,983 - root - INFO - [31mstep: 123 [32mloss: 7.4071 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,578 [36mtflops: 144.35 [35mmfu: 46.27%[39m
|
| 560 |
+
[titan] 2026-01-06 21:57:35,983 - root - INFO - [34mlr: 4.8438e-05 gnorm: 13.25 [35m[ 1:40:15<1 day, 16:03:38][39m
|
| 561 |
+
[titan] 2026-01-06 21:58:17,525 - root - INFO - [31mstep: 124 [32mloss: 7.4271 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,578 [36mtflops: 144.34 [35mmfu: 46.26%[39m
|
| 562 |
+
[titan] 2026-01-06 21:58:17,525 - root - INFO - [34mlr: 4.8828e-05 gnorm: 11.88 [35m[ 1:40:56<1 day, 15:59:54][39m
|
| 563 |
+
[titan] 2026-01-06 21:58:59,075 - root - INFO - [31mstep: 125 [32mloss: 7.3603 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,577 [36mtflops: 144.31 [35mmfu: 46.25%[39m
|
| 564 |
+
[titan] 2026-01-06 21:58:59,075 - root - INFO - [34mlr: 4.9219e-05 gnorm: 11.50 [35m[ 1:41:38<1 day, 15:56:13][39m
|
| 565 |
+
[titan] 2026-01-06 21:59:40,618 - root - INFO - [31mstep: 126 [32mloss: 7.3625 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,578 [36mtflops: 144.33 [35mmfu: 46.26%[39m
|
| 566 |
+
[titan] 2026-01-06 21:59:40,619 - root - INFO - [34mlr: 4.9609e-05 gnorm: 9.88 [35m[ 1:42:19<1 day, 15:52:35][39m
|
| 567 |
+
[titan] 2026-01-06 22:00:22,155 - root - INFO - [31mstep: 127 [32mloss: 7.3691 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,578 [36mtflops: 144.36 [35mmfu: 46.27%[39m
|
| 568 |
+
[titan] 2026-01-06 22:00:22,156 - root - INFO - [34mlr: 5.0000e-05 gnorm: 11.88 [35m[ 1:43:01<1 day, 15:49:00][39m
|
| 569 |
+
[titan] 2026-01-06 22:01:03,694 - root - INFO - [31mstep: 128 [32mloss: 7.3331 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,578 [36mtflops: 144.35 [35mmfu: 46.27%[39m
|
| 570 |
+
[titan] 2026-01-06 22:01:03,694 - root - INFO - [34mlr: 5.0391e-05 gnorm: 11.56 [35m[ 1:43:42<1 day, 15:45:27][39m
|
| 571 |
+
[titan] 2026-01-06 22:01:03,694 - root - INFO - Saving the checkpoint (or staging if async is enabled).
|
| 572 |
+
[titan] 2026-01-06 22:01:24,072 - root - INFO - [GC] GC collection invoked by checkpointer. 0.20 seconds.
|
| 573 |
+
[titan] 2026-01-06 22:01:24,072 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 20.38 seconds.
|
| 574 |
+
[titan] 2026-01-06 22:02:05,454 - root - INFO - [31mstep: 129 [32mloss: 7.2878 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,061 [36mtflops: 97.09 [35mmfu: 31.12%[39m
|
| 575 |
+
[titan] 2026-01-06 22:02:05,454 - root - INFO - [34mlr: 5.0781e-05 gnorm: 6.16 [35m[ 1:44:44<1 day, 15:49:38][39m
|
| 576 |
+
[titan] 2026-01-06 22:02:46,875 - root - INFO - [31mstep: 130 [32mloss: 7.7017 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,582 [36mtflops: 144.76 [35mmfu: 46.40%[39m
|
| 577 |
+
[titan] 2026-01-06 22:02:46,876 - root - INFO - [34mlr: 5.1172e-05 gnorm: 70.00 [35m[ 1:45:26<1 day, 15:46:04][39m
|
| 578 |
+
[titan] 2026-01-06 22:03:28,339 - root - INFO - [31mstep: 131 [32mloss: 7.5220 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,581 [36mtflops: 144.61 [35mmfu: 46.35%[39m
|
| 579 |
+
[titan] 2026-01-06 22:03:28,339 - root - INFO - [34mlr: 5.1562e-05 gnorm: 44.75 [35m[ 1:46:07<1 day, 15:42:34][39m
|
| 580 |
+
[titan] 2026-01-06 22:04:09,858 - root - INFO - [31mstep: 132 [32mloss: 7.4566 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,578 [36mtflops: 144.42 [35mmfu: 46.29%[39m
|
| 581 |
+
[titan] 2026-01-06 22:04:09,859 - root - INFO - [34mlr: 5.1953e-05 gnorm: 13.50 [35m[ 1:46:49<1 day, 15:39:08][39m
|
| 582 |
+
[titan] 2026-01-06 22:04:51,387 - root - INFO - [31mstep: 133 [32mloss: 7.4026 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,578 [36mtflops: 144.39 [35mmfu: 46.28%[39m
|
| 583 |
+
[titan] 2026-01-06 22:04:51,387 - root - INFO - [34mlr: 5.2344e-05 gnorm: 10.12 [35m[ 1:47:30<1 day, 15:35:44][39m
|
| 584 |
+
[titan] 2026-01-06 22:05:32,919 - root - INFO - [31mstep: 134 [32mloss: 7.4092 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 585 |
+
[titan] 2026-01-06 22:05:32,920 - root - INFO - [34mlr: 5.2734e-05 gnorm: 14.88 [35m[ 1:48:12<1 day, 15:32:23][39m
|
| 586 |
+
[titan] 2026-01-06 22:06:14,471 - root - INFO - [31mstep: 135 [32mloss: 7.3827 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,577 [36mtflops: 144.30 [35mmfu: 46.25%[39m
|
| 587 |
+
[titan] 2026-01-06 22:06:14,471 - root - INFO - [34mlr: 5.3125e-05 gnorm: 18.88 [35m[ 1:48:53<1 day, 15:29:04][39m
|
| 588 |
+
[titan] 2026-01-06 22:06:56,027 - root - INFO - [31mstep: 136 [32mloss: 7.4021 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,577 [36mtflops: 144.29 [35mmfu: 46.25%[39m
|
| 589 |
+
[titan] 2026-01-06 22:06:56,028 - root - INFO - [34mlr: 5.3516e-05 gnorm: 12.81 [35m[ 1:49:35<1 day, 15:25:48][39m
|
| 590 |
+
[titan] 2026-01-06 22:07:37,581 - root - INFO - [31mstep: 137 [32mloss: 7.4064 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,577 [36mtflops: 144.30 [35mmfu: 46.25%[39m
|
| 591 |
+
[titan] 2026-01-06 22:07:37,581 - root - INFO - [34mlr: 5.3906e-05 gnorm: 7.19 [35m[ 1:50:16<1 day, 15:22:34][39m
|
| 592 |
+
[titan] 2026-01-06 22:08:19,129 - root - INFO - [31mstep: 138 [32mloss: 7.4774 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,577 [36mtflops: 144.32 [35mmfu: 46.26%[39m
|
| 593 |
+
[titan] 2026-01-06 22:08:19,129 - root - INFO - [34mlr: 5.4297e-05 gnorm: 22.62 [35m[ 1:50:58<1 day, 15:19:22][39m
|
| 594 |
+
[titan] 2026-01-06 22:09:00,687 - root - INFO - [31mstep: 139 [32mloss: 7.4281 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,577 [36mtflops: 144.28 [35mmfu: 46.24%[39m
|
| 595 |
+
[titan] 2026-01-06 22:09:00,688 - root - INFO - [34mlr: 5.4688e-05 gnorm: 11.00 [35m[ 1:51:39<1 day, 15:16:13][39m
|
| 596 |
+
[titan] 2026-01-06 22:09:42,228 - root - INFO - [31mstep: 140 [32mloss: 7.5633 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,578 [36mtflops: 144.34 [35mmfu: 46.26%[39m
|
| 597 |
+
[titan] 2026-01-06 22:09:42,228 - root - INFO - [34mlr: 5.5078e-05 gnorm: 19.75 [35m[ 1:52:21<1 day, 15:13:05][39m
|
| 598 |
+
[titan] 2026-01-06 22:10:23,790 - root - INFO - [31mstep: 141 [32mloss: 7.5423 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,577 [36mtflops: 144.27 [35mmfu: 46.24%[39m
|
| 599 |
+
[titan] 2026-01-06 22:10:23,790 - root - INFO - [34mlr: 5.5469e-05 gnorm: 17.25 [35m[ 1:53:03<1 day, 15:10:00][39m
|
| 600 |
+
[titan] 2026-01-06 22:11:05,350 - root - INFO - [31mstep: 142 [32mloss: 7.4047 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,577 [36mtflops: 144.28 [35mmfu: 46.24%[39m
|
| 601 |
+
[titan] 2026-01-06 22:11:05,351 - root - INFO - [34mlr: 5.5859e-05 gnorm: 9.94 [35m[ 1:53:44<1 day, 15:06:57][39m
|
| 602 |
+
[titan] 2026-01-06 22:11:46,904 - root - INFO - [31mstep: 143 [32mloss: 7.5261 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,577 [36mtflops: 144.30 [35mmfu: 46.25%[39m
|
| 603 |
+
[titan] 2026-01-06 22:11:46,905 - root - INFO - [34mlr: 5.6250e-05 gnorm: 25.75 [35m[ 1:54:26<1 day, 15:03:55][39m
|
| 604 |
+
[titan] 2026-01-06 22:12:28,460 - root - INFO - [31mstep: 144 [32mloss: 7.4217 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,577 [36mtflops: 144.29 [35mmfu: 46.25%[39m
|
| 605 |
+
[titan] 2026-01-06 22:12:28,461 - root - INFO - [34mlr: 5.6641e-05 gnorm: 18.00 [35m[ 1:55:07<1 day, 15:00:56][39m
|
| 606 |
+
[titan] 2026-01-06 22:12:28,461 - root - INFO - Saving the checkpoint (or staging if async is enabled).
|
| 607 |
+
[titan] 2026-01-06 22:12:50,136 - root - INFO - [GC] GC collection invoked by checkpointer. 0.16 seconds.
|
| 608 |
+
[titan] 2026-01-06 22:12:50,136 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 21.68 seconds.
|
| 609 |
+
[titan] 2026-01-06 22:13:31,510 - root - INFO - [31mstep: 145 [32mloss: 7.3958 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,039 [36mtflops: 95.10 [35mmfu: 30.48%[39m
|
| 610 |
+
[titan] 2026-01-06 22:13:31,510 - root - INFO - [34mlr: 5.7031e-05 gnorm: 11.69 [35m[ 1:56:10<1 day, 15:05:12][39m
|
| 611 |
+
[titan] 2026-01-06 22:14:12,944 - root - INFO - [31mstep: 146 [32mloss: 7.4073 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,582 [36mtflops: 144.71 [35mmfu: 46.38%[39m
|
| 612 |
+
[titan] 2026-01-06 22:14:12,944 - root - INFO - [34mlr: 5.7422e-05 gnorm: 11.25 [35m[ 1:56:52<1 day, 15:02:11][39m
|
| 613 |
+
[titan] 2026-01-06 22:14:54,370 - root - INFO - [31mstep: 147 [32mloss: 7.3301 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,582 [36mtflops: 144.74 [35mmfu: 46.39%[39m
|
| 614 |
+
[titan] 2026-01-06 22:14:54,371 - root - INFO - [34mlr: 5.7813e-05 gnorm: 7.34 [35m[ 1:57:33<1 day, 14:59:12][39m
|
| 615 |
+
[titan] 2026-01-06 22:15:35,825 - root - INFO - [31mstep: 148 [32mloss: 7.3624 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,581 [36mtflops: 144.64 [35mmfu: 46.36%[39m
|
| 616 |
+
[titan] 2026-01-06 22:15:35,825 - root - INFO - [34mlr: 5.8203e-05 gnorm: 17.38 [35m[ 1:58:15<1 day, 14:56:15][39m
|
| 617 |
+
[titan] 2026-01-06 22:16:17,356 - root - INFO - [31mstep: 149 [32mloss: 7.2913 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 618 |
+
[titan] 2026-01-06 22:16:17,357 - root - INFO - [34mlr: 5.8594e-05 gnorm: 3.80 [35m[ 1:58:56<1 day, 14:53:21][39m
|
| 619 |
+
[titan] 2026-01-06 22:16:17,386 - root - INFO - [GC] Peforming periodical GC collection. 0.03 seconds.
|
| 620 |
+
[titan] 2026-01-06 22:16:58,923 - root - INFO - [31mstep: 150 [32mloss: 7.3146 [33mmemory: 71.95GiB(90.78%) [34mtps: 1,577 [36mtflops: 144.25 [35mmfu: 46.23%[39m
|
| 621 |
+
[titan] 2026-01-06 22:16:58,923 - root - INFO - [34mlr: 5.8984e-05 gnorm: 7.06 [35m[ 1:59:38<1 day, 14:50:30][39m
|
logs/none_4cvjdbqa/attempt_0/0/stdout.log
ADDED
|
File without changes
|
logs/none_4cvjdbqa/attempt_0/1/stderr.log
ADDED
|
@@ -0,0 +1,620 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[titan] 2026-01-06 20:23:28,613 - root - INFO - Starting job: default job
|
| 2 |
+
[titan] 2026-01-06 20:23:28,613 - root - INFO - [32m{
|
| 3 |
+
"activation_checkpoint": {
|
| 4 |
+
"mode": "none",
|
| 5 |
+
"selective_ac_option": "2"
|
| 6 |
+
},
|
| 7 |
+
"activation_offload": {
|
| 8 |
+
"mode": "none"
|
| 9 |
+
},
|
| 10 |
+
"checkpoint": {
|
| 11 |
+
"async_mode": "disabled",
|
| 12 |
+
"create_seed_checkpoint": false,
|
| 13 |
+
"enable_checkpoint": true,
|
| 14 |
+
"exclude_from_loading": [],
|
| 15 |
+
"export_dtype": "float32",
|
| 16 |
+
"folder": "checkpoint",
|
| 17 |
+
"initial_load_model_weights_only": true,
|
| 18 |
+
"initial_load_path": null,
|
| 19 |
+
"interval": 16,
|
| 20 |
+
"interval_type": "steps",
|
| 21 |
+
"keep_latest_k": 0,
|
| 22 |
+
"last_save_model_weights_only": false,
|
| 23 |
+
"load_step": -1,
|
| 24 |
+
"model_weights_only": false
|
| 25 |
+
},
|
| 26 |
+
"comm": {
|
| 27 |
+
"init_timeout_seconds": 300,
|
| 28 |
+
"trace_buf_size": 20000,
|
| 29 |
+
"train_timeout_seconds": 100
|
| 30 |
+
},
|
| 31 |
+
"experimental": {
|
| 32 |
+
"context_parallel_degree": 1,
|
| 33 |
+
"context_parallel_rotate_method": "allgather",
|
| 34 |
+
"custom_model_path": "",
|
| 35 |
+
"enable_async_tensor_parallel": false,
|
| 36 |
+
"enable_compiled_autograd": false,
|
| 37 |
+
"pipeline_parallel_degree": 1,
|
| 38 |
+
"pipeline_parallel_microbatches": null,
|
| 39 |
+
"pipeline_parallel_schedule": "1F1B",
|
| 40 |
+
"pipeline_parallel_schedule_csv": "",
|
| 41 |
+
"pipeline_parallel_split_points": []
|
| 42 |
+
},
|
| 43 |
+
"fault_tolerance": {
|
| 44 |
+
"enable": false,
|
| 45 |
+
"group_size": 0,
|
| 46 |
+
"min_replica_size": 1,
|
| 47 |
+
"replica_id": 0
|
| 48 |
+
},
|
| 49 |
+
"float8": {
|
| 50 |
+
"enable_fsdp_float8_all_gather": false,
|
| 51 |
+
"force_recompute_fp8_weight_in_bwd": false,
|
| 52 |
+
"precompute_float8_dynamic_scale_for_fsdp": false,
|
| 53 |
+
"recipe_name": null
|
| 54 |
+
},
|
| 55 |
+
"job": {
|
| 56 |
+
"config_file": "flame/models/fla.toml",
|
| 57 |
+
"description": "default job",
|
| 58 |
+
"dump_folder": "/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B",
|
| 59 |
+
"print_args": true,
|
| 60 |
+
"use_for_integration_test": false
|
| 61 |
+
},
|
| 62 |
+
"lr_scheduler": {
|
| 63 |
+
"decay_ratio": null,
|
| 64 |
+
"decay_type": "cosine",
|
| 65 |
+
"lr_min": 0.1,
|
| 66 |
+
"warmup_steps": 1024
|
| 67 |
+
},
|
| 68 |
+
"memory_estimation": {
|
| 69 |
+
"disable_fake_mode": false,
|
| 70 |
+
"enabled": false
|
| 71 |
+
},
|
| 72 |
+
"metrics": {
|
| 73 |
+
"disable_color_printing": false,
|
| 74 |
+
"enable_tensorboard": false,
|
| 75 |
+
"enable_wandb": true,
|
| 76 |
+
"log_freq": 1,
|
| 77 |
+
"save_for_all_ranks": false,
|
| 78 |
+
"save_tb_folder": "tb"
|
| 79 |
+
},
|
| 80 |
+
"model": {
|
| 81 |
+
"config": "/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/config.json",
|
| 82 |
+
"converters": [],
|
| 83 |
+
"name": "fla",
|
| 84 |
+
"print_after_conversion": false,
|
| 85 |
+
"tokenizer_path": "/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B"
|
| 86 |
+
},
|
| 87 |
+
"optimizer": {
|
| 88 |
+
"beta1": 0.9,
|
| 89 |
+
"beta2": 0.95,
|
| 90 |
+
"early_step_in_backward": false,
|
| 91 |
+
"eps": 1e-15,
|
| 92 |
+
"implementation": "fused",
|
| 93 |
+
"lr": 0.0004,
|
| 94 |
+
"name": "AdamW",
|
| 95 |
+
"weight_decay": 0.1
|
| 96 |
+
},
|
| 97 |
+
"profiling": {
|
| 98 |
+
"enable_memory_snapshot": false,
|
| 99 |
+
"enable_profiling": true,
|
| 100 |
+
"profile_freq": 512,
|
| 101 |
+
"save_memory_snapshot_folder": "memory_snapshot",
|
| 102 |
+
"save_traces_folder": "profile_trace"
|
| 103 |
+
},
|
| 104 |
+
"training": {
|
| 105 |
+
"batch_size": 2,
|
| 106 |
+
"compile": true,
|
| 107 |
+
"context_len": 2048,
|
| 108 |
+
"data_dir": null,
|
| 109 |
+
"data_files": null,
|
| 110 |
+
"data_parallel_replicate_degree": 1,
|
| 111 |
+
"data_parallel_shard_degree": 8,
|
| 112 |
+
"data_probs": null,
|
| 113 |
+
"dataset": "/mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu",
|
| 114 |
+
"dataset_name": "default",
|
| 115 |
+
"dataset_split": "train",
|
| 116 |
+
"deterministic": false,
|
| 117 |
+
"disable_loss_parallel": true,
|
| 118 |
+
"enable_cpu_offload": false,
|
| 119 |
+
"fsdp_reshard_after_forward": "default",
|
| 120 |
+
"gc_freq": 50,
|
| 121 |
+
"gradient_accumulation_steps": 16,
|
| 122 |
+
"max_norm": 1.0,
|
| 123 |
+
"mixed_precision_param": "bfloat16",
|
| 124 |
+
"mixed_precision_reduce": "float32",
|
| 125 |
+
"num_workers": 8,
|
| 126 |
+
"persistent_workers": false,
|
| 127 |
+
"pin_memory": false,
|
| 128 |
+
"prefetch_factor": 2,
|
| 129 |
+
"seed": 42,
|
| 130 |
+
"seq_len": 2048,
|
| 131 |
+
"skip_nan_inf": true,
|
| 132 |
+
"steps": 3072,
|
| 133 |
+
"streaming": true,
|
| 134 |
+
"tensor_parallel_degree": 1,
|
| 135 |
+
"varlen": false
|
| 136 |
+
}
|
| 137 |
+
}[39m
|
| 138 |
+
[titan] 2026-01-06 20:23:28,614 - root - INFO - [GC] Initial GC collection. 0.00 seconds.
|
| 139 |
+
[titan] 2026-01-06 20:23:29,972 - root - WARNING - ENV[TORCH_NCCL_ASYNC_ERROR_HANDLING] = 1 will be overridden to 3 based on job config
|
| 140 |
+
[titan] 2026-01-06 20:23:29,975 - root - INFO - CUDA capacity: NVIDIA A100-SXM4-80GB with 79.25GiB memory
|
| 141 |
+
[titan] 2026-01-06 20:23:29,977 - root - WARNING - Error running lspci: [Errno 2] No such file or directory: 'lspci', fallback to use device_name
|
| 142 |
+
[titan] 2026-01-06 20:23:29,977 - root - INFO - Peak FLOPS used for computing MFU: 3.120e+14
|
| 143 |
+
[titan] 2026-01-06 20:23:29,977 - root - INFO - Building 1-D device mesh with ['dp_shard'], [8]
|
| 144 |
+
[titan] 2026-01-06 20:23:30,051 - root - INFO - Loading tokenizer...
|
| 145 |
+
The tokenizer you are loading from '/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.
|
| 146 |
+
[titan] 2026-01-06 20:23:30,412 - root - INFO - Qwen2TokenizerFast(name_or_path='/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B', vocab_size=151643, model_max_length=10000000000, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'eos_token': '<|im_end|>', 'pad_token': '<|endoftext|>', 'additional_special_tokens': ['<|im_start|>', '<|im_end|>', '<|object_ref_start|>', '<|object_ref_end|>', '<|box_start|>', '<|box_end|>', '<|quad_start|>', '<|quad_end|>', '<|vision_start|>', '<|vision_end|>', '<|vision_pad|>', '<|image_pad|>', '<|video_pad|>']}, clean_up_tokenization_spaces=False, added_tokens_decoder={
|
| 147 |
+
151643: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 148 |
+
151644: AddedToken("<|im_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 149 |
+
151645: AddedToken("<|im_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 150 |
+
151646: AddedToken("<|object_ref_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 151 |
+
151647: AddedToken("<|object_ref_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 152 |
+
151648: AddedToken("<|box_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 153 |
+
151649: AddedToken("<|box_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 154 |
+
151650: AddedToken("<|quad_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 155 |
+
151651: AddedToken("<|quad_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 156 |
+
151652: AddedToken("<|vision_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 157 |
+
151653: AddedToken("<|vision_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 158 |
+
151654: AddedToken("<|vision_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 159 |
+
151655: AddedToken("<|image_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 160 |
+
151656: AddedToken("<|video_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 161 |
+
151657: AddedToken("<tool_call>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 162 |
+
151658: AddedToken("</tool_call>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 163 |
+
151659: AddedToken("<|fim_prefix|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 164 |
+
151660: AddedToken("<|fim_middle|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 165 |
+
151661: AddedToken("<|fim_suffix|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 166 |
+
151662: AddedToken("<|fim_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 167 |
+
151663: AddedToken("<|repo_name|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 168 |
+
151664: AddedToken("<|file_sep|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 169 |
+
151665: AddedToken("<tool_response>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 170 |
+
151666: AddedToken("</tool_response>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 171 |
+
151667: AddedToken("<think>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 172 |
+
151668: AddedToken("</think>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 173 |
+
}
|
| 174 |
+
)
|
| 175 |
+
[titan] 2026-01-06 20:23:30,412 - root - INFO - Loading dataset /mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu:default
|
| 176 |
+
`trust_remote_code` is not supported anymore.
|
| 177 |
+
Please check that the Hugging Face dataset '/mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu' isn't based on a loading script and remove `trust_remote_code`.
|
| 178 |
+
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
|
| 179 |
+
[titan] 2026-01-06 20:23:30,412 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
|
| 180 |
+
Please check that the Hugging Face dataset '/mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu' isn't based on a loading script and remove `trust_remote_code`.
|
| 181 |
+
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
|
| 182 |
+
[titan] 2026-01-06 20:23:30,977 - root - INFO - Shuffling the dataset with seed 42
|
| 183 |
+
[titan] 2026-01-06 20:23:30,978 - root - INFO - IterableDataset({
|
| 184 |
+
features: ['text', 'id', 'dump', 'url', 'file_path', 'language', 'language_score', 'token_count', 'score', 'int_score'],
|
| 185 |
+
num_shards: 360
|
| 186 |
+
})
|
| 187 |
+
[titan] 2026-01-06 20:23:30,978 - root - INFO - Building dataloader...
|
| 188 |
+
[titan] 2026-01-06 20:23:30,980 - root - INFO - Loading model config from /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/config.json
|
| 189 |
+
[titan] 2026-01-06 20:23:30,981 - root - INFO - Building model from the config
|
| 190 |
+
[32mGSAConfig {
|
| 191 |
+
"architectures": [
|
| 192 |
+
"GSAForCausalLM"
|
| 193 |
+
],
|
| 194 |
+
"attn": null,
|
| 195 |
+
"bos_token_id": 151643,
|
| 196 |
+
"clamp_max": null,
|
| 197 |
+
"clamp_min": null,
|
| 198 |
+
"conv_size": 4,
|
| 199 |
+
"dtype": "bfloat16",
|
| 200 |
+
"elementwise_affine": false,
|
| 201 |
+
"eos_token_id": 151645,
|
| 202 |
+
"expand_k": 1,
|
| 203 |
+
"expand_v": 1,
|
| 204 |
+
"feature_map": "swish",
|
| 205 |
+
"fuse_cross_entropy": true,
|
| 206 |
+
"fuse_linear_cross_entropy": false,
|
| 207 |
+
"fuse_norm": true,
|
| 208 |
+
"fuse_swiglu": true,
|
| 209 |
+
"gate_logit_normalizer": 8,
|
| 210 |
+
"hidden_act": "swish",
|
| 211 |
+
"hidden_ratio": 4,
|
| 212 |
+
"hidden_size": 5120,
|
| 213 |
+
"initializer_range": 0.02,
|
| 214 |
+
"intermediate_size": 17408,
|
| 215 |
+
"max_position_embeddings": 40960,
|
| 216 |
+
"model_type": "gsa",
|
| 217 |
+
"norm_eps": 1e-06,
|
| 218 |
+
"num_heads": 40,
|
| 219 |
+
"num_hidden_layers": 40,
|
| 220 |
+
"num_kv_heads": 8,
|
| 221 |
+
"num_slots": 256,
|
| 222 |
+
"rope_theta": 1000000,
|
| 223 |
+
"share_conv_kernel": true,
|
| 224 |
+
"tie_word_embeddings": true,
|
| 225 |
+
"transformers_version": "4.57.3",
|
| 226 |
+
"use_cache": true,
|
| 227 |
+
"use_l2warp": false,
|
| 228 |
+
"use_norm": true,
|
| 229 |
+
"use_output_gate": true,
|
| 230 |
+
"use_rope": false,
|
| 231 |
+
"use_short_conv": false,
|
| 232 |
+
"vocab_size": 151936
|
| 233 |
+
}
|
| 234 |
+
[39m
|
| 235 |
+
[titan] 2026-01-06 20:23:31,129 - root - INFO - [34m
|
| 236 |
+
GSAForCausalLM(
|
| 237 |
+
(model): GSAModel(
|
| 238 |
+
(embeddings): Embedding(151936, 5120)
|
| 239 |
+
(layers): ModuleList(
|
| 240 |
+
(0-39): 40 x GSABlock(
|
| 241 |
+
(attn_norm): RMSNorm(5120, eps=1e-06)
|
| 242 |
+
(attn): GatedSlotAttention(
|
| 243 |
+
(feature_map): SwishFeatureMap()
|
| 244 |
+
(q_proj): Linear(in_features=5120, out_features=5120, bias=False)
|
| 245 |
+
(k_proj): Linear(in_features=5120, out_features=1024, bias=False)
|
| 246 |
+
(v_proj): Linear(in_features=5120, out_features=1024, bias=False)
|
| 247 |
+
(f_proj): Linear(in_features=5120, out_features=2048, bias=False)
|
| 248 |
+
(g_norm): RMSNorm(5120, elementwise_affine=False, eps=1e-06)
|
| 249 |
+
(o_proj): Linear(in_features=5120, out_features=5120, bias=False)
|
| 250 |
+
)
|
| 251 |
+
(mlp_norm): RMSNorm(5120, eps=1e-06)
|
| 252 |
+
(mlp): GatedMLP(
|
| 253 |
+
(gate_proj): Linear(in_features=5120, out_features=17408, bias=False)
|
| 254 |
+
(up_proj): Linear(in_features=5120, out_features=17408, bias=False)
|
| 255 |
+
(down_proj): Linear(in_features=17408, out_features=5120, bias=False)
|
| 256 |
+
(swiglu_linear): SwiGLULinear()
|
| 257 |
+
)
|
| 258 |
+
)
|
| 259 |
+
)
|
| 260 |
+
(norm): RMSNorm(5120, eps=1e-06)
|
| 261 |
+
)
|
| 262 |
+
(lm_head): Linear(in_features=5120, out_features=151936, bias=False)
|
| 263 |
+
)[39m
|
| 264 |
+
|
| 265 |
+
[titan] 2026-01-06 20:23:31,186 - root - INFO - Compiling each block with torch.compile
|
| 266 |
+
[titan] 2026-01-06 20:23:31,186 - root - INFO - Compiling the embedding, norm, and lm_head layers with torch.compile
|
| 267 |
+
[titan] 2026-01-06 20:23:31,187 - root - INFO - Compiling the entire model with torch.compile
|
| 268 |
+
[titan] 2026-01-06 20:23:31,335 - root - INFO - Applied FSDP to the model
|
| 269 |
+
[titan] 2026-01-06 20:23:31,717 - root - INFO - CUDA memory usage for model: 3.56GiB(4.49%)
|
| 270 |
+
[titan] 2026-01-06 20:23:31,764 - root - INFO - Checkpointing active. Checkpoints will be loaded from and saved to /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/checkpoint
|
| 271 |
+
[titan] 2026-01-06 20:23:31,764 - root - INFO - Loading the checkpoint from /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/checkpoint/step-1.
|
| 272 |
+
[titan] 2026-01-06 20:24:20,447 - root - INFO - [GC] GC collection for checkpoint loading. 0.02 seconds.
|
| 273 |
+
[titan] 2026-01-06 20:24:20,447 - root - INFO - Finished loading the checkpoint in 48.68 seconds.
|
| 274 |
+
[titan] 2026-01-06 20:24:20,634 - root - INFO - CUDA capacity: NVIDIA A100-SXM4-80GB with 79.25GiB memory
|
| 275 |
+
[titan] 2026-01-06 20:24:20,637 - root - WARNING - Error running lspci: [Errno 2] No such file or directory: 'lspci', fallback to use device_name
|
| 276 |
+
[titan] 2026-01-06 20:24:23,307 - root - INFO - Mixed precision training is handled by fully_shard
|
| 277 |
+
[titan] 2026-01-06 20:24:23,307 - root - INFO - [31m***** Running training *****[39m
|
| 278 |
+
[titan] 2026-01-06 20:24:23,307 - root - INFO - [32m Training starts at step 2
|
| 279 |
+
[titan] 2026-01-06 20:24:23,307 - root - INFO - [32m Number of tokens per sequence = 2,048
|
| 280 |
+
[titan] 2026-01-06 20:24:23,307 - root - INFO - [32m Gradient Accumulation steps = 16
|
| 281 |
+
[titan] 2026-01-06 20:24:23,307 - root - INFO - [32m Instantaneous batch size (per device) = 2
|
| 282 |
+
[titan] 2026-01-06 20:24:23,307 - root - INFO - [32m Global batch size (w. parallel, distributed & accumulation) = 256 (524,288 tokens)
|
| 283 |
+
[titan] 2026-01-06 20:24:23,307 - root - INFO - [32m Total optimization steps = 3,072 (1,610,612,736 tokens)
|
| 284 |
+
[titan] 2026-01-06 20:24:23,308 - root - INFO - [32m Warmup steps = 1,024 (536,870,912 tokens)
|
| 285 |
+
[titan] 2026-01-06 20:24:23,308 - root - INFO - [32m Number of parameters = 14,409,815,040 [39m
|
| 286 |
+
[titan] 2026-01-06 20:24:23,308 - root - INFO - Profiling active. Traces will be saved at /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/profile_trace
|
| 287 |
+
/mnt/scratch/home/carlos/leman-labs/flame/.venv/lib/python3.13/site-packages/torch/_dynamo/variables/functions.py:1692: UserWarning: Dynamo detected a call to a `functools.lru_cache`-wrapped function. Dynamo ignores the cache wrapper and directly traces the wrapped function. Silent incorrectness is only a *potential* risk, not something we have observed. Enable TORCH_LOGS="+dynamo" for a DEBUG stack trace.
|
| 288 |
+
torch._dynamo.utils.warn_once(msg)
|
| 289 |
+
/mnt/scratch/home/carlos/leman-labs/flame/.venv/lib/python3.13/site-packages/torch/_dynamo/variables/functions.py:1598: UserWarning: Dynamo does not know how to trace the builtin `cuda_utils.get_device_properties.` This function is either a Python builtin (e.g. _warnings.warn) or a third-party C/C++ Python extension (perhaps created with pybind).
|
| 290 |
+
If it is a Python builtin, please file an issue on GitHub so the PyTorch team can add support for it and see the next case for a workaround.
|
| 291 |
+
If it is a third-party C/C++ Python extension, please either wrap it into a PyTorch-understood custom operator (see https://pytorch.org/tutorials/advanced/custom_ops_landing_page.html for more details) or, if it is traceable, use `torch.compiler.allow_in_graph`.
|
| 292 |
+
torch._dynamo.utils.warn_once(explanation + "\n" + "\n".join(hints))
|
| 293 |
+
[titan] 2026-01-06 20:31:17,558 - root - INFO - [31mstep: 2 [32mloss: 14.3989 [33mmemory: 71.94GiB(90.77%) [34mtps: 157 [36mtflops: 14.38 [35mmfu: 4.61%[39m
|
| 294 |
+
[titan] 2026-01-06 20:31:17,558 - root - INFO - [34mlr: 1.1719e-06 gnorm: 127.00 [35m[ 0:14:01<14 days, 22:50:11][39m
|
| 295 |
+
[titan] 2026-01-06 20:31:58,854 - root - INFO - [31mstep: 3 [32mloss: 14.3925 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,587 [36mtflops: 145.20 [35mmfu: 46.54%[39m
|
| 296 |
+
[titan] 2026-01-06 20:31:58,854 - root - INFO - [34mlr: 1.5625e-06 gnorm: 126.00 [35m[ 0:14:42<10 days, 10:52:52][39m
|
| 297 |
+
[titan] 2026-01-06 20:32:40,204 - root - INFO - [31mstep: 4 [32mloss: 14.2932 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,585 [36mtflops: 145.01 [35mmfu: 46.48%[39m
|
| 298 |
+
[titan] 2026-01-06 20:32:40,205 - root - INFO - [34mlr: 1.9531e-06 gnorm: 125.50 [35m[ 0:15:24<8 days, 4:54:34][39m
|
| 299 |
+
[titan] 2026-01-06 20:33:21,589 - root - INFO - [31mstep: 5 [32mloss: 14.2679 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,584 [36mtflops: 144.89 [35mmfu: 46.44%[39m
|
| 300 |
+
[titan] 2026-01-06 20:33:21,589 - root - INFO - [34mlr: 2.3438e-06 gnorm: 123.50 [35m[ 0:16:05<6 days, 20:31:39][39m
|
| 301 |
+
[titan] 2026-01-06 20:34:03,035 - root - INFO - [31mstep: 6 [32mloss: 13.9921 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,581 [36mtflops: 144.67 [35mmfu: 46.37%[39m
|
| 302 |
+
[titan] 2026-01-06 20:34:03,035 - root - INFO - [34mlr: 2.7344e-06 gnorm: 117.50 [35m[ 0:16:47<5 days, 22:56:40][39m
|
| 303 |
+
[titan] 2026-01-06 20:34:44,524 - root - INFO - [31mstep: 7 [32mloss: 13.8102 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,580 [36mtflops: 144.52 [35mmfu: 46.32%[39m
|
| 304 |
+
[titan] 2026-01-06 20:34:44,524 - root - INFO - [34mlr: 3.1250e-06 gnorm: 112.50 [35m[ 0:17:28<5 days, 7:31:48][39m
|
| 305 |
+
[titan] 2026-01-06 20:35:25,989 - root - INFO - [31mstep: 8 [32mloss: 13.5609 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,581 [36mtflops: 144.60 [35mmfu: 46.35%[39m
|
| 306 |
+
[titan] 2026-01-06 20:35:25,990 - root - INFO - [34mlr: 3.5156e-06 gnorm: 106.50 [35m[ 0:18:10<4 days, 19:57:50][39m
|
| 307 |
+
[titan] 2026-01-06 20:36:07,480 - root - INFO - [31mstep: 9 [32mloss: 13.3683 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,580 [36mtflops: 144.52 [35mmfu: 46.32%[39m
|
| 308 |
+
[titan] 2026-01-06 20:36:07,480 - root - INFO - [34mlr: 3.9063e-06 gnorm: 101.00 [35m[ 0:18:51<4 days, 10:58:04][39m
|
| 309 |
+
[titan] 2026-01-06 20:36:48,975 - root - INFO - [31mstep: 10 [32mloss: 13.1018 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.50 [35mmfu: 46.32%[39m
|
| 310 |
+
[titan] 2026-01-06 20:36:48,975 - root - INFO - [34mlr: 4.2969e-06 gnorm: 94.00 [35m[ 0:19:32<4 days, 3:46:07][39m
|
| 311 |
+
[titan] 2026-01-06 20:37:30,471 - root - INFO - [31mstep: 11 [32mloss: 12.5407 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.50 [35mmfu: 46.31%[39m
|
| 312 |
+
[titan] 2026-01-06 20:37:30,471 - root - INFO - [34mlr: 4.6875e-06 gnorm: 82.00 [35m[ 0:20:14<3 days, 21:52:36][39m
|
| 313 |
+
[titan] 2026-01-06 20:38:11,960 - root - INFO - [31mstep: 12 [32mloss: 12.0106 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,580 [36mtflops: 144.52 [35mmfu: 46.32%[39m
|
| 314 |
+
[titan] 2026-01-06 20:38:11,960 - root - INFO - [34mlr: 5.0781e-06 gnorm: 71.50 [35m[ 0:20:55<3 days, 16:57:52][39m
|
| 315 |
+
[titan] 2026-01-06 20:38:53,462 - root - INFO - [31mstep: 13 [32mloss: 11.5957 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.48 [35mmfu: 46.31%[39m
|
| 316 |
+
[titan] 2026-01-06 20:38:53,463 - root - INFO - [34mlr: 5.4687e-06 gnorm: 68.00 [35m[ 0:21:37<3 days, 12:48:25][39m
|
| 317 |
+
[titan] 2026-01-06 20:39:34,955 - root - INFO - [31mstep: 14 [32mloss: 11.2380 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.51 [35mmfu: 46.32%[39m
|
| 318 |
+
[titan] 2026-01-06 20:39:34,955 - root - INFO - [34mlr: 5.8594e-06 gnorm: 63.25 [35m[ 0:22:18<3 days, 9:14:27][39m
|
| 319 |
+
[titan] 2026-01-06 20:40:16,456 - root - INFO - [31mstep: 15 [32mloss: 10.9153 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.48 [35mmfu: 46.31%[39m
|
| 320 |
+
[titan] 2026-01-06 20:40:16,457 - root - INFO - [34mlr: 6.2500e-06 gnorm: 55.50 [35m[ 0:23:00<3 days, 6:08:58][39m
|
| 321 |
+
[titan] 2026-01-06 20:40:57,973 - root - INFO - [31mstep: 16 [32mloss: 10.6864 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.42 [35mmfu: 46.29%[39m
|
| 322 |
+
[titan] 2026-01-06 20:40:57,974 - root - INFO - [34mlr: 6.6406e-06 gnorm: 57.00 [35m[ 0:23:41<3 days, 3:26:38][39m
|
| 323 |
+
[titan] 2026-01-06 20:40:57,974 - root - INFO - Saving the checkpoint (or staging if async is enabled).
|
| 324 |
+
[titan] 2026-01-06 20:41:27,025 - root - INFO - [GC] GC collection invoked by checkpointer. 0.58 seconds.
|
| 325 |
+
[titan] 2026-01-06 20:41:27,025 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 29.05 seconds.
|
| 326 |
+
[titan] 2026-01-06 20:42:08,985 - root - INFO - [31mstep: 17 [32mloss: 10.3828 [33mmemory: 71.94GiB(90.77%) [34mtps: 923 [36mtflops: 84.44 [35mmfu: 27.06%[39m
|
| 327 |
+
[titan] 2026-01-06 20:42:08,986 - root - INFO - [34mlr: 7.0313e-06 gnorm: 42.50 [35m[ 0:24:52<3 days, 2:31:39][39m
|
| 328 |
+
[titan] 2026-01-06 20:42:50,422 - root - INFO - [31mstep: 18 [32mloss: 10.1659 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,582 [36mtflops: 144.71 [35mmfu: 46.38%[39m
|
| 329 |
+
[titan] 2026-01-06 20:42:50,422 - root - INFO - [34mlr: 7.4219e-06 gnorm: 32.50 [35m[ 0:25:34<3 days, 0:19:01][39m
|
| 330 |
+
[titan] 2026-01-06 20:43:31,924 - root - INFO - [31mstep: 19 [32mloss: 9.9749 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.47 [35mmfu: 46.31%[39m
|
| 331 |
+
[titan] 2026-01-06 20:43:31,925 - root - INFO - [34mlr: 7.8125e-06 gnorm: 26.88 [35m[ 0:26:15<2 days, 22:20:27][39m
|
| 332 |
+
[titan] 2026-01-06 20:44:13,451 - root - INFO - [31mstep: 20 [32mloss: 9.8084 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.39 [35mmfu: 46.28%[39m
|
| 333 |
+
[titan] 2026-01-06 20:44:13,451 - root - INFO - [34mlr: 8.2031e-06 gnorm: 25.62 [35m[ 0:26:57<2 days, 20:33:44][39m
|
| 334 |
+
[titan] 2026-01-06 20:44:54,968 - root - INFO - [31mstep: 21 [32mloss: 9.6201 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.43 [35mmfu: 46.29%[39m
|
| 335 |
+
[titan] 2026-01-06 20:44:54,968 - root - INFO - [34mlr: 8.5938e-06 gnorm: 26.88 [35m[ 0:27:38<2 days, 18:57:05][39m
|
| 336 |
+
[titan] 2026-01-06 20:45:36,491 - root - INFO - [31mstep: 22 [32mloss: 9.4905 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.40 [35mmfu: 46.28%[39m
|
| 337 |
+
[titan] 2026-01-06 20:45:36,491 - root - INFO - [34mlr: 8.9844e-06 gnorm: 25.50 [35m[ 0:28:20<2 days, 17:29:10][39m
|
| 338 |
+
[titan] 2026-01-06 20:46:18,035 - root - INFO - [31mstep: 23 [32mloss: 9.2526 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.33 [35mmfu: 46.26%[39m
|
| 339 |
+
[titan] 2026-01-06 20:46:18,035 - root - INFO - [34mlr: 9.3750e-06 gnorm: 19.12 [35m[ 0:29:02<2 days, 16:08:53][39m
|
| 340 |
+
[titan] 2026-01-06 20:46:59,563 - root - INFO - [31mstep: 24 [32mloss: 9.0528 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.39 [35mmfu: 46.28%[39m
|
| 341 |
+
[titan] 2026-01-06 20:46:59,563 - root - INFO - [34mlr: 9.7656e-06 gnorm: 17.00 [35m[ 0:29:43<2 days, 14:55:13][39m
|
| 342 |
+
[titan] 2026-01-06 20:47:41,099 - root - INFO - [31mstep: 25 [32mloss: 8.8601 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.36 [35mmfu: 46.27%[39m
|
| 343 |
+
[titan] 2026-01-06 20:47:41,099 - root - INFO - [34mlr: 1.0156e-05 gnorm: 14.06 [35m[ 0:30:25<2 days, 13:47:23][39m
|
| 344 |
+
[titan] 2026-01-06 20:48:22,630 - root - INFO - [31mstep: 26 [32mloss: 8.7360 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.38 [35mmfu: 46.27%[39m
|
| 345 |
+
[titan] 2026-01-06 20:48:22,630 - root - INFO - [34mlr: 1.0547e-05 gnorm: 15.44 [35m[ 0:31:06<2 days, 12:44:43][39m
|
| 346 |
+
[titan] 2026-01-06 20:49:04,178 - root - INFO - [31mstep: 27 [32mloss: 8.6182 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.31 [35mmfu: 46.25%[39m
|
| 347 |
+
[titan] 2026-01-06 20:49:04,179 - root - INFO - [34mlr: 1.0937e-05 gnorm: 10.25 [35m[ 0:31:48<2 days, 11:46:40][39m
|
| 348 |
+
[titan] 2026-01-06 20:49:45,725 - root - INFO - [31mstep: 28 [32mloss: 8.5142 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.32 [35mmfu: 46.26%[39m
|
| 349 |
+
[titan] 2026-01-06 20:49:45,725 - root - INFO - [34mlr: 1.1328e-05 gnorm: 9.00 [35m[ 0:32:29<2 days, 10:52:43][39m
|
| 350 |
+
[titan] 2026-01-06 20:50:27,274 - root - INFO - [31mstep: 29 [32mloss: 8.4770 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.31 [35mmfu: 46.25%[39m
|
| 351 |
+
[titan] 2026-01-06 20:50:27,274 - root - INFO - [34mlr: 1.1719e-05 gnorm: 9.44 [35m[ 0:33:11<2 days, 10:02:26][39m
|
| 352 |
+
[titan] 2026-01-06 20:51:08,813 - root - INFO - [31mstep: 30 [32mloss: 8.3888 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.35 [35mmfu: 46.27%[39m
|
| 353 |
+
[titan] 2026-01-06 20:51:08,813 - root - INFO - [34mlr: 1.2109e-05 gnorm: 7.06 [35m[ 0:33:52<2 days, 9:15:27][39m
|
| 354 |
+
[titan] 2026-01-06 20:51:50,370 - root - INFO - [31mstep: 31 [32mloss: 8.3098 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.29 [35mmfu: 46.25%[39m
|
| 355 |
+
[titan] 2026-01-06 20:51:50,370 - root - INFO - [34mlr: 1.2500e-05 gnorm: 5.38 [35m[ 0:34:34<2 days, 8:31:29][39m
|
| 356 |
+
[titan] 2026-01-06 20:52:31,910 - root - INFO - [31mstep: 32 [32mloss: 8.2507 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.35 [35mmfu: 46.26%[39m
|
| 357 |
+
[titan] 2026-01-06 20:52:31,910 - root - INFO - [34mlr: 1.2891e-05 gnorm: 6.97 [35m[ 0:35:15<2 days, 7:50:11][39m
|
| 358 |
+
[titan] 2026-01-06 20:52:31,910 - root - INFO - Saving the checkpoint (or staging if async is enabled).
|
| 359 |
+
[titan] 2026-01-06 20:52:52,193 - root - INFO - [GC] GC collection invoked by checkpointer. 0.20 seconds.
|
| 360 |
+
[titan] 2026-01-06 20:52:52,193 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 20.28 seconds.
|
| 361 |
+
[titan] 2026-01-06 20:53:33,590 - root - INFO - [31mstep: 33 [32mloss: 8.1782 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,063 [36mtflops: 97.21 [35mmfu: 31.16%[39m
|
| 362 |
+
[titan] 2026-01-06 20:53:33,591 - root - INFO - [34mlr: 1.3281e-05 gnorm: 4.94 [35m[ 0:36:17<2 days, 7:42:16][39m
|
| 363 |
+
[titan] 2026-01-06 20:54:15,059 - root - INFO - [31mstep: 34 [32mloss: 8.1399 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,580 [36mtflops: 144.59 [35mmfu: 46.34%[39m
|
| 364 |
+
[titan] 2026-01-06 20:54:15,059 - root - INFO - [34mlr: 1.3672e-05 gnorm: 4.62 [35m[ 0:36:59<2 days, 7:04:39][39m
|
| 365 |
+
[titan] 2026-01-06 20:54:56,546 - root - INFO - [31mstep: 35 [32mloss: 8.1046 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,580 [36mtflops: 144.53 [35mmfu: 46.32%[39m
|
| 366 |
+
[titan] 2026-01-06 20:54:56,546 - root - INFO - [34mlr: 1.4063e-05 gnorm: 4.69 [35m[ 0:37:40<2 days, 6:29:10][39m
|
| 367 |
+
[titan] 2026-01-06 20:55:38,070 - root - INFO - [31mstep: 36 [32mloss: 8.0122 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.40 [35mmfu: 46.28%[39m
|
| 368 |
+
[titan] 2026-01-06 20:55:38,070 - root - INFO - [34mlr: 1.4453e-05 gnorm: 2.75 [35m[ 0:38:22<2 days, 5:55:41][39m
|
| 369 |
+
[titan] 2026-01-06 20:56:19,603 - root - INFO - [31mstep: 37 [32mloss: 8.0874 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 370 |
+
[titan] 2026-01-06 20:56:19,603 - root - INFO - [34mlr: 1.4844e-05 gnorm: 4.84 [35m[ 0:39:03<2 days, 5:23:58][39m
|
| 371 |
+
[titan] 2026-01-06 20:57:01,138 - root - INFO - [31mstep: 38 [32mloss: 8.0173 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 372 |
+
[titan] 2026-01-06 20:57:01,138 - root - INFO - [34mlr: 1.5234e-05 gnorm: 3.98 [35m[ 0:39:45<2 days, 4:53:54][39m
|
| 373 |
+
[titan] 2026-01-06 20:57:42,670 - root - INFO - [31mstep: 39 [32mloss: 8.0002 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 374 |
+
[titan] 2026-01-06 20:57:42,671 - root - INFO - [34mlr: 1.5625e-05 gnorm: 3.81 [35m[ 0:40:26<2 days, 4:25:20][39m
|
| 375 |
+
[titan] 2026-01-06 20:58:24,204 - root - INFO - [31mstep: 40 [32mloss: 7.9606 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 376 |
+
[titan] 2026-01-06 20:58:24,204 - root - INFO - [34mlr: 1.6016e-05 gnorm: 2.86 [35m[ 0:41:08<2 days, 3:58:09][39m
|
| 377 |
+
[titan] 2026-01-06 20:59:05,739 - root - INFO - [31mstep: 41 [32mloss: 7.9773 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.36 [35mmfu: 46.27%[39m
|
| 378 |
+
[titan] 2026-01-06 20:59:05,739 - root - INFO - [34mlr: 1.6406e-05 gnorm: 3.56 [35m[ 0:41:49<2 days, 3:32:16][39m
|
| 379 |
+
[titan] 2026-01-06 20:59:47,256 - root - INFO - [31mstep: 42 [32mloss: 7.9890 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.43 [35mmfu: 46.29%[39m
|
| 380 |
+
[titan] 2026-01-06 20:59:47,256 - root - INFO - [34mlr: 1.6797e-05 gnorm: 4.75 [35m[ 0:42:31<2 days, 3:07:34][39m
|
| 381 |
+
[titan] 2026-01-06 21:00:28,788 - root - INFO - [31mstep: 43 [32mloss: 7.9018 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 382 |
+
[titan] 2026-01-06 21:00:28,789 - root - INFO - [34mlr: 1.7188e-05 gnorm: 3.48 [35m[ 0:43:12<2 days, 2:44:00][39m
|
| 383 |
+
[titan] 2026-01-06 21:01:10,328 - root - INFO - [31mstep: 44 [32mloss: 7.8441 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.35 [35mmfu: 46.26%[39m
|
| 384 |
+
[titan] 2026-01-06 21:01:10,328 - root - INFO - [34mlr: 1.7578e-05 gnorm: 3.89 [35m[ 0:43:54<2 days, 2:21:29][39m
|
| 385 |
+
[titan] 2026-01-06 21:01:51,869 - root - INFO - [31mstep: 45 [32mloss: 7.8679 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.34 [35mmfu: 46.26%[39m
|
| 386 |
+
[titan] 2026-01-06 21:01:51,869 - root - INFO - [34mlr: 1.7969e-05 gnorm: 6.41 [35m[ 0:44:35<2 days, 1:59:56][39m
|
| 387 |
+
[titan] 2026-01-06 21:02:33,408 - root - INFO - [31mstep: 46 [32mloss: 7.7830 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.35 [35mmfu: 46.26%[39m
|
| 388 |
+
[titan] 2026-01-06 21:02:33,408 - root - INFO - [34mlr: 1.8359e-05 gnorm: 3.52 [35m[ 0:45:17<2 days, 1:39:17][39m
|
| 389 |
+
[titan] 2026-01-06 21:03:14,961 - root - INFO - [31mstep: 47 [32mloss: 7.8372 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.30 [35mmfu: 46.25%[39m
|
| 390 |
+
[titan] 2026-01-06 21:03:14,961 - root - INFO - [34mlr: 1.8750e-05 gnorm: 2.22 [35m[ 0:45:58<2 days, 1:19:30][39m
|
| 391 |
+
[titan] 2026-01-06 21:03:56,497 - root - INFO - [31mstep: 48 [32mloss: 7.8147 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.36 [35mmfu: 46.27%[39m
|
| 392 |
+
[titan] 2026-01-06 21:03:56,497 - root - INFO - [34mlr: 1.9141e-05 gnorm: 3.70 [35m[ 0:46:40<2 days, 1:00:30][39m
|
| 393 |
+
[titan] 2026-01-06 21:03:56,497 - root - INFO - Saving the checkpoint (or staging if async is enabled).
|
| 394 |
+
[titan] 2026-01-06 21:04:16,564 - root - INFO - [GC] GC collection invoked by checkpointer. 0.18 seconds.
|
| 395 |
+
[titan] 2026-01-06 21:04:16,565 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 20.07 seconds.
|
| 396 |
+
[titan] 2026-01-06 21:04:57,970 - root - INFO - [31mstep: 49 [32mloss: 7.6970 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,066 [36mtflops: 97.54 [35mmfu: 31.26%[39m
|
| 397 |
+
[titan] 2026-01-06 21:04:57,970 - root - INFO - [34mlr: 1.9531e-05 gnorm: 5.28 [35m[ 0:47:41<2 days, 1:02:45][39m
|
| 398 |
+
[titan] 2026-01-06 21:04:57,982 - root - INFO - [GC] Peforming periodical GC collection. 0.01 seconds.
|
| 399 |
+
[titan] 2026-01-06 21:05:39,421 - root - INFO - [31mstep: 50 [32mloss: 7.7536 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,581 [36mtflops: 144.65 [35mmfu: 46.36%[39m
|
| 400 |
+
[titan] 2026-01-06 21:05:39,422 - root - INFO - [34mlr: 1.9922e-05 gnorm: 4.06 [35m[ 0:48:23<2 days, 0:44:42][39m
|
| 401 |
+
[titan] 2026-01-06 21:06:20,891 - root - INFO - [31mstep: 51 [32mloss: 7.7578 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,580 [36mtflops: 144.59 [35mmfu: 46.34%[39m
|
| 402 |
+
[titan] 2026-01-06 21:06:20,891 - root - INFO - [34mlr: 2.0313e-05 gnorm: 5.03 [35m[ 0:49:04<2 days, 0:27:20][39m
|
| 403 |
+
[titan] 2026-01-06 21:07:02,402 - root - INFO - [31mstep: 52 [32mloss: 7.7586 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.45 [35mmfu: 46.30%[39m
|
| 404 |
+
[titan] 2026-01-06 21:07:02,402 - root - INFO - [34mlr: 2.0703e-05 gnorm: 2.52 [35m[ 0:49:46<2 days, 0:10:40][39m
|
| 405 |
+
[titan] 2026-01-06 21:07:43,930 - root - INFO - [31mstep: 53 [32mloss: 7.7823 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.38 [35mmfu: 46.28%[39m
|
| 406 |
+
[titan] 2026-01-06 21:07:43,930 - root - INFO - [34mlr: 2.1094e-05 gnorm: 11.69 [35m[ 0:50:27<1 day, 23:54:37][39m
|
| 407 |
+
[titan] 2026-01-06 21:08:25,460 - root - INFO - [31mstep: 54 [32mloss: 7.7454 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.38 [35mmfu: 46.28%[39m
|
| 408 |
+
[titan] 2026-01-06 21:08:25,460 - root - INFO - [34mlr: 2.1484e-05 gnorm: 10.25 [35m[ 0:51:09<1 day, 23:39:08][39m
|
| 409 |
+
[titan] 2026-01-06 21:09:07,002 - root - INFO - [31mstep: 55 [32mloss: 7.6959 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.34 [35mmfu: 46.26%[39m
|
| 410 |
+
[titan] 2026-01-06 21:09:07,002 - root - INFO - [34mlr: 2.1875e-05 gnorm: 3.77 [35m[ 0:51:50<1 day, 23:24:12][39m
|
| 411 |
+
[titan] 2026-01-06 21:09:48,535 - root - INFO - [31mstep: 56 [32mloss: 7.7100 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 412 |
+
[titan] 2026-01-06 21:09:48,536 - root - INFO - [34mlr: 2.2266e-05 gnorm: 5.50 [35m[ 0:52:32<1 day, 23:09:45][39m
|
| 413 |
+
[titan] 2026-01-06 21:10:30,084 - root - INFO - [31mstep: 57 [32mloss: 7.6427 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.32 [35mmfu: 46.26%[39m
|
| 414 |
+
[titan] 2026-01-06 21:10:30,084 - root - INFO - [34mlr: 2.2656e-05 gnorm: 3.45 [35m[ 0:53:14<1 day, 22:55:49][39m
|
| 415 |
+
[titan] 2026-01-06 21:11:11,627 - root - INFO - [31mstep: 58 [32mloss: 7.7081 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.33 [35mmfu: 46.26%[39m
|
| 416 |
+
[titan] 2026-01-06 21:11:11,628 - root - INFO - [34mlr: 2.3047e-05 gnorm: 7.88 [35m[ 0:53:55<1 day, 22:42:20][39m
|
| 417 |
+
[titan] 2026-01-06 21:11:53,169 - root - INFO - [31mstep: 59 [32mloss: 7.6955 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.34 [35mmfu: 46.26%[39m
|
| 418 |
+
[titan] 2026-01-06 21:11:53,169 - root - INFO - [34mlr: 2.3438e-05 gnorm: 7.16 [35m[ 0:54:37<1 day, 22:29:17][39m
|
| 419 |
+
[titan] 2026-01-06 21:12:34,708 - root - INFO - [31mstep: 60 [32mloss: 7.6458 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.35 [35mmfu: 46.27%[39m
|
| 420 |
+
[titan] 2026-01-06 21:12:34,708 - root - INFO - [34mlr: 2.3828e-05 gnorm: 3.22 [35m[ 0:55:18<1 day, 22:16:38][39m
|
| 421 |
+
[titan] 2026-01-06 21:13:16,244 - root - INFO - [31mstep: 61 [32mloss: 7.6709 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.36 [35mmfu: 46.27%[39m
|
| 422 |
+
[titan] 2026-01-06 21:13:16,244 - root - INFO - [34mlr: 2.4219e-05 gnorm: 7.56 [35m[ 0:56:00<1 day, 22:04:23][39m
|
| 423 |
+
[titan] 2026-01-06 21:13:57,793 - root - INFO - [31mstep: 62 [32mloss: 7.6777 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.31 [35mmfu: 46.25%[39m
|
| 424 |
+
[titan] 2026-01-06 21:13:57,793 - root - INFO - [34mlr: 2.4609e-05 gnorm: 5.00 [35m[ 0:56:41<1 day, 21:52:30][39m
|
| 425 |
+
[titan] 2026-01-06 21:14:39,339 - root - INFO - [31mstep: 63 [32mloss: 7.6421 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.32 [35mmfu: 46.26%[39m
|
| 426 |
+
[titan] 2026-01-06 21:14:39,340 - root - INFO - [34mlr: 2.5000e-05 gnorm: 6.81 [35m[ 0:57:23<1 day, 21:40:59][39m
|
| 427 |
+
[titan] 2026-01-06 21:15:20,872 - root - INFO - [31mstep: 64 [32mloss: 7.6401 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 428 |
+
[titan] 2026-01-06 21:15:20,872 - root - INFO - [34mlr: 2.5391e-05 gnorm: 6.72 [35m[ 0:58:04<1 day, 21:29:48][39m
|
| 429 |
+
[titan] 2026-01-06 21:15:20,872 - root - INFO - Saving the checkpoint (or staging if async is enabled).
|
| 430 |
+
[titan] 2026-01-06 21:15:41,933 - root - INFO - [GC] GC collection invoked by checkpointer. 0.17 seconds.
|
| 431 |
+
[titan] 2026-01-06 21:15:41,934 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 21.06 seconds.
|
| 432 |
+
[titan] 2026-01-06 21:16:23,249 - root - INFO - [31mstep: 65 [32mloss: 7.6475 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,051 [36mtflops: 96.13 [35mmfu: 30.81%[39m
|
| 433 |
+
[titan] 2026-01-06 21:16:23,249 - root - INFO - [34mlr: 2.5781e-05 gnorm: 5.00 [35m[ 0:59:07<1 day, 21:35:00][39m
|
| 434 |
+
[titan] 2026-01-06 21:17:04,689 - root - INFO - [31mstep: 66 [32mloss: 7.7008 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,581 [36mtflops: 144.69 [35mmfu: 46.38%[39m
|
| 435 |
+
[titan] 2026-01-06 21:17:04,689 - root - INFO - [34mlr: 2.6172e-05 gnorm: 9.69 [35m[ 0:59:48<1 day, 21:24:07][39m
|
| 436 |
+
[titan] 2026-01-06 21:17:46,152 - root - INFO - [31mstep: 67 [32mloss: 7.6772 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,581 [36mtflops: 144.61 [35mmfu: 46.35%[39m
|
| 437 |
+
[titan] 2026-01-06 21:17:46,153 - root - INFO - [34mlr: 2.6563e-05 gnorm: 8.06 [35m[ 1:00:30<1 day, 21:13:34][39m
|
| 438 |
+
[titan] 2026-01-06 21:18:27,650 - root - INFO - [31mstep: 68 [32mloss: 7.6251 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.49 [35mmfu: 46.31%[39m
|
| 439 |
+
[titan] 2026-01-06 21:18:27,651 - root - INFO - [34mlr: 2.6953e-05 gnorm: 7.88 [35m[ 1:01:11<1 day, 21:03:19][39m
|
| 440 |
+
[titan] 2026-01-06 21:19:09,166 - root - INFO - [31mstep: 69 [32mloss: 7.6183 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.43 [35mmfu: 46.29%[39m
|
| 441 |
+
[titan] 2026-01-06 21:19:09,166 - root - INFO - [34mlr: 2.7344e-05 gnorm: 4.00 [35m[ 1:01:53<1 day, 20:53:22][39m
|
| 442 |
+
[titan] 2026-01-06 21:19:50,686 - root - INFO - [31mstep: 70 [32mloss: 7.6535 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.41 [35mmfu: 46.29%[39m
|
| 443 |
+
[titan] 2026-01-06 21:19:50,686 - root - INFO - [34mlr: 2.7734e-05 gnorm: 17.75 [35m[ 1:02:34<1 day, 20:43:41][39m
|
| 444 |
+
[titan] 2026-01-06 21:20:32,220 - root - INFO - [31mstep: 71 [32mloss: 7.6713 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 445 |
+
[titan] 2026-01-06 21:20:32,221 - root - INFO - [34mlr: 2.8125e-05 gnorm: 15.69 [35m[ 1:03:16<1 day, 20:34:16][39m
|
| 446 |
+
[titan] 2026-01-06 21:21:13,759 - root - INFO - [31mstep: 72 [32mloss: 7.5969 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.35 [35mmfu: 46.27%[39m
|
| 447 |
+
[titan] 2026-01-06 21:21:13,759 - root - INFO - [34mlr: 2.8516e-05 gnorm: 5.00 [35m[ 1:03:57<1 day, 20:25:05][39m
|
| 448 |
+
[titan] 2026-01-06 21:21:55,296 - root - INFO - [31mstep: 73 [32mloss: 7.6514 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.35 [35mmfu: 46.27%[39m
|
| 449 |
+
[titan] 2026-01-06 21:21:55,296 - root - INFO - [34mlr: 2.8906e-05 gnorm: 7.84 [35m[ 1:04:39<1 day, 20:16:09][39m
|
| 450 |
+
[titan] 2026-01-06 21:22:36,834 - root - INFO - [31mstep: 74 [32mloss: 7.6118 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.35 [35mmfu: 46.27%[39m
|
| 451 |
+
[titan] 2026-01-06 21:22:36,834 - root - INFO - [34mlr: 2.9297e-05 gnorm: 5.53 [35m[ 1:05:20<1 day, 20:07:26][39m
|
| 452 |
+
[titan] 2026-01-06 21:23:18,373 - root - INFO - [31mstep: 75 [32mloss: 7.6545 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.35 [35mmfu: 46.26%[39m
|
| 453 |
+
[titan] 2026-01-06 21:23:18,373 - root - INFO - [34mlr: 2.9687e-05 gnorm: 14.88 [35m[ 1:06:02<1 day, 19:58:55][39m
|
| 454 |
+
[titan] 2026-01-06 21:23:59,909 - root - INFO - [31mstep: 76 [32mloss: 7.6091 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.36 [35mmfu: 46.27%[39m
|
| 455 |
+
[titan] 2026-01-06 21:23:59,909 - root - INFO - [34mlr: 3.0078e-05 gnorm: 15.25 [35m[ 1:06:43<1 day, 19:50:37][39m
|
| 456 |
+
[titan] 2026-01-06 21:24:41,441 - root - INFO - [31mstep: 77 [32mloss: 7.5815 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 457 |
+
[titan] 2026-01-06 21:24:41,442 - root - INFO - [34mlr: 3.0469e-05 gnorm: 4.84 [35m[ 1:07:25<1 day, 19:42:31][39m
|
| 458 |
+
[titan] 2026-01-06 21:25:22,983 - root - INFO - [31mstep: 78 [32mloss: 7.6119 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.34 [35mmfu: 46.26%[39m
|
| 459 |
+
[titan] 2026-01-06 21:25:22,983 - root - INFO - [34mlr: 3.0859e-05 gnorm: 9.06 [35m[ 1:08:06<1 day, 19:34:36][39m
|
| 460 |
+
[titan] 2026-01-06 21:26:04,516 - root - INFO - [31mstep: 79 [32mloss: 7.6418 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 461 |
+
[titan] 2026-01-06 21:26:04,516 - root - INFO - [34mlr: 3.1250e-05 gnorm: 8.25 [35m[ 1:08:48<1 day, 19:26:52][39m
|
| 462 |
+
[titan] 2026-01-06 21:26:46,049 - root - INFO - [31mstep: 80 [32mloss: 7.5575 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 463 |
+
[titan] 2026-01-06 21:26:46,049 - root - INFO - [34mlr: 3.1641e-05 gnorm: 6.97 [35m[ 1:09:30<1 day, 19:19:19][39m
|
| 464 |
+
[titan] 2026-01-06 21:26:46,049 - root - INFO - Saving the checkpoint (or staging if async is enabled).
|
| 465 |
+
[titan] 2026-01-06 21:27:08,364 - root - INFO - [GC] GC collection invoked by checkpointer. 0.23 seconds.
|
| 466 |
+
[titan] 2026-01-06 21:27:08,364 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 22.31 seconds.
|
| 467 |
+
[titan] 2026-01-06 21:27:49,686 - root - INFO - [31mstep: 81 [32mloss: 7.6005 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,030 [36mtflops: 94.22 [35mmfu: 30.20%[39m
|
| 468 |
+
[titan] 2026-01-06 21:27:49,686 - root - INFO - [34mlr: 3.2031e-05 gnorm: 7.19 [35m[ 1:10:33<1 day, 19:25:31][39m
|
| 469 |
+
[titan] 2026-01-06 21:28:31,108 - root - INFO - [31mstep: 82 [32mloss: 7.5774 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,582 [36mtflops: 144.76 [35mmfu: 46.40%[39m
|
| 470 |
+
[titan] 2026-01-06 21:28:31,108 - root - INFO - [34mlr: 3.2422e-05 gnorm: 5.62 [35m[ 1:11:15<1 day, 19:18:04][39m
|
| 471 |
+
[titan] 2026-01-06 21:29:12,555 - root - INFO - [31mstep: 83 [32mloss: 7.6207 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,581 [36mtflops: 144.67 [35mmfu: 46.37%[39m
|
| 472 |
+
[titan] 2026-01-06 21:29:12,555 - root - INFO - [34mlr: 3.2813e-05 gnorm: 4.69 [35m[ 1:11:56<1 day, 19:10:47][39m
|
| 473 |
+
[titan] 2026-01-06 21:29:54,023 - root - INFO - [31mstep: 84 [32mloss: 7.5734 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,580 [36mtflops: 144.59 [35mmfu: 46.34%[39m
|
| 474 |
+
[titan] 2026-01-06 21:29:54,024 - root - INFO - [34mlr: 3.3203e-05 gnorm: 10.75 [35m[ 1:12:38<1 day, 19:03:40][39m
|
| 475 |
+
[titan] 2026-01-06 21:30:35,519 - root - INFO - [31mstep: 85 [32mloss: 7.5241 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.50 [35mmfu: 46.31%[39m
|
| 476 |
+
[titan] 2026-01-06 21:30:35,520 - root - INFO - [34mlr: 3.3594e-05 gnorm: 8.69 [35m[ 1:13:19<1 day, 18:56:43][39m
|
| 477 |
+
[titan] 2026-01-06 21:31:17,030 - root - INFO - [31mstep: 86 [32mloss: 7.5827 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.45 [35mmfu: 46.30%[39m
|
| 478 |
+
[titan] 2026-01-06 21:31:17,030 - root - INFO - [34mlr: 3.3984e-05 gnorm: 7.22 [35m[ 1:14:01<1 day, 18:49:55][39m
|
| 479 |
+
[titan] 2026-01-06 21:31:58,543 - root - INFO - [31mstep: 87 [32mloss: 7.5505 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.44 [35mmfu: 46.29%[39m
|
| 480 |
+
[titan] 2026-01-06 21:31:58,543 - root - INFO - [34mlr: 3.4375e-05 gnorm: 7.91 [35m[ 1:14:42<1 day, 18:43:16][39m
|
| 481 |
+
[titan] 2026-01-06 21:32:40,071 - root - INFO - [31mstep: 88 [32mloss: 7.5143 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.39 [35mmfu: 46.28%[39m
|
| 482 |
+
[titan] 2026-01-06 21:32:40,071 - root - INFO - [34mlr: 3.4766e-05 gnorm: 8.00 [35m[ 1:15:24<1 day, 18:36:46][39m
|
| 483 |
+
[titan] 2026-01-06 21:33:21,599 - root - INFO - [31mstep: 89 [32mloss: 7.5199 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.39 [35mmfu: 46.28%[39m
|
| 484 |
+
[titan] 2026-01-06 21:33:21,599 - root - INFO - [34mlr: 3.5156e-05 gnorm: 8.62 [35m[ 1:16:05<1 day, 18:30:23][39m
|
| 485 |
+
[titan] 2026-01-06 21:34:03,122 - root - INFO - [31mstep: 90 [32mloss: 7.4785 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.40 [35mmfu: 46.28%[39m
|
| 486 |
+
[titan] 2026-01-06 21:34:03,122 - root - INFO - [34mlr: 3.5547e-05 gnorm: 8.12 [35m[ 1:16:47<1 day, 18:24:08][39m
|
| 487 |
+
[titan] 2026-01-06 21:34:44,655 - root - INFO - [31mstep: 91 [32mloss: 7.5003 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 488 |
+
[titan] 2026-01-06 21:34:44,655 - root - INFO - [34mlr: 3.5937e-05 gnorm: 6.97 [35m[ 1:17:28<1 day, 18:18:00][39m
|
| 489 |
+
[titan] 2026-01-06 21:35:26,183 - root - INFO - [31mstep: 92 [32mloss: 7.5113 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.39 [35mmfu: 46.28%[39m
|
| 490 |
+
[titan] 2026-01-06 21:35:26,183 - root - INFO - [34mlr: 3.6328e-05 gnorm: 10.19 [35m[ 1:18:10<1 day, 18:12:00][39m
|
| 491 |
+
[titan] 2026-01-06 21:36:07,712 - root - INFO - [31mstep: 93 [32mloss: 7.4875 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.38 [35mmfu: 46.28%[39m
|
| 492 |
+
[titan] 2026-01-06 21:36:07,712 - root - INFO - [34mlr: 3.6719e-05 gnorm: 4.59 [35m[ 1:18:51<1 day, 18:06:06][39m
|
| 493 |
+
[titan] 2026-01-06 21:36:49,202 - root - INFO - [31mstep: 94 [32mloss: 7.8691 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,580 [36mtflops: 144.52 [35mmfu: 46.32%[39m
|
| 494 |
+
[titan] 2026-01-06 21:36:49,202 - root - INFO - [34mlr: 3.7109e-05 gnorm: 86.50 [35m[ 1:19:33<1 day, 18:00:18][39m
|
| 495 |
+
[titan] 2026-01-06 21:37:30,710 - root - INFO - [31mstep: 95 [32mloss: 7.7993 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.45 [35mmfu: 46.30%[39m
|
| 496 |
+
[titan] 2026-01-06 21:37:30,710 - root - INFO - [34mlr: 3.7500e-05 gnorm: 62.50 [35m[ 1:20:14<1 day, 17:54:36][39m
|
| 497 |
+
[titan] 2026-01-06 21:38:12,247 - root - INFO - [31mstep: 96 [32mloss: 7.6230 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.35 [35mmfu: 46.27%[39m
|
| 498 |
+
[titan] 2026-01-06 21:38:12,248 - root - INFO - [34mlr: 3.7891e-05 gnorm: 17.38 [35m[ 1:20:56<1 day, 17:49:02][39m
|
| 499 |
+
[titan] 2026-01-06 21:38:12,248 - root - INFO - Saving the checkpoint (or staging if async is enabled).
|
| 500 |
+
[titan] 2026-01-06 21:38:32,931 - root - INFO - [GC] GC collection invoked by checkpointer. 0.18 seconds.
|
| 501 |
+
[titan] 2026-01-06 21:38:32,931 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 20.68 seconds.
|
| 502 |
+
[titan] 2026-01-06 21:39:14,269 - root - INFO - [31mstep: 97 [32mloss: 7.5778 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,057 [36mtflops: 96.68 [35mmfu: 30.99%[39m
|
| 503 |
+
[titan] 2026-01-06 21:39:14,269 - root - INFO - [34mlr: 3.8281e-05 gnorm: 17.75 [35m[ 1:21:58<1 day, 17:54:02][39m
|
| 504 |
+
[titan] 2026-01-06 21:39:55,690 - root - INFO - [31mstep: 98 [32mloss: 7.5438 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,582 [36mtflops: 144.76 [35mmfu: 46.40%[39m
|
| 505 |
+
[titan] 2026-01-06 21:39:55,690 - root - INFO - [34mlr: 3.8672e-05 gnorm: 11.75 [35m[ 1:22:39<1 day, 17:48:30][39m
|
| 506 |
+
[titan] 2026-01-06 21:40:37,180 - root - INFO - [31mstep: 99 [32mloss: 7.5091 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,580 [36mtflops: 144.52 [35mmfu: 46.32%[39m
|
| 507 |
+
[titan] 2026-01-06 21:40:37,180 - root - INFO - [34mlr: 3.9063e-05 gnorm: 7.81 [35m[ 1:23:21<1 day, 17:43:06][39m
|
| 508 |
+
[titan] 2026-01-06 21:40:37,204 - root - INFO - [GC] Peforming periodical GC collection. 0.02 seconds.
|
| 509 |
+
[titan] 2026-01-06 21:41:18,706 - root - INFO - [31mstep: 100 [32mloss: 7.4961 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.39 [35mmfu: 46.28%[39m
|
| 510 |
+
[titan] 2026-01-06 21:41:18,706 - root - INFO - [34mlr: 3.9453e-05 gnorm: 7.59 [35m[ 1:24:02<1 day, 17:37:48][39m
|
| 511 |
+
[titan] 2026-01-06 21:42:00,228 - root - INFO - [31mstep: 101 [32mloss: 7.4848 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.41 [35mmfu: 46.28%[39m
|
| 512 |
+
[titan] 2026-01-06 21:42:00,228 - root - INFO - [34mlr: 3.9844e-05 gnorm: 5.97 [35m[ 1:24:44<1 day, 17:32:35][39m
|
| 513 |
+
[titan] 2026-01-06 21:42:41,739 - root - INFO - [31mstep: 102 [32mloss: 7.5118 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.45 [35mmfu: 46.30%[39m
|
| 514 |
+
[titan] 2026-01-06 21:42:41,739 - root - INFO - [34mlr: 4.0234e-05 gnorm: 8.06 [35m[ 1:25:25<1 day, 17:27:28][39m
|
| 515 |
+
[titan] 2026-01-06 21:43:23,264 - root - INFO - [31mstep: 103 [32mloss: 7.4788 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.39 [35mmfu: 46.28%[39m
|
| 516 |
+
[titan] 2026-01-06 21:43:23,265 - root - INFO - [34mlr: 4.0625e-05 gnorm: 10.06 [35m[ 1:26:07<1 day, 17:22:26][39m
|
| 517 |
+
[titan] 2026-01-06 21:44:04,785 - root - INFO - [31mstep: 104 [32mloss: 7.4560 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.41 [35mmfu: 46.29%[39m
|
| 518 |
+
[titan] 2026-01-06 21:44:04,786 - root - INFO - [34mlr: 4.1016e-05 gnorm: 9.50 [35m[ 1:26:48<1 day, 17:17:29][39m
|
| 519 |
+
[titan] 2026-01-06 21:44:46,319 - root - INFO - [31mstep: 105 [32mloss: 7.4534 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 520 |
+
[titan] 2026-01-06 21:44:46,319 - root - INFO - [34mlr: 4.1406e-05 gnorm: 8.44 [35m[ 1:27:30<1 day, 17:12:38][39m
|
| 521 |
+
[titan] 2026-01-06 21:45:27,837 - root - INFO - [31mstep: 106 [32mloss: 7.4770 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.42 [35mmfu: 46.29%[39m
|
| 522 |
+
[titan] 2026-01-06 21:45:27,838 - root - INFO - [34mlr: 4.1797e-05 gnorm: 10.56 [35m[ 1:28:11<1 day, 17:07:50][39m
|
| 523 |
+
[titan] 2026-01-06 21:46:09,374 - root - INFO - [31mstep: 107 [32mloss: 7.4382 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.36 [35mmfu: 46.27%[39m
|
| 524 |
+
[titan] 2026-01-06 21:46:09,374 - root - INFO - [34mlr: 4.2188e-05 gnorm: 13.69 [35m[ 1:28:53<1 day, 17:03:08][39m
|
| 525 |
+
[titan] 2026-01-06 21:46:50,902 - root - INFO - [31mstep: 108 [32mloss: 7.4561 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.39 [35mmfu: 46.28%[39m
|
| 526 |
+
[titan] 2026-01-06 21:46:50,902 - root - INFO - [34mlr: 4.2578e-05 gnorm: 8.69 [35m[ 1:29:34<1 day, 16:58:30][39m
|
| 527 |
+
[titan] 2026-01-06 21:47:32,443 - root - INFO - [31mstep: 109 [32mloss: 7.3967 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.34 [35mmfu: 46.26%[39m
|
| 528 |
+
[titan] 2026-01-06 21:47:32,443 - root - INFO - [34mlr: 4.2969e-05 gnorm: 7.31 [35m[ 1:30:16<1 day, 16:53:56][39m
|
| 529 |
+
[titan] 2026-01-06 21:48:13,976 - root - INFO - [31mstep: 110 [32mloss: 7.4334 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 530 |
+
[titan] 2026-01-06 21:48:13,976 - root - INFO - [34mlr: 4.3359e-05 gnorm: 25.38 [35m[ 1:30:57<1 day, 16:49:27][39m
|
| 531 |
+
[titan] 2026-01-06 21:48:55,511 - root - INFO - [31mstep: 111 [32mloss: 7.4360 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.36 [35mmfu: 46.27%[39m
|
| 532 |
+
[titan] 2026-01-06 21:48:55,511 - root - INFO - [34mlr: 4.3750e-05 gnorm: 10.44 [35m[ 1:31:39<1 day, 16:45:02][39m
|
| 533 |
+
[titan] 2026-01-06 21:49:37,059 - root - INFO - [31mstep: 112 [32mloss: 7.5123 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.32 [35mmfu: 46.26%[39m
|
| 534 |
+
[titan] 2026-01-06 21:49:37,059 - root - INFO - [34mlr: 4.4141e-05 gnorm: 16.88 [35m[ 1:32:21<1 day, 16:40:41][39m
|
| 535 |
+
[titan] 2026-01-06 21:49:37,059 - root - INFO - Saving the checkpoint (or staging if async is enabled).
|
| 536 |
+
[titan] 2026-01-06 21:49:59,585 - root - INFO - [GC] GC collection invoked by checkpointer. 0.15 seconds.
|
| 537 |
+
[titan] 2026-01-06 21:49:59,585 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 22.53 seconds.
|
| 538 |
+
[titan] 2026-01-06 21:50:40,891 - root - INFO - [31mstep: 113 [32mloss: 7.4803 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,027 [36mtflops: 93.93 [35mmfu: 30.11%[39m
|
| 539 |
+
[titan] 2026-01-06 21:50:40,892 - root - INFO - [34mlr: 4.4531e-05 gnorm: 13.06 [35m[ 1:33:24<1 day, 16:46:07][39m
|
| 540 |
+
[titan] 2026-01-06 21:51:22,305 - root - INFO - [31mstep: 114 [32mloss: 7.4859 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,582 [36mtflops: 144.79 [35mmfu: 46.41%[39m
|
| 541 |
+
[titan] 2026-01-06 21:51:22,305 - root - INFO - [34mlr: 4.4922e-05 gnorm: 16.50 [35m[ 1:34:06<1 day, 16:41:45][39m
|
| 542 |
+
[titan] 2026-01-06 21:52:03,747 - root - INFO - [31mstep: 115 [32mloss: 7.4151 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,581 [36mtflops: 144.68 [35mmfu: 46.37%[39m
|
| 543 |
+
[titan] 2026-01-06 21:52:03,748 - root - INFO - [34mlr: 4.5313e-05 gnorm: 13.94 [35m[ 1:34:47<1 day, 16:37:28][39m
|
| 544 |
+
[titan] 2026-01-06 21:52:45,252 - root - INFO - [31mstep: 116 [32mloss: 7.3814 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.47 [35mmfu: 46.30%[39m
|
| 545 |
+
[titan] 2026-01-06 21:52:45,252 - root - INFO - [34mlr: 4.5703e-05 gnorm: 11.69 [35m[ 1:35:29<1 day, 16:33:16][39m
|
| 546 |
+
[titan] 2026-01-06 21:53:26,760 - root - INFO - [31mstep: 117 [32mloss: 7.4033 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.46 [35mmfu: 46.30%[39m
|
| 547 |
+
[titan] 2026-01-06 21:53:26,760 - root - INFO - [34mlr: 4.6094e-05 gnorm: 9.31 [35m[ 1:36:10<1 day, 16:29:07][39m
|
| 548 |
+
[titan] 2026-01-06 21:54:08,279 - root - INFO - [31mstep: 118 [32mloss: 7.4721 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.42 [35mmfu: 46.29%[39m
|
| 549 |
+
[titan] 2026-01-06 21:54:08,279 - root - INFO - [34mlr: 4.6484e-05 gnorm: 20.88 [35m[ 1:36:52<1 day, 16:25:03][39m
|
| 550 |
+
[titan] 2026-01-06 21:54:49,813 - root - INFO - [31mstep: 119 [32mloss: 7.4258 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.36 [35mmfu: 46.27%[39m
|
| 551 |
+
[titan] 2026-01-06 21:54:49,813 - root - INFO - [34mlr: 4.6875e-05 gnorm: 16.62 [35m[ 1:37:33<1 day, 16:21:02][39m
|
| 552 |
+
[titan] 2026-01-06 21:55:31,360 - root - INFO - [31mstep: 120 [32mloss: 7.3951 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.32 [35mmfu: 46.26%[39m
|
| 553 |
+
[titan] 2026-01-06 21:55:31,360 - root - INFO - [34mlr: 4.7266e-05 gnorm: 11.38 [35m[ 1:38:15<1 day, 16:17:04][39m
|
| 554 |
+
[titan] 2026-01-06 21:56:12,904 - root - INFO - [31mstep: 121 [32mloss: 7.3984 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.33 [35mmfu: 46.26%[39m
|
| 555 |
+
[titan] 2026-01-06 21:56:12,904 - root - INFO - [34mlr: 4.7656e-05 gnorm: 10.19 [35m[ 1:38:56<1 day, 16:13:10][39m
|
| 556 |
+
[titan] 2026-01-06 21:56:54,444 - root - INFO - [31mstep: 122 [32mloss: 7.5098 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.34 [35mmfu: 46.26%[39m
|
| 557 |
+
[titan] 2026-01-06 21:56:54,444 - root - INFO - [34mlr: 4.8047e-05 gnorm: 19.38 [35m[ 1:39:38<1 day, 16:09:19][39m
|
| 558 |
+
[titan] 2026-01-06 21:57:35,983 - root - INFO - [31mstep: 123 [32mloss: 7.4071 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.35 [35mmfu: 46.27%[39m
|
| 559 |
+
[titan] 2026-01-06 21:57:35,983 - root - INFO - [34mlr: 4.8438e-05 gnorm: 13.25 [35m[ 1:40:19<1 day, 16:05:31][39m
|
| 560 |
+
[titan] 2026-01-06 21:58:17,525 - root - INFO - [31mstep: 124 [32mloss: 7.4271 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.34 [35mmfu: 46.26%[39m
|
| 561 |
+
[titan] 2026-01-06 21:58:17,525 - root - INFO - [34mlr: 4.8828e-05 gnorm: 11.88 [35m[ 1:41:01<1 day, 16:01:46][39m
|
| 562 |
+
[titan] 2026-01-06 21:58:59,075 - root - INFO - [31mstep: 125 [32mloss: 7.3603 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.31 [35mmfu: 46.25%[39m
|
| 563 |
+
[titan] 2026-01-06 21:58:59,075 - root - INFO - [34mlr: 4.9219e-05 gnorm: 11.50 [35m[ 1:41:43<1 day, 15:58:05][39m
|
| 564 |
+
[titan] 2026-01-06 21:59:40,618 - root - INFO - [31mstep: 126 [32mloss: 7.3625 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.33 [35mmfu: 46.26%[39m
|
| 565 |
+
[titan] 2026-01-06 21:59:40,619 - root - INFO - [34mlr: 4.9609e-05 gnorm: 9.88 [35m[ 1:42:24<1 day, 15:54:26][39m
|
| 566 |
+
[titan] 2026-01-06 22:00:22,155 - root - INFO - [31mstep: 127 [32mloss: 7.3691 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.36 [35mmfu: 46.27%[39m
|
| 567 |
+
[titan] 2026-01-06 22:00:22,156 - root - INFO - [34mlr: 5.0000e-05 gnorm: 11.88 [35m[ 1:43:06<1 day, 15:50:49][39m
|
| 568 |
+
[titan] 2026-01-06 22:01:03,694 - root - INFO - [31mstep: 128 [32mloss: 7.3331 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.35 [35mmfu: 46.27%[39m
|
| 569 |
+
[titan] 2026-01-06 22:01:03,694 - root - INFO - [34mlr: 5.0391e-05 gnorm: 11.56 [35m[ 1:43:47<1 day, 15:47:16][39m
|
| 570 |
+
[titan] 2026-01-06 22:01:03,694 - root - INFO - Saving the checkpoint (or staging if async is enabled).
|
| 571 |
+
[titan] 2026-01-06 22:01:24,122 - root - INFO - [GC] GC collection invoked by checkpointer. 0.25 seconds.
|
| 572 |
+
[titan] 2026-01-06 22:01:24,122 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 20.43 seconds.
|
| 573 |
+
[titan] 2026-01-06 22:02:05,453 - root - INFO - [31mstep: 129 [32mloss: 7.2878 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,061 [36mtflops: 97.09 [35mmfu: 31.12%[39m
|
| 574 |
+
[titan] 2026-01-06 22:02:05,454 - root - INFO - [34mlr: 5.0781e-05 gnorm: 6.16 [35m[ 1:44:49<1 day, 15:51:26][39m
|
| 575 |
+
[titan] 2026-01-06 22:02:46,875 - root - INFO - [31mstep: 130 [32mloss: 7.7017 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,582 [36mtflops: 144.76 [35mmfu: 46.40%[39m
|
| 576 |
+
[titan] 2026-01-06 22:02:46,876 - root - INFO - [34mlr: 5.1172e-05 gnorm: 70.00 [35m[ 1:45:30<1 day, 15:47:51][39m
|
| 577 |
+
[titan] 2026-01-06 22:03:28,339 - root - INFO - [31mstep: 131 [32mloss: 7.5220 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,581 [36mtflops: 144.61 [35mmfu: 46.35%[39m
|
| 578 |
+
[titan] 2026-01-06 22:03:28,339 - root - INFO - [34mlr: 5.1562e-05 gnorm: 44.75 [35m[ 1:46:12<1 day, 15:44:20][39m
|
| 579 |
+
[titan] 2026-01-06 22:04:09,859 - root - INFO - [31mstep: 132 [32mloss: 7.4566 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.42 [35mmfu: 46.29%[39m
|
| 580 |
+
[titan] 2026-01-06 22:04:09,859 - root - INFO - [34mlr: 5.1953e-05 gnorm: 13.50 [35m[ 1:46:53<1 day, 15:40:53][39m
|
| 581 |
+
[titan] 2026-01-06 22:04:51,387 - root - INFO - [31mstep: 133 [32mloss: 7.4026 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.39 [35mmfu: 46.28%[39m
|
| 582 |
+
[titan] 2026-01-06 22:04:51,387 - root - INFO - [34mlr: 5.2344e-05 gnorm: 10.12 [35m[ 1:47:35<1 day, 15:37:28][39m
|
| 583 |
+
[titan] 2026-01-06 22:05:32,919 - root - INFO - [31mstep: 134 [32mloss: 7.4092 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 584 |
+
[titan] 2026-01-06 22:05:32,920 - root - INFO - [34mlr: 5.2734e-05 gnorm: 14.88 [35m[ 1:48:16<1 day, 15:34:06][39m
|
| 585 |
+
[titan] 2026-01-06 22:06:14,471 - root - INFO - [31mstep: 135 [32mloss: 7.3827 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.30 [35mmfu: 46.25%[39m
|
| 586 |
+
[titan] 2026-01-06 22:06:14,471 - root - INFO - [34mlr: 5.3125e-05 gnorm: 18.88 [35m[ 1:48:58<1 day, 15:30:47][39m
|
| 587 |
+
[titan] 2026-01-06 22:06:56,027 - root - INFO - [31mstep: 136 [32mloss: 7.4021 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.29 [35mmfu: 46.25%[39m
|
| 588 |
+
[titan] 2026-01-06 22:06:56,027 - root - INFO - [34mlr: 5.3516e-05 gnorm: 12.81 [35m[ 1:49:39<1 day, 15:27:30][39m
|
| 589 |
+
[titan] 2026-01-06 22:07:37,581 - root - INFO - [31mstep: 137 [32mloss: 7.4064 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.30 [35mmfu: 46.25%[39m
|
| 590 |
+
[titan] 2026-01-06 22:07:37,581 - root - INFO - [34mlr: 5.3906e-05 gnorm: 7.19 [35m[ 1:50:21<1 day, 15:24:15][39m
|
| 591 |
+
[titan] 2026-01-06 22:08:19,129 - root - INFO - [31mstep: 138 [32mloss: 7.4774 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.32 [35mmfu: 46.26%[39m
|
| 592 |
+
[titan] 2026-01-06 22:08:19,129 - root - INFO - [34mlr: 5.4297e-05 gnorm: 22.62 [35m[ 1:51:03<1 day, 15:21:02][39m
|
| 593 |
+
[titan] 2026-01-06 22:09:00,687 - root - INFO - [31mstep: 139 [32mloss: 7.4281 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.28 [35mmfu: 46.24%[39m
|
| 594 |
+
[titan] 2026-01-06 22:09:00,688 - root - INFO - [34mlr: 5.4688e-05 gnorm: 11.00 [35m[ 1:51:44<1 day, 15:17:52][39m
|
| 595 |
+
[titan] 2026-01-06 22:09:42,228 - root - INFO - [31mstep: 140 [32mloss: 7.5633 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.34 [35mmfu: 46.26%[39m
|
| 596 |
+
[titan] 2026-01-06 22:09:42,228 - root - INFO - [34mlr: 5.5078e-05 gnorm: 19.75 [35m[ 1:52:26<1 day, 15:14:44][39m
|
| 597 |
+
[titan] 2026-01-06 22:10:23,790 - root - INFO - [31mstep: 141 [32mloss: 7.5423 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.27 [35mmfu: 46.24%[39m
|
| 598 |
+
[titan] 2026-01-06 22:10:23,790 - root - INFO - [34mlr: 5.5469e-05 gnorm: 17.25 [35m[ 1:53:07<1 day, 15:11:38][39m
|
| 599 |
+
[titan] 2026-01-06 22:11:05,349 - root - INFO - [31mstep: 142 [32mloss: 7.4047 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.28 [35mmfu: 46.24%[39m
|
| 600 |
+
[titan] 2026-01-06 22:11:05,350 - root - INFO - [34mlr: 5.5859e-05 gnorm: 9.94 [35m[ 1:53:49<1 day, 15:08:34][39m
|
| 601 |
+
[titan] 2026-01-06 22:11:46,904 - root - INFO - [31mstep: 143 [32mloss: 7.5261 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.29 [35mmfu: 46.25%[39m
|
| 602 |
+
[titan] 2026-01-06 22:11:46,905 - root - INFO - [34mlr: 5.6250e-05 gnorm: 25.75 [35m[ 1:54:30<1 day, 15:05:32][39m
|
| 603 |
+
[titan] 2026-01-06 22:12:28,461 - root - INFO - [31mstep: 144 [32mloss: 7.4217 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.29 [35mmfu: 46.25%[39m
|
| 604 |
+
[titan] 2026-01-06 22:12:28,461 - root - INFO - [34mlr: 5.6641e-05 gnorm: 18.00 [35m[ 1:55:12<1 day, 15:02:32][39m
|
| 605 |
+
[titan] 2026-01-06 22:12:28,461 - root - INFO - Saving the checkpoint (or staging if async is enabled).
|
| 606 |
+
[titan] 2026-01-06 22:12:50,175 - root - INFO - [GC] GC collection invoked by checkpointer. 0.19 seconds.
|
| 607 |
+
[titan] 2026-01-06 22:12:50,175 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 21.71 seconds.
|
| 608 |
+
[titan] 2026-01-06 22:13:31,510 - root - INFO - [31mstep: 145 [32mloss: 7.3958 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,039 [36mtflops: 95.10 [35mmfu: 30.48%[39m
|
| 609 |
+
[titan] 2026-01-06 22:13:31,510 - root - INFO - [34mlr: 5.7031e-05 gnorm: 11.69 [35m[ 1:56:15<1 day, 15:06:48][39m
|
| 610 |
+
[titan] 2026-01-06 22:14:12,944 - root - INFO - [31mstep: 146 [32mloss: 7.4073 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,582 [36mtflops: 144.71 [35mmfu: 46.38%[39m
|
| 611 |
+
[titan] 2026-01-06 22:14:12,944 - root - INFO - [34mlr: 5.7422e-05 gnorm: 11.25 [35m[ 1:56:56<1 day, 15:03:46][39m
|
| 612 |
+
[titan] 2026-01-06 22:14:54,370 - root - INFO - [31mstep: 147 [32mloss: 7.3301 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,582 [36mtflops: 144.74 [35mmfu: 46.39%[39m
|
| 613 |
+
[titan] 2026-01-06 22:14:54,371 - root - INFO - [34mlr: 5.7813e-05 gnorm: 7.34 [35m[ 1:57:38<1 day, 15:00:46][39m
|
| 614 |
+
[titan] 2026-01-06 22:15:35,825 - root - INFO - [31mstep: 148 [32mloss: 7.3624 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,581 [36mtflops: 144.64 [35mmfu: 46.36%[39m
|
| 615 |
+
[titan] 2026-01-06 22:15:35,825 - root - INFO - [34mlr: 5.8203e-05 gnorm: 17.38 [35m[ 1:58:19<1 day, 14:57:48][39m
|
| 616 |
+
[titan] 2026-01-06 22:16:17,356 - root - INFO - [31mstep: 149 [32mloss: 7.2913 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 617 |
+
[titan] 2026-01-06 22:16:17,357 - root - INFO - [34mlr: 5.8594e-05 gnorm: 3.80 [35m[ 1:59:01<1 day, 14:54:54][39m
|
| 618 |
+
[titan] 2026-01-06 22:16:17,390 - root - INFO - [GC] Peforming periodical GC collection. 0.03 seconds.
|
| 619 |
+
[titan] 2026-01-06 22:16:58,923 - root - INFO - [31mstep: 150 [32mloss: 7.3146 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.25 [35mmfu: 46.23%[39m
|
| 620 |
+
[titan] 2026-01-06 22:16:58,923 - root - INFO - [34mlr: 5.8984e-05 gnorm: 7.06 [35m[ 1:59:42<1 day, 14:52:02][39m
|
logs/none_4cvjdbqa/attempt_0/1/stdout.log
ADDED
|
File without changes
|
logs/none_4cvjdbqa/attempt_0/2/stderr.log
ADDED
|
@@ -0,0 +1,620 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[titan] 2026-01-06 20:23:28,613 - root - INFO - Starting job: default job
|
| 2 |
+
[titan] 2026-01-06 20:23:28,613 - root - INFO - [32m{
|
| 3 |
+
"activation_checkpoint": {
|
| 4 |
+
"mode": "none",
|
| 5 |
+
"selective_ac_option": "2"
|
| 6 |
+
},
|
| 7 |
+
"activation_offload": {
|
| 8 |
+
"mode": "none"
|
| 9 |
+
},
|
| 10 |
+
"checkpoint": {
|
| 11 |
+
"async_mode": "disabled",
|
| 12 |
+
"create_seed_checkpoint": false,
|
| 13 |
+
"enable_checkpoint": true,
|
| 14 |
+
"exclude_from_loading": [],
|
| 15 |
+
"export_dtype": "float32",
|
| 16 |
+
"folder": "checkpoint",
|
| 17 |
+
"initial_load_model_weights_only": true,
|
| 18 |
+
"initial_load_path": null,
|
| 19 |
+
"interval": 16,
|
| 20 |
+
"interval_type": "steps",
|
| 21 |
+
"keep_latest_k": 0,
|
| 22 |
+
"last_save_model_weights_only": false,
|
| 23 |
+
"load_step": -1,
|
| 24 |
+
"model_weights_only": false
|
| 25 |
+
},
|
| 26 |
+
"comm": {
|
| 27 |
+
"init_timeout_seconds": 300,
|
| 28 |
+
"trace_buf_size": 20000,
|
| 29 |
+
"train_timeout_seconds": 100
|
| 30 |
+
},
|
| 31 |
+
"experimental": {
|
| 32 |
+
"context_parallel_degree": 1,
|
| 33 |
+
"context_parallel_rotate_method": "allgather",
|
| 34 |
+
"custom_model_path": "",
|
| 35 |
+
"enable_async_tensor_parallel": false,
|
| 36 |
+
"enable_compiled_autograd": false,
|
| 37 |
+
"pipeline_parallel_degree": 1,
|
| 38 |
+
"pipeline_parallel_microbatches": null,
|
| 39 |
+
"pipeline_parallel_schedule": "1F1B",
|
| 40 |
+
"pipeline_parallel_schedule_csv": "",
|
| 41 |
+
"pipeline_parallel_split_points": []
|
| 42 |
+
},
|
| 43 |
+
"fault_tolerance": {
|
| 44 |
+
"enable": false,
|
| 45 |
+
"group_size": 0,
|
| 46 |
+
"min_replica_size": 1,
|
| 47 |
+
"replica_id": 0
|
| 48 |
+
},
|
| 49 |
+
"float8": {
|
| 50 |
+
"enable_fsdp_float8_all_gather": false,
|
| 51 |
+
"force_recompute_fp8_weight_in_bwd": false,
|
| 52 |
+
"precompute_float8_dynamic_scale_for_fsdp": false,
|
| 53 |
+
"recipe_name": null
|
| 54 |
+
},
|
| 55 |
+
"job": {
|
| 56 |
+
"config_file": "flame/models/fla.toml",
|
| 57 |
+
"description": "default job",
|
| 58 |
+
"dump_folder": "/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B",
|
| 59 |
+
"print_args": true,
|
| 60 |
+
"use_for_integration_test": false
|
| 61 |
+
},
|
| 62 |
+
"lr_scheduler": {
|
| 63 |
+
"decay_ratio": null,
|
| 64 |
+
"decay_type": "cosine",
|
| 65 |
+
"lr_min": 0.1,
|
| 66 |
+
"warmup_steps": 1024
|
| 67 |
+
},
|
| 68 |
+
"memory_estimation": {
|
| 69 |
+
"disable_fake_mode": false,
|
| 70 |
+
"enabled": false
|
| 71 |
+
},
|
| 72 |
+
"metrics": {
|
| 73 |
+
"disable_color_printing": false,
|
| 74 |
+
"enable_tensorboard": false,
|
| 75 |
+
"enable_wandb": true,
|
| 76 |
+
"log_freq": 1,
|
| 77 |
+
"save_for_all_ranks": false,
|
| 78 |
+
"save_tb_folder": "tb"
|
| 79 |
+
},
|
| 80 |
+
"model": {
|
| 81 |
+
"config": "/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/config.json",
|
| 82 |
+
"converters": [],
|
| 83 |
+
"name": "fla",
|
| 84 |
+
"print_after_conversion": false,
|
| 85 |
+
"tokenizer_path": "/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B"
|
| 86 |
+
},
|
| 87 |
+
"optimizer": {
|
| 88 |
+
"beta1": 0.9,
|
| 89 |
+
"beta2": 0.95,
|
| 90 |
+
"early_step_in_backward": false,
|
| 91 |
+
"eps": 1e-15,
|
| 92 |
+
"implementation": "fused",
|
| 93 |
+
"lr": 0.0004,
|
| 94 |
+
"name": "AdamW",
|
| 95 |
+
"weight_decay": 0.1
|
| 96 |
+
},
|
| 97 |
+
"profiling": {
|
| 98 |
+
"enable_memory_snapshot": false,
|
| 99 |
+
"enable_profiling": true,
|
| 100 |
+
"profile_freq": 512,
|
| 101 |
+
"save_memory_snapshot_folder": "memory_snapshot",
|
| 102 |
+
"save_traces_folder": "profile_trace"
|
| 103 |
+
},
|
| 104 |
+
"training": {
|
| 105 |
+
"batch_size": 2,
|
| 106 |
+
"compile": true,
|
| 107 |
+
"context_len": 2048,
|
| 108 |
+
"data_dir": null,
|
| 109 |
+
"data_files": null,
|
| 110 |
+
"data_parallel_replicate_degree": 1,
|
| 111 |
+
"data_parallel_shard_degree": 8,
|
| 112 |
+
"data_probs": null,
|
| 113 |
+
"dataset": "/mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu",
|
| 114 |
+
"dataset_name": "default",
|
| 115 |
+
"dataset_split": "train",
|
| 116 |
+
"deterministic": false,
|
| 117 |
+
"disable_loss_parallel": true,
|
| 118 |
+
"enable_cpu_offload": false,
|
| 119 |
+
"fsdp_reshard_after_forward": "default",
|
| 120 |
+
"gc_freq": 50,
|
| 121 |
+
"gradient_accumulation_steps": 16,
|
| 122 |
+
"max_norm": 1.0,
|
| 123 |
+
"mixed_precision_param": "bfloat16",
|
| 124 |
+
"mixed_precision_reduce": "float32",
|
| 125 |
+
"num_workers": 8,
|
| 126 |
+
"persistent_workers": false,
|
| 127 |
+
"pin_memory": false,
|
| 128 |
+
"prefetch_factor": 2,
|
| 129 |
+
"seed": 42,
|
| 130 |
+
"seq_len": 2048,
|
| 131 |
+
"skip_nan_inf": true,
|
| 132 |
+
"steps": 3072,
|
| 133 |
+
"streaming": true,
|
| 134 |
+
"tensor_parallel_degree": 1,
|
| 135 |
+
"varlen": false
|
| 136 |
+
}
|
| 137 |
+
}[39m
|
| 138 |
+
[titan] 2026-01-06 20:23:28,614 - root - INFO - [GC] Initial GC collection. 0.00 seconds.
|
| 139 |
+
[titan] 2026-01-06 20:23:30,013 - root - WARNING - ENV[TORCH_NCCL_ASYNC_ERROR_HANDLING] = 1 will be overridden to 3 based on job config
|
| 140 |
+
[titan] 2026-01-06 20:23:30,016 - root - INFO - CUDA capacity: NVIDIA A100-SXM4-80GB with 79.25GiB memory
|
| 141 |
+
[titan] 2026-01-06 20:23:30,018 - root - WARNING - Error running lspci: [Errno 2] No such file or directory: 'lspci', fallback to use device_name
|
| 142 |
+
[titan] 2026-01-06 20:23:30,018 - root - INFO - Peak FLOPS used for computing MFU: 3.120e+14
|
| 143 |
+
[titan] 2026-01-06 20:23:30,018 - root - INFO - Building 1-D device mesh with ['dp_shard'], [8]
|
| 144 |
+
[titan] 2026-01-06 20:23:30,051 - root - INFO - Loading tokenizer...
|
| 145 |
+
The tokenizer you are loading from '/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.
|
| 146 |
+
[titan] 2026-01-06 20:23:30,412 - root - INFO - Qwen2TokenizerFast(name_or_path='/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B', vocab_size=151643, model_max_length=10000000000, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'eos_token': '<|im_end|>', 'pad_token': '<|endoftext|>', 'additional_special_tokens': ['<|im_start|>', '<|im_end|>', '<|object_ref_start|>', '<|object_ref_end|>', '<|box_start|>', '<|box_end|>', '<|quad_start|>', '<|quad_end|>', '<|vision_start|>', '<|vision_end|>', '<|vision_pad|>', '<|image_pad|>', '<|video_pad|>']}, clean_up_tokenization_spaces=False, added_tokens_decoder={
|
| 147 |
+
151643: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 148 |
+
151644: AddedToken("<|im_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 149 |
+
151645: AddedToken("<|im_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 150 |
+
151646: AddedToken("<|object_ref_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 151 |
+
151647: AddedToken("<|object_ref_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 152 |
+
151648: AddedToken("<|box_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 153 |
+
151649: AddedToken("<|box_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 154 |
+
151650: AddedToken("<|quad_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 155 |
+
151651: AddedToken("<|quad_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 156 |
+
151652: AddedToken("<|vision_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 157 |
+
151653: AddedToken("<|vision_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 158 |
+
151654: AddedToken("<|vision_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 159 |
+
151655: AddedToken("<|image_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 160 |
+
151656: AddedToken("<|video_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 161 |
+
151657: AddedToken("<tool_call>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 162 |
+
151658: AddedToken("</tool_call>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 163 |
+
151659: AddedToken("<|fim_prefix|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 164 |
+
151660: AddedToken("<|fim_middle|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 165 |
+
151661: AddedToken("<|fim_suffix|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 166 |
+
151662: AddedToken("<|fim_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 167 |
+
151663: AddedToken("<|repo_name|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 168 |
+
151664: AddedToken("<|file_sep|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 169 |
+
151665: AddedToken("<tool_response>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 170 |
+
151666: AddedToken("</tool_response>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 171 |
+
151667: AddedToken("<think>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 172 |
+
151668: AddedToken("</think>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 173 |
+
}
|
| 174 |
+
)
|
| 175 |
+
[titan] 2026-01-06 20:23:30,412 - root - INFO - Loading dataset /mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu:default
|
| 176 |
+
`trust_remote_code` is not supported anymore.
|
| 177 |
+
Please check that the Hugging Face dataset '/mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu' isn't based on a loading script and remove `trust_remote_code`.
|
| 178 |
+
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
|
| 179 |
+
[titan] 2026-01-06 20:23:30,412 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
|
| 180 |
+
Please check that the Hugging Face dataset '/mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu' isn't based on a loading script and remove `trust_remote_code`.
|
| 181 |
+
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
|
| 182 |
+
[titan] 2026-01-06 20:23:30,977 - root - INFO - Shuffling the dataset with seed 42
|
| 183 |
+
[titan] 2026-01-06 20:23:30,978 - root - INFO - IterableDataset({
|
| 184 |
+
features: ['text', 'id', 'dump', 'url', 'file_path', 'language', 'language_score', 'token_count', 'score', 'int_score'],
|
| 185 |
+
num_shards: 360
|
| 186 |
+
})
|
| 187 |
+
[titan] 2026-01-06 20:23:30,978 - root - INFO - Building dataloader...
|
| 188 |
+
[titan] 2026-01-06 20:23:30,980 - root - INFO - Loading model config from /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/config.json
|
| 189 |
+
[titan] 2026-01-06 20:23:30,981 - root - INFO - Building model from the config
|
| 190 |
+
[32mGSAConfig {
|
| 191 |
+
"architectures": [
|
| 192 |
+
"GSAForCausalLM"
|
| 193 |
+
],
|
| 194 |
+
"attn": null,
|
| 195 |
+
"bos_token_id": 151643,
|
| 196 |
+
"clamp_max": null,
|
| 197 |
+
"clamp_min": null,
|
| 198 |
+
"conv_size": 4,
|
| 199 |
+
"dtype": "bfloat16",
|
| 200 |
+
"elementwise_affine": false,
|
| 201 |
+
"eos_token_id": 151645,
|
| 202 |
+
"expand_k": 1,
|
| 203 |
+
"expand_v": 1,
|
| 204 |
+
"feature_map": "swish",
|
| 205 |
+
"fuse_cross_entropy": true,
|
| 206 |
+
"fuse_linear_cross_entropy": false,
|
| 207 |
+
"fuse_norm": true,
|
| 208 |
+
"fuse_swiglu": true,
|
| 209 |
+
"gate_logit_normalizer": 8,
|
| 210 |
+
"hidden_act": "swish",
|
| 211 |
+
"hidden_ratio": 4,
|
| 212 |
+
"hidden_size": 5120,
|
| 213 |
+
"initializer_range": 0.02,
|
| 214 |
+
"intermediate_size": 17408,
|
| 215 |
+
"max_position_embeddings": 40960,
|
| 216 |
+
"model_type": "gsa",
|
| 217 |
+
"norm_eps": 1e-06,
|
| 218 |
+
"num_heads": 40,
|
| 219 |
+
"num_hidden_layers": 40,
|
| 220 |
+
"num_kv_heads": 8,
|
| 221 |
+
"num_slots": 256,
|
| 222 |
+
"rope_theta": 1000000,
|
| 223 |
+
"share_conv_kernel": true,
|
| 224 |
+
"tie_word_embeddings": true,
|
| 225 |
+
"transformers_version": "4.57.3",
|
| 226 |
+
"use_cache": true,
|
| 227 |
+
"use_l2warp": false,
|
| 228 |
+
"use_norm": true,
|
| 229 |
+
"use_output_gate": true,
|
| 230 |
+
"use_rope": false,
|
| 231 |
+
"use_short_conv": false,
|
| 232 |
+
"vocab_size": 151936
|
| 233 |
+
}
|
| 234 |
+
[39m
|
| 235 |
+
[titan] 2026-01-06 20:23:31,128 - root - INFO - [34m
|
| 236 |
+
GSAForCausalLM(
|
| 237 |
+
(model): GSAModel(
|
| 238 |
+
(embeddings): Embedding(151936, 5120)
|
| 239 |
+
(layers): ModuleList(
|
| 240 |
+
(0-39): 40 x GSABlock(
|
| 241 |
+
(attn_norm): RMSNorm(5120, eps=1e-06)
|
| 242 |
+
(attn): GatedSlotAttention(
|
| 243 |
+
(feature_map): SwishFeatureMap()
|
| 244 |
+
(q_proj): Linear(in_features=5120, out_features=5120, bias=False)
|
| 245 |
+
(k_proj): Linear(in_features=5120, out_features=1024, bias=False)
|
| 246 |
+
(v_proj): Linear(in_features=5120, out_features=1024, bias=False)
|
| 247 |
+
(f_proj): Linear(in_features=5120, out_features=2048, bias=False)
|
| 248 |
+
(g_norm): RMSNorm(5120, elementwise_affine=False, eps=1e-06)
|
| 249 |
+
(o_proj): Linear(in_features=5120, out_features=5120, bias=False)
|
| 250 |
+
)
|
| 251 |
+
(mlp_norm): RMSNorm(5120, eps=1e-06)
|
| 252 |
+
(mlp): GatedMLP(
|
| 253 |
+
(gate_proj): Linear(in_features=5120, out_features=17408, bias=False)
|
| 254 |
+
(up_proj): Linear(in_features=5120, out_features=17408, bias=False)
|
| 255 |
+
(down_proj): Linear(in_features=17408, out_features=5120, bias=False)
|
| 256 |
+
(swiglu_linear): SwiGLULinear()
|
| 257 |
+
)
|
| 258 |
+
)
|
| 259 |
+
)
|
| 260 |
+
(norm): RMSNorm(5120, eps=1e-06)
|
| 261 |
+
)
|
| 262 |
+
(lm_head): Linear(in_features=5120, out_features=151936, bias=False)
|
| 263 |
+
)[39m
|
| 264 |
+
|
| 265 |
+
[titan] 2026-01-06 20:23:31,184 - root - INFO - Compiling each block with torch.compile
|
| 266 |
+
[titan] 2026-01-06 20:23:31,184 - root - INFO - Compiling the embedding, norm, and lm_head layers with torch.compile
|
| 267 |
+
[titan] 2026-01-06 20:23:31,185 - root - INFO - Compiling the entire model with torch.compile
|
| 268 |
+
[titan] 2026-01-06 20:23:31,335 - root - INFO - Applied FSDP to the model
|
| 269 |
+
[titan] 2026-01-06 20:23:31,714 - root - INFO - CUDA memory usage for model: 3.56GiB(4.49%)
|
| 270 |
+
[titan] 2026-01-06 20:23:31,762 - root - INFO - Checkpointing active. Checkpoints will be loaded from and saved to /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/checkpoint
|
| 271 |
+
[titan] 2026-01-06 20:23:31,763 - root - INFO - Loading the checkpoint from /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/checkpoint/step-1.
|
| 272 |
+
[titan] 2026-01-06 20:24:20,502 - root - INFO - [GC] GC collection for checkpoint loading. 0.04 seconds.
|
| 273 |
+
[titan] 2026-01-06 20:24:20,502 - root - INFO - Finished loading the checkpoint in 48.74 seconds.
|
| 274 |
+
[titan] 2026-01-06 20:24:20,704 - root - INFO - CUDA capacity: NVIDIA A100-SXM4-80GB with 79.25GiB memory
|
| 275 |
+
[titan] 2026-01-06 20:24:20,707 - root - WARNING - Error running lspci: [Errno 2] No such file or directory: 'lspci', fallback to use device_name
|
| 276 |
+
[titan] 2026-01-06 20:24:23,476 - root - INFO - Mixed precision training is handled by fully_shard
|
| 277 |
+
[titan] 2026-01-06 20:24:23,476 - root - INFO - [31m***** Running training *****[39m
|
| 278 |
+
[titan] 2026-01-06 20:24:23,476 - root - INFO - [32m Training starts at step 2
|
| 279 |
+
[titan] 2026-01-06 20:24:23,476 - root - INFO - [32m Number of tokens per sequence = 2,048
|
| 280 |
+
[titan] 2026-01-06 20:24:23,476 - root - INFO - [32m Gradient Accumulation steps = 16
|
| 281 |
+
[titan] 2026-01-06 20:24:23,476 - root - INFO - [32m Instantaneous batch size (per device) = 2
|
| 282 |
+
[titan] 2026-01-06 20:24:23,476 - root - INFO - [32m Global batch size (w. parallel, distributed & accumulation) = 256 (524,288 tokens)
|
| 283 |
+
[titan] 2026-01-06 20:24:23,476 - root - INFO - [32m Total optimization steps = 3,072 (1,610,612,736 tokens)
|
| 284 |
+
[titan] 2026-01-06 20:24:23,476 - root - INFO - [32m Warmup steps = 1,024 (536,870,912 tokens)
|
| 285 |
+
[titan] 2026-01-06 20:24:23,476 - root - INFO - [32m Number of parameters = 14,409,815,040 [39m
|
| 286 |
+
[titan] 2026-01-06 20:24:23,477 - root - INFO - Profiling active. Traces will be saved at /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/profile_trace
|
| 287 |
+
/mnt/scratch/home/carlos/leman-labs/flame/.venv/lib/python3.13/site-packages/torch/_dynamo/variables/functions.py:1692: UserWarning: Dynamo detected a call to a `functools.lru_cache`-wrapped function. Dynamo ignores the cache wrapper and directly traces the wrapped function. Silent incorrectness is only a *potential* risk, not something we have observed. Enable TORCH_LOGS="+dynamo" for a DEBUG stack trace.
|
| 288 |
+
torch._dynamo.utils.warn_once(msg)
|
| 289 |
+
/mnt/scratch/home/carlos/leman-labs/flame/.venv/lib/python3.13/site-packages/torch/_dynamo/variables/functions.py:1598: UserWarning: Dynamo does not know how to trace the builtin `cuda_utils.get_device_properties.` This function is either a Python builtin (e.g. _warnings.warn) or a third-party C/C++ Python extension (perhaps created with pybind).
|
| 290 |
+
If it is a Python builtin, please file an issue on GitHub so the PyTorch team can add support for it and see the next case for a workaround.
|
| 291 |
+
If it is a third-party C/C++ Python extension, please either wrap it into a PyTorch-understood custom operator (see https://pytorch.org/tutorials/advanced/custom_ops_landing_page.html for more details) or, if it is traceable, use `torch.compiler.allow_in_graph`.
|
| 292 |
+
torch._dynamo.utils.warn_once(explanation + "\n" + "\n".join(hints))
|
| 293 |
+
[titan] 2026-01-06 20:31:17,558 - root - INFO - [31mstep: 2 [32mloss: 14.3989 [33mmemory: 71.94GiB(90.77%) [34mtps: 157 [36mtflops: 14.38 [35mmfu: 4.61%[39m
|
| 294 |
+
[titan] 2026-01-06 20:31:17,558 - root - INFO - [34mlr: 1.1719e-06 gnorm: 127.00 [35m[ 0:14:01<14 days, 22:48:25][39m
|
| 295 |
+
[titan] 2026-01-06 20:31:58,854 - root - INFO - [31mstep: 3 [32mloss: 14.3925 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,587 [36mtflops: 145.20 [35mmfu: 46.54%[39m
|
| 296 |
+
[titan] 2026-01-06 20:31:58,854 - root - INFO - [34mlr: 1.5625e-06 gnorm: 126.00 [35m[ 0:14:42<10 days, 10:51:41][39m
|
| 297 |
+
[titan] 2026-01-06 20:32:40,204 - root - INFO - [31mstep: 4 [32mloss: 14.2932 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,585 [36mtflops: 145.01 [35mmfu: 46.48%[39m
|
| 298 |
+
[titan] 2026-01-06 20:32:40,205 - root - INFO - [34mlr: 1.9531e-06 gnorm: 125.50 [35m[ 0:15:24<8 days, 4:53:40][39m
|
| 299 |
+
[titan] 2026-01-06 20:33:21,589 - root - INFO - [31mstep: 5 [32mloss: 14.2679 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,584 [36mtflops: 144.89 [35mmfu: 46.44%[39m
|
| 300 |
+
[titan] 2026-01-06 20:33:21,590 - root - INFO - [34mlr: 2.3438e-06 gnorm: 123.50 [35m[ 0:16:05<6 days, 20:30:56][39m
|
| 301 |
+
[titan] 2026-01-06 20:34:03,035 - root - INFO - [31mstep: 6 [32mloss: 13.9921 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,581 [36mtflops: 144.67 [35mmfu: 46.37%[39m
|
| 302 |
+
[titan] 2026-01-06 20:34:03,035 - root - INFO - [34mlr: 2.7344e-06 gnorm: 117.50 [35m[ 0:16:46<5 days, 22:56:04][39m
|
| 303 |
+
[titan] 2026-01-06 20:34:44,524 - root - INFO - [31mstep: 7 [32mloss: 13.8102 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,580 [36mtflops: 144.52 [35mmfu: 46.32%[39m
|
| 304 |
+
[titan] 2026-01-06 20:34:44,524 - root - INFO - [34mlr: 3.1250e-06 gnorm: 112.50 [35m[ 0:17:28<5 days, 7:31:17][39m
|
| 305 |
+
[titan] 2026-01-06 20:35:25,989 - root - INFO - [31mstep: 8 [32mloss: 13.5609 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,581 [36mtflops: 144.60 [35mmfu: 46.35%[39m
|
| 306 |
+
[titan] 2026-01-06 20:35:25,990 - root - INFO - [34mlr: 3.5156e-06 gnorm: 106.50 [35m[ 0:18:09<4 days, 19:57:23][39m
|
| 307 |
+
[titan] 2026-01-06 20:36:07,480 - root - INFO - [31mstep: 9 [32mloss: 13.3683 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,580 [36mtflops: 144.52 [35mmfu: 46.32%[39m
|
| 308 |
+
[titan] 2026-01-06 20:36:07,480 - root - INFO - [34mlr: 3.9063e-06 gnorm: 101.00 [35m[ 0:18:51<4 days, 10:57:40][39m
|
| 309 |
+
[titan] 2026-01-06 20:36:48,975 - root - INFO - [31mstep: 10 [32mloss: 13.1018 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.50 [35mmfu: 46.32%[39m
|
| 310 |
+
[titan] 2026-01-06 20:36:48,975 - root - INFO - [34mlr: 4.2969e-06 gnorm: 94.00 [35m[ 0:19:32<4 days, 3:45:46][39m
|
| 311 |
+
[titan] 2026-01-06 20:37:30,471 - root - INFO - [31mstep: 11 [32mloss: 12.5407 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.50 [35mmfu: 46.31%[39m
|
| 312 |
+
[titan] 2026-01-06 20:37:30,471 - root - INFO - [34mlr: 4.6875e-06 gnorm: 82.00 [35m[ 0:20:14<3 days, 21:52:17][39m
|
| 313 |
+
[titan] 2026-01-06 20:38:11,960 - root - INFO - [31mstep: 12 [32mloss: 12.0106 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,580 [36mtflops: 144.52 [35mmfu: 46.32%[39m
|
| 314 |
+
[titan] 2026-01-06 20:38:11,960 - root - INFO - [34mlr: 5.0781e-06 gnorm: 71.50 [35m[ 0:20:55<3 days, 16:57:34][39m
|
| 315 |
+
[titan] 2026-01-06 20:38:53,463 - root - INFO - [31mstep: 13 [32mloss: 11.5957 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.48 [35mmfu: 46.31%[39m
|
| 316 |
+
[titan] 2026-01-06 20:38:53,463 - root - INFO - [34mlr: 5.4687e-06 gnorm: 68.00 [35m[ 0:21:37<3 days, 12:48:08][39m
|
| 317 |
+
[titan] 2026-01-06 20:39:34,955 - root - INFO - [31mstep: 14 [32mloss: 11.2380 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.51 [35mmfu: 46.32%[39m
|
| 318 |
+
[titan] 2026-01-06 20:39:34,955 - root - INFO - [34mlr: 5.8594e-06 gnorm: 63.25 [35m[ 0:22:18<3 days, 9:14:12][39m
|
| 319 |
+
[titan] 2026-01-06 20:40:16,456 - root - INFO - [31mstep: 15 [32mloss: 10.9153 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.48 [35mmfu: 46.31%[39m
|
| 320 |
+
[titan] 2026-01-06 20:40:16,457 - root - INFO - [34mlr: 6.2500e-06 gnorm: 55.50 [35m[ 0:23:00<3 days, 6:08:44][39m
|
| 321 |
+
[titan] 2026-01-06 20:40:57,974 - root - INFO - [31mstep: 16 [32mloss: 10.6864 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.42 [35mmfu: 46.29%[39m
|
| 322 |
+
[titan] 2026-01-06 20:40:57,974 - root - INFO - [34mlr: 6.6406e-06 gnorm: 57.00 [35m[ 0:23:41<3 days, 3:26:25][39m
|
| 323 |
+
[titan] 2026-01-06 20:40:57,974 - root - INFO - Saving the checkpoint (or staging if async is enabled).
|
| 324 |
+
[titan] 2026-01-06 20:41:27,085 - root - INFO - [GC] GC collection invoked by checkpointer. 0.64 seconds.
|
| 325 |
+
[titan] 2026-01-06 20:41:27,085 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 29.11 seconds.
|
| 326 |
+
[titan] 2026-01-06 20:42:08,985 - root - INFO - [31mstep: 17 [32mloss: 10.3828 [33mmemory: 71.94GiB(90.77%) [34mtps: 923 [36mtflops: 84.44 [35mmfu: 27.06%[39m
|
| 327 |
+
[titan] 2026-01-06 20:42:08,985 - root - INFO - [34mlr: 7.0313e-06 gnorm: 42.50 [35m[ 0:24:52<3 days, 2:31:26][39m
|
| 328 |
+
[titan] 2026-01-06 20:42:50,422 - root - INFO - [31mstep: 18 [32mloss: 10.1659 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,582 [36mtflops: 144.71 [35mmfu: 46.38%[39m
|
| 329 |
+
[titan] 2026-01-06 20:42:50,422 - root - INFO - [34mlr: 7.4219e-06 gnorm: 32.50 [35m[ 0:25:34<3 days, 0:18:49][39m
|
| 330 |
+
[titan] 2026-01-06 20:43:31,924 - root - INFO - [31mstep: 19 [32mloss: 9.9749 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.47 [35mmfu: 46.31%[39m
|
| 331 |
+
[titan] 2026-01-06 20:43:31,925 - root - INFO - [34mlr: 7.8125e-06 gnorm: 26.88 [35m[ 0:26:15<2 days, 22:20:15][39m
|
| 332 |
+
[titan] 2026-01-06 20:44:13,451 - root - INFO - [31mstep: 20 [32mloss: 9.8084 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.39 [35mmfu: 46.28%[39m
|
| 333 |
+
[titan] 2026-01-06 20:44:13,451 - root - INFO - [34mlr: 8.2031e-06 gnorm: 25.62 [35m[ 0:26:57<2 days, 20:33:33][39m
|
| 334 |
+
[titan] 2026-01-06 20:44:54,967 - root - INFO - [31mstep: 21 [32mloss: 9.6201 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.43 [35mmfu: 46.29%[39m
|
| 335 |
+
[titan] 2026-01-06 20:44:54,968 - root - INFO - [34mlr: 8.5938e-06 gnorm: 26.88 [35m[ 0:27:38<2 days, 18:56:54][39m
|
| 336 |
+
[titan] 2026-01-06 20:45:36,491 - root - INFO - [31mstep: 22 [32mloss: 9.4905 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.40 [35mmfu: 46.28%[39m
|
| 337 |
+
[titan] 2026-01-06 20:45:36,491 - root - INFO - [34mlr: 8.9844e-06 gnorm: 25.50 [35m[ 0:28:20<2 days, 17:29:00][39m
|
| 338 |
+
[titan] 2026-01-06 20:46:18,035 - root - INFO - [31mstep: 23 [32mloss: 9.2526 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.33 [35mmfu: 46.26%[39m
|
| 339 |
+
[titan] 2026-01-06 20:46:18,035 - root - INFO - [34mlr: 9.3750e-06 gnorm: 19.12 [35m[ 0:29:01<2 days, 16:08:44][39m
|
| 340 |
+
[titan] 2026-01-06 20:46:59,563 - root - INFO - [31mstep: 24 [32mloss: 9.0528 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.39 [35mmfu: 46.28%[39m
|
| 341 |
+
[titan] 2026-01-06 20:46:59,563 - root - INFO - [34mlr: 9.7656e-06 gnorm: 17.00 [35m[ 0:29:43<2 days, 14:55:04][39m
|
| 342 |
+
[titan] 2026-01-06 20:47:41,099 - root - INFO - [31mstep: 25 [32mloss: 8.8601 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.36 [35mmfu: 46.27%[39m
|
| 343 |
+
[titan] 2026-01-06 20:47:41,099 - root - INFO - [34mlr: 1.0156e-05 gnorm: 14.06 [35m[ 0:30:25<2 days, 13:47:15][39m
|
| 344 |
+
[titan] 2026-01-06 20:48:22,630 - root - INFO - [31mstep: 26 [32mloss: 8.7360 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.38 [35mmfu: 46.27%[39m
|
| 345 |
+
[titan] 2026-01-06 20:48:22,630 - root - INFO - [34mlr: 1.0547e-05 gnorm: 15.44 [35m[ 0:31:06<2 days, 12:44:35][39m
|
| 346 |
+
[titan] 2026-01-06 20:49:04,178 - root - INFO - [31mstep: 27 [32mloss: 8.6182 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.31 [35mmfu: 46.25%[39m
|
| 347 |
+
[titan] 2026-01-06 20:49:04,178 - root - INFO - [34mlr: 1.0937e-05 gnorm: 10.25 [35m[ 0:31:48<2 days, 11:46:32][39m
|
| 348 |
+
[titan] 2026-01-06 20:49:45,725 - root - INFO - [31mstep: 28 [32mloss: 8.5142 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.32 [35mmfu: 46.26%[39m
|
| 349 |
+
[titan] 2026-01-06 20:49:45,725 - root - INFO - [34mlr: 1.1328e-05 gnorm: 9.00 [35m[ 0:32:29<2 days, 10:52:35][39m
|
| 350 |
+
[titan] 2026-01-06 20:50:27,274 - root - INFO - [31mstep: 29 [32mloss: 8.4770 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.31 [35mmfu: 46.25%[39m
|
| 351 |
+
[titan] 2026-01-06 20:50:27,274 - root - INFO - [34mlr: 1.1719e-05 gnorm: 9.44 [35m[ 0:33:11<2 days, 10:02:19][39m
|
| 352 |
+
[titan] 2026-01-06 20:51:08,813 - root - INFO - [31mstep: 30 [32mloss: 8.3888 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.35 [35mmfu: 46.27%[39m
|
| 353 |
+
[titan] 2026-01-06 20:51:08,813 - root - INFO - [34mlr: 1.2109e-05 gnorm: 7.06 [35m[ 0:33:52<2 days, 9:15:20][39m
|
| 354 |
+
[titan] 2026-01-06 20:51:50,370 - root - INFO - [31mstep: 31 [32mloss: 8.3098 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.29 [35mmfu: 46.25%[39m
|
| 355 |
+
[titan] 2026-01-06 20:51:50,370 - root - INFO - [34mlr: 1.2500e-05 gnorm: 5.38 [35m[ 0:34:34<2 days, 8:31:22][39m
|
| 356 |
+
[titan] 2026-01-06 20:52:31,910 - root - INFO - [31mstep: 32 [32mloss: 8.2507 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.35 [35mmfu: 46.26%[39m
|
| 357 |
+
[titan] 2026-01-06 20:52:31,910 - root - INFO - [34mlr: 1.2891e-05 gnorm: 6.97 [35m[ 0:35:15<2 days, 7:50:04][39m
|
| 358 |
+
[titan] 2026-01-06 20:52:31,910 - root - INFO - Saving the checkpoint (or staging if async is enabled).
|
| 359 |
+
[titan] 2026-01-06 20:52:52,182 - root - INFO - [GC] GC collection invoked by checkpointer. 0.19 seconds.
|
| 360 |
+
[titan] 2026-01-06 20:52:52,182 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 20.27 seconds.
|
| 361 |
+
[titan] 2026-01-06 20:53:33,590 - root - INFO - [31mstep: 33 [32mloss: 8.1782 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,063 [36mtflops: 97.21 [35mmfu: 31.16%[39m
|
| 362 |
+
[titan] 2026-01-06 20:53:33,591 - root - INFO - [34mlr: 1.3281e-05 gnorm: 4.94 [35m[ 0:36:17<2 days, 7:42:09][39m
|
| 363 |
+
[titan] 2026-01-06 20:54:15,059 - root - INFO - [31mstep: 34 [32mloss: 8.1399 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,580 [36mtflops: 144.59 [35mmfu: 46.34%[39m
|
| 364 |
+
[titan] 2026-01-06 20:54:15,059 - root - INFO - [34mlr: 1.3672e-05 gnorm: 4.62 [35m[ 0:36:58<2 days, 7:04:33][39m
|
| 365 |
+
[titan] 2026-01-06 20:54:56,546 - root - INFO - [31mstep: 35 [32mloss: 8.1046 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,580 [36mtflops: 144.53 [35mmfu: 46.32%[39m
|
| 366 |
+
[titan] 2026-01-06 20:54:56,546 - root - INFO - [34mlr: 1.4063e-05 gnorm: 4.69 [35m[ 0:37:40<2 days, 6:29:04][39m
|
| 367 |
+
[titan] 2026-01-06 20:55:38,070 - root - INFO - [31mstep: 36 [32mloss: 8.0122 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.40 [35mmfu: 46.28%[39m
|
| 368 |
+
[titan] 2026-01-06 20:55:38,070 - root - INFO - [34mlr: 1.4453e-05 gnorm: 2.75 [35m[ 0:38:21<2 days, 5:55:35][39m
|
| 369 |
+
[titan] 2026-01-06 20:56:19,603 - root - INFO - [31mstep: 37 [32mloss: 8.0874 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 370 |
+
[titan] 2026-01-06 20:56:19,603 - root - INFO - [34mlr: 1.4844e-05 gnorm: 4.84 [35m[ 0:39:03<2 days, 5:23:52][39m
|
| 371 |
+
[titan] 2026-01-06 20:57:01,137 - root - INFO - [31mstep: 38 [32mloss: 8.0173 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 372 |
+
[titan] 2026-01-06 20:57:01,138 - root - INFO - [34mlr: 1.5234e-05 gnorm: 3.98 [35m[ 0:39:45<2 days, 4:53:48][39m
|
| 373 |
+
[titan] 2026-01-06 20:57:42,670 - root - INFO - [31mstep: 39 [32mloss: 8.0002 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 374 |
+
[titan] 2026-01-06 20:57:42,671 - root - INFO - [34mlr: 1.5625e-05 gnorm: 3.81 [35m[ 0:40:26<2 days, 4:25:14][39m
|
| 375 |
+
[titan] 2026-01-06 20:58:24,204 - root - INFO - [31mstep: 40 [32mloss: 7.9606 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 376 |
+
[titan] 2026-01-06 20:58:24,204 - root - INFO - [34mlr: 1.6016e-05 gnorm: 2.86 [35m[ 0:41:08<2 days, 3:58:04][39m
|
| 377 |
+
[titan] 2026-01-06 20:59:05,739 - root - INFO - [31mstep: 41 [32mloss: 7.9773 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.36 [35mmfu: 46.27%[39m
|
| 378 |
+
[titan] 2026-01-06 20:59:05,739 - root - INFO - [34mlr: 1.6406e-05 gnorm: 3.56 [35m[ 0:41:49<2 days, 3:32:11][39m
|
| 379 |
+
[titan] 2026-01-06 20:59:47,255 - root - INFO - [31mstep: 42 [32mloss: 7.9890 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.43 [35mmfu: 46.29%[39m
|
| 380 |
+
[titan] 2026-01-06 20:59:47,256 - root - INFO - [34mlr: 1.6797e-05 gnorm: 4.75 [35m[ 0:42:31<2 days, 3:07:29][39m
|
| 381 |
+
[titan] 2026-01-06 21:00:28,788 - root - INFO - [31mstep: 43 [32mloss: 7.9018 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 382 |
+
[titan] 2026-01-06 21:00:28,789 - root - INFO - [34mlr: 1.7188e-05 gnorm: 3.48 [35m[ 0:43:12<2 days, 2:43:55][39m
|
| 383 |
+
[titan] 2026-01-06 21:01:10,328 - root - INFO - [31mstep: 44 [32mloss: 7.8441 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.35 [35mmfu: 46.26%[39m
|
| 384 |
+
[titan] 2026-01-06 21:01:10,328 - root - INFO - [34mlr: 1.7578e-05 gnorm: 3.89 [35m[ 0:43:54<2 days, 2:21:24][39m
|
| 385 |
+
[titan] 2026-01-06 21:01:51,869 - root - INFO - [31mstep: 45 [32mloss: 7.8679 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.34 [35mmfu: 46.26%[39m
|
| 386 |
+
[titan] 2026-01-06 21:01:51,869 - root - INFO - [34mlr: 1.7969e-05 gnorm: 6.41 [35m[ 0:44:35<2 days, 1:59:51][39m
|
| 387 |
+
[titan] 2026-01-06 21:02:33,408 - root - INFO - [31mstep: 46 [32mloss: 7.7830 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.35 [35mmfu: 46.26%[39m
|
| 388 |
+
[titan] 2026-01-06 21:02:33,408 - root - INFO - [34mlr: 1.8359e-05 gnorm: 3.52 [35m[ 0:45:17<2 days, 1:39:13][39m
|
| 389 |
+
[titan] 2026-01-06 21:03:14,961 - root - INFO - [31mstep: 47 [32mloss: 7.8372 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.30 [35mmfu: 46.25%[39m
|
| 390 |
+
[titan] 2026-01-06 21:03:14,961 - root - INFO - [34mlr: 1.8750e-05 gnorm: 2.22 [35m[ 0:45:58<2 days, 1:19:26][39m
|
| 391 |
+
[titan] 2026-01-06 21:03:56,497 - root - INFO - [31mstep: 48 [32mloss: 7.8147 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.36 [35mmfu: 46.27%[39m
|
| 392 |
+
[titan] 2026-01-06 21:03:56,497 - root - INFO - [34mlr: 1.9141e-05 gnorm: 3.70 [35m[ 0:46:40<2 days, 1:00:26][39m
|
| 393 |
+
[titan] 2026-01-06 21:03:56,497 - root - INFO - Saving the checkpoint (or staging if async is enabled).
|
| 394 |
+
[titan] 2026-01-06 21:04:16,571 - root - INFO - [GC] GC collection invoked by checkpointer. 0.19 seconds.
|
| 395 |
+
[titan] 2026-01-06 21:04:16,571 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 20.07 seconds.
|
| 396 |
+
[titan] 2026-01-06 21:04:57,970 - root - INFO - [31mstep: 49 [32mloss: 7.6970 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,066 [36mtflops: 97.54 [35mmfu: 31.26%[39m
|
| 397 |
+
[titan] 2026-01-06 21:04:57,970 - root - INFO - [34mlr: 1.9531e-05 gnorm: 5.28 [35m[ 0:47:41<2 days, 1:02:41][39m
|
| 398 |
+
[titan] 2026-01-06 21:04:57,982 - root - INFO - [GC] Peforming periodical GC collection. 0.01 seconds.
|
| 399 |
+
[titan] 2026-01-06 21:05:39,421 - root - INFO - [31mstep: 50 [32mloss: 7.7536 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,581 [36mtflops: 144.65 [35mmfu: 46.36%[39m
|
| 400 |
+
[titan] 2026-01-06 21:05:39,421 - root - INFO - [34mlr: 1.9922e-05 gnorm: 4.06 [35m[ 0:48:23<2 days, 0:44:38][39m
|
| 401 |
+
[titan] 2026-01-06 21:06:20,891 - root - INFO - [31mstep: 51 [32mloss: 7.7578 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,580 [36mtflops: 144.59 [35mmfu: 46.34%[39m
|
| 402 |
+
[titan] 2026-01-06 21:06:20,891 - root - INFO - [34mlr: 2.0313e-05 gnorm: 5.03 [35m[ 0:49:04<2 days, 0:27:16][39m
|
| 403 |
+
[titan] 2026-01-06 21:07:02,402 - root - INFO - [31mstep: 52 [32mloss: 7.7586 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.45 [35mmfu: 46.30%[39m
|
| 404 |
+
[titan] 2026-01-06 21:07:02,402 - root - INFO - [34mlr: 2.0703e-05 gnorm: 2.52 [35m[ 0:49:46<2 days, 0:10:36][39m
|
| 405 |
+
[titan] 2026-01-06 21:07:43,930 - root - INFO - [31mstep: 53 [32mloss: 7.7823 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.38 [35mmfu: 46.28%[39m
|
| 406 |
+
[titan] 2026-01-06 21:07:43,930 - root - INFO - [34mlr: 2.1094e-05 gnorm: 11.69 [35m[ 0:50:27<1 day, 23:54:33][39m
|
| 407 |
+
[titan] 2026-01-06 21:08:25,460 - root - INFO - [31mstep: 54 [32mloss: 7.7454 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.38 [35mmfu: 46.28%[39m
|
| 408 |
+
[titan] 2026-01-06 21:08:25,460 - root - INFO - [34mlr: 2.1484e-05 gnorm: 10.25 [35m[ 0:51:09<1 day, 23:39:04][39m
|
| 409 |
+
[titan] 2026-01-06 21:09:07,002 - root - INFO - [31mstep: 55 [32mloss: 7.6959 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.34 [35mmfu: 46.26%[39m
|
| 410 |
+
[titan] 2026-01-06 21:09:07,002 - root - INFO - [34mlr: 2.1875e-05 gnorm: 3.77 [35m[ 0:51:50<1 day, 23:24:08][39m
|
| 411 |
+
[titan] 2026-01-06 21:09:48,535 - root - INFO - [31mstep: 56 [32mloss: 7.7100 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 412 |
+
[titan] 2026-01-06 21:09:48,536 - root - INFO - [34mlr: 2.2266e-05 gnorm: 5.50 [35m[ 0:52:32<1 day, 23:09:42][39m
|
| 413 |
+
[titan] 2026-01-06 21:10:30,084 - root - INFO - [31mstep: 57 [32mloss: 7.6427 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.32 [35mmfu: 46.26%[39m
|
| 414 |
+
[titan] 2026-01-06 21:10:30,084 - root - INFO - [34mlr: 2.2656e-05 gnorm: 3.45 [35m[ 0:53:14<1 day, 22:55:46][39m
|
| 415 |
+
[titan] 2026-01-06 21:11:11,627 - root - INFO - [31mstep: 58 [32mloss: 7.7081 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.33 [35mmfu: 46.26%[39m
|
| 416 |
+
[titan] 2026-01-06 21:11:11,628 - root - INFO - [34mlr: 2.3047e-05 gnorm: 7.88 [35m[ 0:53:55<1 day, 22:42:16][39m
|
| 417 |
+
[titan] 2026-01-06 21:11:53,169 - root - INFO - [31mstep: 59 [32mloss: 7.6955 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.34 [35mmfu: 46.26%[39m
|
| 418 |
+
[titan] 2026-01-06 21:11:53,169 - root - INFO - [34mlr: 2.3438e-05 gnorm: 7.16 [35m[ 0:54:37<1 day, 22:29:13][39m
|
| 419 |
+
[titan] 2026-01-06 21:12:34,708 - root - INFO - [31mstep: 60 [32mloss: 7.6458 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.35 [35mmfu: 46.27%[39m
|
| 420 |
+
[titan] 2026-01-06 21:12:34,708 - root - INFO - [34mlr: 2.3828e-05 gnorm: 3.22 [35m[ 0:55:18<1 day, 22:16:35][39m
|
| 421 |
+
[titan] 2026-01-06 21:13:16,244 - root - INFO - [31mstep: 61 [32mloss: 7.6709 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.36 [35mmfu: 46.27%[39m
|
| 422 |
+
[titan] 2026-01-06 21:13:16,244 - root - INFO - [34mlr: 2.4219e-05 gnorm: 7.56 [35m[ 0:56:00<1 day, 22:04:19][39m
|
| 423 |
+
[titan] 2026-01-06 21:13:57,793 - root - INFO - [31mstep: 62 [32mloss: 7.6777 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.31 [35mmfu: 46.25%[39m
|
| 424 |
+
[titan] 2026-01-06 21:13:57,793 - root - INFO - [34mlr: 2.4609e-05 gnorm: 5.00 [35m[ 0:56:41<1 day, 21:52:27][39m
|
| 425 |
+
[titan] 2026-01-06 21:14:39,339 - root - INFO - [31mstep: 63 [32mloss: 7.6421 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.32 [35mmfu: 46.26%[39m
|
| 426 |
+
[titan] 2026-01-06 21:14:39,340 - root - INFO - [34mlr: 2.5000e-05 gnorm: 6.81 [35m[ 0:57:23<1 day, 21:40:56][39m
|
| 427 |
+
[titan] 2026-01-06 21:15:20,872 - root - INFO - [31mstep: 64 [32mloss: 7.6401 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 428 |
+
[titan] 2026-01-06 21:15:20,872 - root - INFO - [34mlr: 2.5391e-05 gnorm: 6.72 [35m[ 0:58:04<1 day, 21:29:45][39m
|
| 429 |
+
[titan] 2026-01-06 21:15:20,872 - root - INFO - Saving the checkpoint (or staging if async is enabled).
|
| 430 |
+
[titan] 2026-01-06 21:15:41,931 - root - INFO - [GC] GC collection invoked by checkpointer. 0.17 seconds.
|
| 431 |
+
[titan] 2026-01-06 21:15:41,932 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 21.06 seconds.
|
| 432 |
+
[titan] 2026-01-06 21:16:23,249 - root - INFO - [31mstep: 65 [32mloss: 7.6475 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,051 [36mtflops: 96.13 [35mmfu: 30.81%[39m
|
| 433 |
+
[titan] 2026-01-06 21:16:23,249 - root - INFO - [34mlr: 2.5781e-05 gnorm: 5.00 [35m[ 0:59:07<1 day, 21:34:57][39m
|
| 434 |
+
[titan] 2026-01-06 21:17:04,689 - root - INFO - [31mstep: 66 [32mloss: 7.7008 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,581 [36mtflops: 144.69 [35mmfu: 46.38%[39m
|
| 435 |
+
[titan] 2026-01-06 21:17:04,689 - root - INFO - [34mlr: 2.6172e-05 gnorm: 9.69 [35m[ 0:59:48<1 day, 21:24:04][39m
|
| 436 |
+
[titan] 2026-01-06 21:17:46,153 - root - INFO - [31mstep: 67 [32mloss: 7.6772 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,581 [36mtflops: 144.61 [35mmfu: 46.35%[39m
|
| 437 |
+
[titan] 2026-01-06 21:17:46,153 - root - INFO - [34mlr: 2.6563e-05 gnorm: 8.06 [35m[ 1:00:30<1 day, 21:13:31][39m
|
| 438 |
+
[titan] 2026-01-06 21:18:27,650 - root - INFO - [31mstep: 68 [32mloss: 7.6251 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.49 [35mmfu: 46.31%[39m
|
| 439 |
+
[titan] 2026-01-06 21:18:27,650 - root - INFO - [34mlr: 2.6953e-05 gnorm: 7.88 [35m[ 1:01:11<1 day, 21:03:16][39m
|
| 440 |
+
[titan] 2026-01-06 21:19:09,166 - root - INFO - [31mstep: 69 [32mloss: 7.6183 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.43 [35mmfu: 46.29%[39m
|
| 441 |
+
[titan] 2026-01-06 21:19:09,166 - root - INFO - [34mlr: 2.7344e-05 gnorm: 4.00 [35m[ 1:01:53<1 day, 20:53:19][39m
|
| 442 |
+
[titan] 2026-01-06 21:19:50,686 - root - INFO - [31mstep: 70 [32mloss: 7.6535 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.41 [35mmfu: 46.29%[39m
|
| 443 |
+
[titan] 2026-01-06 21:19:50,686 - root - INFO - [34mlr: 2.7734e-05 gnorm: 17.75 [35m[ 1:02:34<1 day, 20:43:38][39m
|
| 444 |
+
[titan] 2026-01-06 21:20:32,220 - root - INFO - [31mstep: 71 [32mloss: 7.6713 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 445 |
+
[titan] 2026-01-06 21:20:32,220 - root - INFO - [34mlr: 2.8125e-05 gnorm: 15.69 [35m[ 1:03:16<1 day, 20:34:13][39m
|
| 446 |
+
[titan] 2026-01-06 21:21:13,759 - root - INFO - [31mstep: 72 [32mloss: 7.5969 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.35 [35mmfu: 46.27%[39m
|
| 447 |
+
[titan] 2026-01-06 21:21:13,759 - root - INFO - [34mlr: 2.8516e-05 gnorm: 5.00 [35m[ 1:03:57<1 day, 20:25:03][39m
|
| 448 |
+
[titan] 2026-01-06 21:21:55,296 - root - INFO - [31mstep: 73 [32mloss: 7.6514 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.35 [35mmfu: 46.27%[39m
|
| 449 |
+
[titan] 2026-01-06 21:21:55,296 - root - INFO - [34mlr: 2.8906e-05 gnorm: 7.84 [35m[ 1:04:39<1 day, 20:16:06][39m
|
| 450 |
+
[titan] 2026-01-06 21:22:36,834 - root - INFO - [31mstep: 74 [32mloss: 7.6118 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.35 [35mmfu: 46.27%[39m
|
| 451 |
+
[titan] 2026-01-06 21:22:36,834 - root - INFO - [34mlr: 2.9297e-05 gnorm: 5.53 [35m[ 1:05:20<1 day, 20:07:23][39m
|
| 452 |
+
[titan] 2026-01-06 21:23:18,373 - root - INFO - [31mstep: 75 [32mloss: 7.6545 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.35 [35mmfu: 46.26%[39m
|
| 453 |
+
[titan] 2026-01-06 21:23:18,374 - root - INFO - [34mlr: 2.9687e-05 gnorm: 14.88 [35m[ 1:06:02<1 day, 19:58:52][39m
|
| 454 |
+
[titan] 2026-01-06 21:23:59,909 - root - INFO - [31mstep: 76 [32mloss: 7.6091 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.36 [35mmfu: 46.27%[39m
|
| 455 |
+
[titan] 2026-01-06 21:23:59,909 - root - INFO - [34mlr: 3.0078e-05 gnorm: 15.25 [35m[ 1:06:43<1 day, 19:50:34][39m
|
| 456 |
+
[titan] 2026-01-06 21:24:41,441 - root - INFO - [31mstep: 77 [32mloss: 7.5815 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 457 |
+
[titan] 2026-01-06 21:24:41,442 - root - INFO - [34mlr: 3.0469e-05 gnorm: 4.84 [35m[ 1:07:25<1 day, 19:42:28][39m
|
| 458 |
+
[titan] 2026-01-06 21:25:22,983 - root - INFO - [31mstep: 78 [32mloss: 7.6119 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.34 [35mmfu: 46.26%[39m
|
| 459 |
+
[titan] 2026-01-06 21:25:22,983 - root - INFO - [34mlr: 3.0859e-05 gnorm: 9.06 [35m[ 1:08:06<1 day, 19:34:33][39m
|
| 460 |
+
[titan] 2026-01-06 21:26:04,516 - root - INFO - [31mstep: 79 [32mloss: 7.6418 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 461 |
+
[titan] 2026-01-06 21:26:04,516 - root - INFO - [34mlr: 3.1250e-05 gnorm: 8.25 [35m[ 1:08:48<1 day, 19:26:49][39m
|
| 462 |
+
[titan] 2026-01-06 21:26:46,049 - root - INFO - [31mstep: 80 [32mloss: 7.5575 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 463 |
+
[titan] 2026-01-06 21:26:46,050 - root - INFO - [34mlr: 3.1641e-05 gnorm: 6.97 [35m[ 1:09:29<1 day, 19:19:16][39m
|
| 464 |
+
[titan] 2026-01-06 21:26:46,050 - root - INFO - Saving the checkpoint (or staging if async is enabled).
|
| 465 |
+
[titan] 2026-01-06 21:27:08,315 - root - INFO - [GC] GC collection invoked by checkpointer. 0.18 seconds.
|
| 466 |
+
[titan] 2026-01-06 21:27:08,316 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 22.27 seconds.
|
| 467 |
+
[titan] 2026-01-06 21:27:49,686 - root - INFO - [31mstep: 81 [32mloss: 7.6005 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,030 [36mtflops: 94.22 [35mmfu: 30.20%[39m
|
| 468 |
+
[titan] 2026-01-06 21:27:49,686 - root - INFO - [34mlr: 3.2031e-05 gnorm: 7.19 [35m[ 1:10:33<1 day, 19:25:29][39m
|
| 469 |
+
[titan] 2026-01-06 21:28:31,108 - root - INFO - [31mstep: 82 [32mloss: 7.5774 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,582 [36mtflops: 144.76 [35mmfu: 46.40%[39m
|
| 470 |
+
[titan] 2026-01-06 21:28:31,108 - root - INFO - [34mlr: 3.2422e-05 gnorm: 5.62 [35m[ 1:11:15<1 day, 19:18:01][39m
|
| 471 |
+
[titan] 2026-01-06 21:29:12,555 - root - INFO - [31mstep: 83 [32mloss: 7.6207 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,581 [36mtflops: 144.67 [35mmfu: 46.37%[39m
|
| 472 |
+
[titan] 2026-01-06 21:29:12,555 - root - INFO - [34mlr: 3.2813e-05 gnorm: 4.69 [35m[ 1:11:56<1 day, 19:10:44][39m
|
| 473 |
+
[titan] 2026-01-06 21:29:54,023 - root - INFO - [31mstep: 84 [32mloss: 7.5734 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,580 [36mtflops: 144.59 [35mmfu: 46.34%[39m
|
| 474 |
+
[titan] 2026-01-06 21:29:54,024 - root - INFO - [34mlr: 3.3203e-05 gnorm: 10.75 [35m[ 1:12:37<1 day, 19:03:37][39m
|
| 475 |
+
[titan] 2026-01-06 21:30:35,519 - root - INFO - [31mstep: 85 [32mloss: 7.5241 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.50 [35mmfu: 46.31%[39m
|
| 476 |
+
[titan] 2026-01-06 21:30:35,520 - root - INFO - [34mlr: 3.3594e-05 gnorm: 8.69 [35m[ 1:13:19<1 day, 18:56:41][39m
|
| 477 |
+
[titan] 2026-01-06 21:31:17,030 - root - INFO - [31mstep: 86 [32mloss: 7.5827 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.45 [35mmfu: 46.30%[39m
|
| 478 |
+
[titan] 2026-01-06 21:31:17,030 - root - INFO - [34mlr: 3.3984e-05 gnorm: 7.22 [35m[ 1:14:00<1 day, 18:49:53][39m
|
| 479 |
+
[titan] 2026-01-06 21:31:58,543 - root - INFO - [31mstep: 87 [32mloss: 7.5505 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.44 [35mmfu: 46.29%[39m
|
| 480 |
+
[titan] 2026-01-06 21:31:58,543 - root - INFO - [34mlr: 3.4375e-05 gnorm: 7.91 [35m[ 1:14:42<1 day, 18:43:14][39m
|
| 481 |
+
[titan] 2026-01-06 21:32:40,071 - root - INFO - [31mstep: 88 [32mloss: 7.5143 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.39 [35mmfu: 46.28%[39m
|
| 482 |
+
[titan] 2026-01-06 21:32:40,071 - root - INFO - [34mlr: 3.4766e-05 gnorm: 8.00 [35m[ 1:15:23<1 day, 18:36:43][39m
|
| 483 |
+
[titan] 2026-01-06 21:33:21,599 - root - INFO - [31mstep: 89 [32mloss: 7.5199 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.39 [35mmfu: 46.28%[39m
|
| 484 |
+
[titan] 2026-01-06 21:33:21,599 - root - INFO - [34mlr: 3.5156e-05 gnorm: 8.62 [35m[ 1:16:05<1 day, 18:30:21][39m
|
| 485 |
+
[titan] 2026-01-06 21:34:03,122 - root - INFO - [31mstep: 90 [32mloss: 7.4785 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.40 [35mmfu: 46.28%[39m
|
| 486 |
+
[titan] 2026-01-06 21:34:03,122 - root - INFO - [34mlr: 3.5547e-05 gnorm: 8.12 [35m[ 1:16:47<1 day, 18:24:06][39m
|
| 487 |
+
[titan] 2026-01-06 21:34:44,655 - root - INFO - [31mstep: 91 [32mloss: 7.5003 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 488 |
+
[titan] 2026-01-06 21:34:44,655 - root - INFO - [34mlr: 3.5937e-05 gnorm: 6.97 [35m[ 1:17:28<1 day, 18:17:58][39m
|
| 489 |
+
[titan] 2026-01-06 21:35:26,183 - root - INFO - [31mstep: 92 [32mloss: 7.5113 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.39 [35mmfu: 46.28%[39m
|
| 490 |
+
[titan] 2026-01-06 21:35:26,183 - root - INFO - [34mlr: 3.6328e-05 gnorm: 10.19 [35m[ 1:18:10<1 day, 18:11:58][39m
|
| 491 |
+
[titan] 2026-01-06 21:36:07,712 - root - INFO - [31mstep: 93 [32mloss: 7.4875 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.38 [35mmfu: 46.28%[39m
|
| 492 |
+
[titan] 2026-01-06 21:36:07,712 - root - INFO - [34mlr: 3.6719e-05 gnorm: 4.59 [35m[ 1:18:51<1 day, 18:06:04][39m
|
| 493 |
+
[titan] 2026-01-06 21:36:49,202 - root - INFO - [31mstep: 94 [32mloss: 7.8691 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,580 [36mtflops: 144.52 [35mmfu: 46.32%[39m
|
| 494 |
+
[titan] 2026-01-06 21:36:49,202 - root - INFO - [34mlr: 3.7109e-05 gnorm: 86.50 [35m[ 1:19:33<1 day, 18:00:16][39m
|
| 495 |
+
[titan] 2026-01-06 21:37:30,710 - root - INFO - [31mstep: 95 [32mloss: 7.7993 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.45 [35mmfu: 46.30%[39m
|
| 496 |
+
[titan] 2026-01-06 21:37:30,710 - root - INFO - [34mlr: 3.7500e-05 gnorm: 62.50 [35m[ 1:20:14<1 day, 17:54:34][39m
|
| 497 |
+
[titan] 2026-01-06 21:38:12,247 - root - INFO - [31mstep: 96 [32mloss: 7.6230 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.35 [35mmfu: 46.27%[39m
|
| 498 |
+
[titan] 2026-01-06 21:38:12,247 - root - INFO - [34mlr: 3.7891e-05 gnorm: 17.38 [35m[ 1:20:56<1 day, 17:49:00][39m
|
| 499 |
+
[titan] 2026-01-06 21:38:12,248 - root - INFO - Saving the checkpoint (or staging if async is enabled).
|
| 500 |
+
[titan] 2026-01-06 21:38:32,928 - root - INFO - [GC] GC collection invoked by checkpointer. 0.19 seconds.
|
| 501 |
+
[titan] 2026-01-06 21:38:32,928 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 20.68 seconds.
|
| 502 |
+
[titan] 2026-01-06 21:39:14,269 - root - INFO - [31mstep: 97 [32mloss: 7.5778 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,057 [36mtflops: 96.68 [35mmfu: 30.99%[39m
|
| 503 |
+
[titan] 2026-01-06 21:39:14,269 - root - INFO - [34mlr: 3.8281e-05 gnorm: 17.75 [35m[ 1:21:58<1 day, 17:54:00][39m
|
| 504 |
+
[titan] 2026-01-06 21:39:55,690 - root - INFO - [31mstep: 98 [32mloss: 7.5438 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,582 [36mtflops: 144.76 [35mmfu: 46.40%[39m
|
| 505 |
+
[titan] 2026-01-06 21:39:55,690 - root - INFO - [34mlr: 3.8672e-05 gnorm: 11.75 [35m[ 1:22:39<1 day, 17:48:28][39m
|
| 506 |
+
[titan] 2026-01-06 21:40:37,179 - root - INFO - [31mstep: 99 [32mloss: 7.5091 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,580 [36mtflops: 144.52 [35mmfu: 46.32%[39m
|
| 507 |
+
[titan] 2026-01-06 21:40:37,180 - root - INFO - [34mlr: 3.9063e-05 gnorm: 7.81 [35m[ 1:23:21<1 day, 17:43:04][39m
|
| 508 |
+
[titan] 2026-01-06 21:40:37,201 - root - INFO - [GC] Peforming periodical GC collection. 0.02 seconds.
|
| 509 |
+
[titan] 2026-01-06 21:41:18,706 - root - INFO - [31mstep: 100 [32mloss: 7.4961 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.39 [35mmfu: 46.28%[39m
|
| 510 |
+
[titan] 2026-01-06 21:41:18,706 - root - INFO - [34mlr: 3.9453e-05 gnorm: 7.59 [35m[ 1:24:02<1 day, 17:37:46][39m
|
| 511 |
+
[titan] 2026-01-06 21:42:00,228 - root - INFO - [31mstep: 101 [32mloss: 7.4848 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.41 [35mmfu: 46.28%[39m
|
| 512 |
+
[titan] 2026-01-06 21:42:00,228 - root - INFO - [34mlr: 3.9844e-05 gnorm: 5.97 [35m[ 1:24:44<1 day, 17:32:33][39m
|
| 513 |
+
[titan] 2026-01-06 21:42:41,739 - root - INFO - [31mstep: 102 [32mloss: 7.5118 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.45 [35mmfu: 46.30%[39m
|
| 514 |
+
[titan] 2026-01-06 21:42:41,739 - root - INFO - [34mlr: 4.0234e-05 gnorm: 8.06 [35m[ 1:25:25<1 day, 17:27:26][39m
|
| 515 |
+
[titan] 2026-01-06 21:43:23,265 - root - INFO - [31mstep: 103 [32mloss: 7.4788 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.39 [35mmfu: 46.28%[39m
|
| 516 |
+
[titan] 2026-01-06 21:43:23,265 - root - INFO - [34mlr: 4.0625e-05 gnorm: 10.06 [35m[ 1:26:07<1 day, 17:22:24][39m
|
| 517 |
+
[titan] 2026-01-06 21:44:04,785 - root - INFO - [31mstep: 104 [32mloss: 7.4560 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.41 [35mmfu: 46.29%[39m
|
| 518 |
+
[titan] 2026-01-06 21:44:04,786 - root - INFO - [34mlr: 4.1016e-05 gnorm: 9.50 [35m[ 1:26:48<1 day, 17:17:27][39m
|
| 519 |
+
[titan] 2026-01-06 21:44:46,319 - root - INFO - [31mstep: 105 [32mloss: 7.4534 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 520 |
+
[titan] 2026-01-06 21:44:46,319 - root - INFO - [34mlr: 4.1406e-05 gnorm: 8.44 [35m[ 1:27:30<1 day, 17:12:36][39m
|
| 521 |
+
[titan] 2026-01-06 21:45:27,838 - root - INFO - [31mstep: 106 [32mloss: 7.4770 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.42 [35mmfu: 46.29%[39m
|
| 522 |
+
[titan] 2026-01-06 21:45:27,838 - root - INFO - [34mlr: 4.1797e-05 gnorm: 10.56 [35m[ 1:28:11<1 day, 17:07:48][39m
|
| 523 |
+
[titan] 2026-01-06 21:46:09,374 - root - INFO - [31mstep: 107 [32mloss: 7.4382 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.36 [35mmfu: 46.27%[39m
|
| 524 |
+
[titan] 2026-01-06 21:46:09,374 - root - INFO - [34mlr: 4.2188e-05 gnorm: 13.69 [35m[ 1:28:53<1 day, 17:03:06][39m
|
| 525 |
+
[titan] 2026-01-06 21:46:50,902 - root - INFO - [31mstep: 108 [32mloss: 7.4561 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.39 [35mmfu: 46.28%[39m
|
| 526 |
+
[titan] 2026-01-06 21:46:50,902 - root - INFO - [34mlr: 4.2578e-05 gnorm: 8.69 [35m[ 1:29:34<1 day, 16:58:28][39m
|
| 527 |
+
[titan] 2026-01-06 21:47:32,443 - root - INFO - [31mstep: 109 [32mloss: 7.3967 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.34 [35mmfu: 46.26%[39m
|
| 528 |
+
[titan] 2026-01-06 21:47:32,443 - root - INFO - [34mlr: 4.2969e-05 gnorm: 7.31 [35m[ 1:30:16<1 day, 16:53:55][39m
|
| 529 |
+
[titan] 2026-01-06 21:48:13,976 - root - INFO - [31mstep: 110 [32mloss: 7.4334 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 530 |
+
[titan] 2026-01-06 21:48:13,976 - root - INFO - [34mlr: 4.3359e-05 gnorm: 25.38 [35m[ 1:30:57<1 day, 16:49:25][39m
|
| 531 |
+
[titan] 2026-01-06 21:48:55,511 - root - INFO - [31mstep: 111 [32mloss: 7.4360 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.36 [35mmfu: 46.27%[39m
|
| 532 |
+
[titan] 2026-01-06 21:48:55,511 - root - INFO - [34mlr: 4.3750e-05 gnorm: 10.44 [35m[ 1:31:39<1 day, 16:45:00][39m
|
| 533 |
+
[titan] 2026-01-06 21:49:37,059 - root - INFO - [31mstep: 112 [32mloss: 7.5123 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.32 [35mmfu: 46.26%[39m
|
| 534 |
+
[titan] 2026-01-06 21:49:37,059 - root - INFO - [34mlr: 4.4141e-05 gnorm: 16.88 [35m[ 1:32:20<1 day, 16:40:39][39m
|
| 535 |
+
[titan] 2026-01-06 21:49:37,059 - root - INFO - Saving the checkpoint (or staging if async is enabled).
|
| 536 |
+
[titan] 2026-01-06 21:49:59,573 - root - INFO - [GC] GC collection invoked by checkpointer. 0.14 seconds.
|
| 537 |
+
[titan] 2026-01-06 21:49:59,574 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 22.51 seconds.
|
| 538 |
+
[titan] 2026-01-06 21:50:40,891 - root - INFO - [31mstep: 113 [32mloss: 7.4803 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,027 [36mtflops: 93.93 [35mmfu: 30.11%[39m
|
| 539 |
+
[titan] 2026-01-06 21:50:40,892 - root - INFO - [34mlr: 4.4531e-05 gnorm: 13.06 [35m[ 1:33:24<1 day, 16:46:06][39m
|
| 540 |
+
[titan] 2026-01-06 21:51:22,305 - root - INFO - [31mstep: 114 [32mloss: 7.4859 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,582 [36mtflops: 144.79 [35mmfu: 46.41%[39m
|
| 541 |
+
[titan] 2026-01-06 21:51:22,305 - root - INFO - [34mlr: 4.4922e-05 gnorm: 16.50 [35m[ 1:34:06<1 day, 16:41:44][39m
|
| 542 |
+
[titan] 2026-01-06 21:52:03,747 - root - INFO - [31mstep: 115 [32mloss: 7.4151 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,581 [36mtflops: 144.68 [35mmfu: 46.37%[39m
|
| 543 |
+
[titan] 2026-01-06 21:52:03,748 - root - INFO - [34mlr: 4.5313e-05 gnorm: 13.94 [35m[ 1:34:47<1 day, 16:37:26][39m
|
| 544 |
+
[titan] 2026-01-06 21:52:45,252 - root - INFO - [31mstep: 116 [32mloss: 7.3814 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.47 [35mmfu: 46.30%[39m
|
| 545 |
+
[titan] 2026-01-06 21:52:45,252 - root - INFO - [34mlr: 4.5703e-05 gnorm: 11.69 [35m[ 1:35:29<1 day, 16:33:14][39m
|
| 546 |
+
[titan] 2026-01-06 21:53:26,759 - root - INFO - [31mstep: 117 [32mloss: 7.4033 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.46 [35mmfu: 46.30%[39m
|
| 547 |
+
[titan] 2026-01-06 21:53:26,760 - root - INFO - [34mlr: 4.6094e-05 gnorm: 9.31 [35m[ 1:36:10<1 day, 16:29:06][39m
|
| 548 |
+
[titan] 2026-01-06 21:54:08,279 - root - INFO - [31mstep: 118 [32mloss: 7.4721 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.42 [35mmfu: 46.29%[39m
|
| 549 |
+
[titan] 2026-01-06 21:54:08,279 - root - INFO - [34mlr: 4.6484e-05 gnorm: 20.88 [35m[ 1:36:52<1 day, 16:25:01][39m
|
| 550 |
+
[titan] 2026-01-06 21:54:49,813 - root - INFO - [31mstep: 119 [32mloss: 7.4258 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 551 |
+
[titan] 2026-01-06 21:54:49,813 - root - INFO - [34mlr: 4.6875e-05 gnorm: 16.62 [35m[ 1:37:33<1 day, 16:21:00][39m
|
| 552 |
+
[titan] 2026-01-06 21:55:31,360 - root - INFO - [31mstep: 120 [32mloss: 7.3951 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.32 [35mmfu: 46.26%[39m
|
| 553 |
+
[titan] 2026-01-06 21:55:31,360 - root - INFO - [34mlr: 4.7266e-05 gnorm: 11.38 [35m[ 1:38:15<1 day, 16:17:03][39m
|
| 554 |
+
[titan] 2026-01-06 21:56:12,904 - root - INFO - [31mstep: 121 [32mloss: 7.3984 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.33 [35mmfu: 46.26%[39m
|
| 555 |
+
[titan] 2026-01-06 21:56:12,904 - root - INFO - [34mlr: 4.7656e-05 gnorm: 10.19 [35m[ 1:38:56<1 day, 16:13:09][39m
|
| 556 |
+
[titan] 2026-01-06 21:56:54,444 - root - INFO - [31mstep: 122 [32mloss: 7.5098 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.34 [35mmfu: 46.26%[39m
|
| 557 |
+
[titan] 2026-01-06 21:56:54,444 - root - INFO - [34mlr: 4.8047e-05 gnorm: 19.38 [35m[ 1:39:38<1 day, 16:09:18][39m
|
| 558 |
+
[titan] 2026-01-06 21:57:35,983 - root - INFO - [31mstep: 123 [32mloss: 7.4071 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.35 [35mmfu: 46.27%[39m
|
| 559 |
+
[titan] 2026-01-06 21:57:35,983 - root - INFO - [34mlr: 4.8438e-05 gnorm: 13.25 [35m[ 1:40:19<1 day, 16:05:30][39m
|
| 560 |
+
[titan] 2026-01-06 21:58:17,525 - root - INFO - [31mstep: 124 [32mloss: 7.4271 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.34 [35mmfu: 46.26%[39m
|
| 561 |
+
[titan] 2026-01-06 21:58:17,525 - root - INFO - [34mlr: 4.8828e-05 gnorm: 11.88 [35m[ 1:41:01<1 day, 16:01:45][39m
|
| 562 |
+
[titan] 2026-01-06 21:58:59,075 - root - INFO - [31mstep: 125 [32mloss: 7.3603 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.31 [35mmfu: 46.25%[39m
|
| 563 |
+
[titan] 2026-01-06 21:58:59,075 - root - INFO - [34mlr: 4.9219e-05 gnorm: 11.50 [35m[ 1:41:42<1 day, 15:58:03][39m
|
| 564 |
+
[titan] 2026-01-06 21:59:40,618 - root - INFO - [31mstep: 126 [32mloss: 7.3625 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.33 [35mmfu: 46.26%[39m
|
| 565 |
+
[titan] 2026-01-06 21:59:40,619 - root - INFO - [34mlr: 4.9609e-05 gnorm: 9.88 [35m[ 1:42:24<1 day, 15:54:24][39m
|
| 566 |
+
[titan] 2026-01-06 22:00:22,155 - root - INFO - [31mstep: 127 [32mloss: 7.3691 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.36 [35mmfu: 46.27%[39m
|
| 567 |
+
[titan] 2026-01-06 22:00:22,155 - root - INFO - [34mlr: 5.0000e-05 gnorm: 11.88 [35m[ 1:43:06<1 day, 15:50:48][39m
|
| 568 |
+
[titan] 2026-01-06 22:01:03,694 - root - INFO - [31mstep: 128 [32mloss: 7.3331 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.35 [35mmfu: 46.27%[39m
|
| 569 |
+
[titan] 2026-01-06 22:01:03,694 - root - INFO - [34mlr: 5.0391e-05 gnorm: 11.56 [35m[ 1:43:47<1 day, 15:47:14][39m
|
| 570 |
+
[titan] 2026-01-06 22:01:03,694 - root - INFO - Saving the checkpoint (or staging if async is enabled).
|
| 571 |
+
[titan] 2026-01-06 22:01:24,068 - root - INFO - [GC] GC collection invoked by checkpointer. 0.20 seconds.
|
| 572 |
+
[titan] 2026-01-06 22:01:24,068 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 20.37 seconds.
|
| 573 |
+
[titan] 2026-01-06 22:02:05,453 - root - INFO - [31mstep: 129 [32mloss: 7.2878 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,061 [36mtflops: 97.09 [35mmfu: 31.12%[39m
|
| 574 |
+
[titan] 2026-01-06 22:02:05,454 - root - INFO - [34mlr: 5.0781e-05 gnorm: 6.16 [35m[ 1:44:49<1 day, 15:51:24][39m
|
| 575 |
+
[titan] 2026-01-06 22:02:46,875 - root - INFO - [31mstep: 130 [32mloss: 7.7017 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,582 [36mtflops: 144.76 [35mmfu: 46.40%[39m
|
| 576 |
+
[titan] 2026-01-06 22:02:46,876 - root - INFO - [34mlr: 5.1172e-05 gnorm: 70.00 [35m[ 1:45:30<1 day, 15:47:50][39m
|
| 577 |
+
[titan] 2026-01-06 22:03:28,339 - root - INFO - [31mstep: 131 [32mloss: 7.5220 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,581 [36mtflops: 144.61 [35mmfu: 46.35%[39m
|
| 578 |
+
[titan] 2026-01-06 22:03:28,339 - root - INFO - [34mlr: 5.1562e-05 gnorm: 44.75 [35m[ 1:46:12<1 day, 15:44:18][39m
|
| 579 |
+
[titan] 2026-01-06 22:04:09,859 - root - INFO - [31mstep: 132 [32mloss: 7.4566 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.42 [35mmfu: 46.29%[39m
|
| 580 |
+
[titan] 2026-01-06 22:04:09,859 - root - INFO - [34mlr: 5.1953e-05 gnorm: 13.50 [35m[ 1:46:53<1 day, 15:40:51][39m
|
| 581 |
+
[titan] 2026-01-06 22:04:51,387 - root - INFO - [31mstep: 133 [32mloss: 7.4026 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.39 [35mmfu: 46.28%[39m
|
| 582 |
+
[titan] 2026-01-06 22:04:51,387 - root - INFO - [34mlr: 5.2344e-05 gnorm: 10.12 [35m[ 1:47:35<1 day, 15:37:27][39m
|
| 583 |
+
[titan] 2026-01-06 22:05:32,919 - root - INFO - [31mstep: 134 [32mloss: 7.4092 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 584 |
+
[titan] 2026-01-06 22:05:32,920 - root - INFO - [34mlr: 5.2734e-05 gnorm: 14.88 [35m[ 1:48:16<1 day, 15:34:04][39m
|
| 585 |
+
[titan] 2026-01-06 22:06:14,471 - root - INFO - [31mstep: 135 [32mloss: 7.3827 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.30 [35mmfu: 46.25%[39m
|
| 586 |
+
[titan] 2026-01-06 22:06:14,471 - root - INFO - [34mlr: 5.3125e-05 gnorm: 18.88 [35m[ 1:48:58<1 day, 15:30:45][39m
|
| 587 |
+
[titan] 2026-01-06 22:06:56,027 - root - INFO - [31mstep: 136 [32mloss: 7.4021 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.29 [35mmfu: 46.25%[39m
|
| 588 |
+
[titan] 2026-01-06 22:06:56,028 - root - INFO - [34mlr: 5.3516e-05 gnorm: 12.81 [35m[ 1:49:39<1 day, 15:27:28][39m
|
| 589 |
+
[titan] 2026-01-06 22:07:37,581 - root - INFO - [31mstep: 137 [32mloss: 7.4064 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.30 [35mmfu: 46.25%[39m
|
| 590 |
+
[titan] 2026-01-06 22:07:37,581 - root - INFO - [34mlr: 5.3906e-05 gnorm: 7.19 [35m[ 1:50:21<1 day, 15:24:14][39m
|
| 591 |
+
[titan] 2026-01-06 22:08:19,129 - root - INFO - [31mstep: 138 [32mloss: 7.4774 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.32 [35mmfu: 46.26%[39m
|
| 592 |
+
[titan] 2026-01-06 22:08:19,129 - root - INFO - [34mlr: 5.4297e-05 gnorm: 22.62 [35m[ 1:51:03<1 day, 15:21:01][39m
|
| 593 |
+
[titan] 2026-01-06 22:09:00,687 - root - INFO - [31mstep: 139 [32mloss: 7.4281 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.28 [35mmfu: 46.24%[39m
|
| 594 |
+
[titan] 2026-01-06 22:09:00,688 - root - INFO - [34mlr: 5.4688e-05 gnorm: 11.00 [35m[ 1:51:44<1 day, 15:17:51][39m
|
| 595 |
+
[titan] 2026-01-06 22:09:42,228 - root - INFO - [31mstep: 140 [32mloss: 7.5633 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.34 [35mmfu: 46.26%[39m
|
| 596 |
+
[titan] 2026-01-06 22:09:42,228 - root - INFO - [34mlr: 5.5078e-05 gnorm: 19.75 [35m[ 1:52:26<1 day, 15:14:42][39m
|
| 597 |
+
[titan] 2026-01-06 22:10:23,790 - root - INFO - [31mstep: 141 [32mloss: 7.5423 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.27 [35mmfu: 46.24%[39m
|
| 598 |
+
[titan] 2026-01-06 22:10:23,790 - root - INFO - [34mlr: 5.5469e-05 gnorm: 17.25 [35m[ 1:53:07<1 day, 15:11:36][39m
|
| 599 |
+
[titan] 2026-01-06 22:11:05,349 - root - INFO - [31mstep: 142 [32mloss: 7.4047 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.28 [35mmfu: 46.24%[39m
|
| 600 |
+
[titan] 2026-01-06 22:11:05,349 - root - INFO - [34mlr: 5.5859e-05 gnorm: 9.94 [35m[ 1:53:49<1 day, 15:08:33][39m
|
| 601 |
+
[titan] 2026-01-06 22:11:46,904 - root - INFO - [31mstep: 143 [32mloss: 7.5261 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.29 [35mmfu: 46.25%[39m
|
| 602 |
+
[titan] 2026-01-06 22:11:46,905 - root - INFO - [34mlr: 5.6250e-05 gnorm: 25.75 [35m[ 1:54:30<1 day, 15:05:31][39m
|
| 603 |
+
[titan] 2026-01-06 22:12:28,461 - root - INFO - [31mstep: 144 [32mloss: 7.4217 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.29 [35mmfu: 46.25%[39m
|
| 604 |
+
[titan] 2026-01-06 22:12:28,461 - root - INFO - [34mlr: 5.6641e-05 gnorm: 18.00 [35m[ 1:55:12<1 day, 15:02:30][39m
|
| 605 |
+
[titan] 2026-01-06 22:12:28,461 - root - INFO - Saving the checkpoint (or staging if async is enabled).
|
| 606 |
+
[titan] 2026-01-06 22:12:50,172 - root - INFO - [GC] GC collection invoked by checkpointer. 0.19 seconds.
|
| 607 |
+
[titan] 2026-01-06 22:12:50,172 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 21.71 seconds.
|
| 608 |
+
[titan] 2026-01-06 22:13:31,510 - root - INFO - [31mstep: 145 [32mloss: 7.3958 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,039 [36mtflops: 95.10 [35mmfu: 30.48%[39m
|
| 609 |
+
[titan] 2026-01-06 22:13:31,510 - root - INFO - [34mlr: 5.7031e-05 gnorm: 11.69 [35m[ 1:56:15<1 day, 15:06:46][39m
|
| 610 |
+
[titan] 2026-01-06 22:14:12,944 - root - INFO - [31mstep: 146 [32mloss: 7.4073 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,582 [36mtflops: 144.71 [35mmfu: 46.38%[39m
|
| 611 |
+
[titan] 2026-01-06 22:14:12,944 - root - INFO - [34mlr: 5.7422e-05 gnorm: 11.25 [35m[ 1:56:56<1 day, 15:03:44][39m
|
| 612 |
+
[titan] 2026-01-06 22:14:54,370 - root - INFO - [31mstep: 147 [32mloss: 7.3301 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,582 [36mtflops: 144.74 [35mmfu: 46.39%[39m
|
| 613 |
+
[titan] 2026-01-06 22:14:54,371 - root - INFO - [34mlr: 5.7813e-05 gnorm: 7.34 [35m[ 1:57:38<1 day, 15:00:44][39m
|
| 614 |
+
[titan] 2026-01-06 22:15:35,825 - root - INFO - [31mstep: 148 [32mloss: 7.3624 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,581 [36mtflops: 144.64 [35mmfu: 46.36%[39m
|
| 615 |
+
[titan] 2026-01-06 22:15:35,825 - root - INFO - [34mlr: 5.8203e-05 gnorm: 17.38 [35m[ 1:58:19<1 day, 14:57:47][39m
|
| 616 |
+
[titan] 2026-01-06 22:16:17,356 - root - INFO - [31mstep: 149 [32mloss: 7.2913 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 617 |
+
[titan] 2026-01-06 22:16:17,357 - root - INFO - [34mlr: 5.8594e-05 gnorm: 3.80 [35m[ 1:59:01<1 day, 14:54:52][39m
|
| 618 |
+
[titan] 2026-01-06 22:16:17,393 - root - INFO - [GC] Peforming periodical GC collection. 0.04 seconds.
|
| 619 |
+
[titan] 2026-01-06 22:16:58,923 - root - INFO - [31mstep: 150 [32mloss: 7.3146 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.25 [35mmfu: 46.23%[39m
|
| 620 |
+
[titan] 2026-01-06 22:16:58,923 - root - INFO - [34mlr: 5.8984e-05 gnorm: 7.06 [35m[ 1:59:42<1 day, 14:52:01][39m
|
logs/none_4cvjdbqa/attempt_0/2/stdout.log
ADDED
|
File without changes
|
logs/none_4cvjdbqa/attempt_0/3/stderr.log
ADDED
|
@@ -0,0 +1,620 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[titan] 2026-01-06 20:23:28,613 - root - INFO - Starting job: default job
|
| 2 |
+
[titan] 2026-01-06 20:23:28,613 - root - INFO - [32m{
|
| 3 |
+
"activation_checkpoint": {
|
| 4 |
+
"mode": "none",
|
| 5 |
+
"selective_ac_option": "2"
|
| 6 |
+
},
|
| 7 |
+
"activation_offload": {
|
| 8 |
+
"mode": "none"
|
| 9 |
+
},
|
| 10 |
+
"checkpoint": {
|
| 11 |
+
"async_mode": "disabled",
|
| 12 |
+
"create_seed_checkpoint": false,
|
| 13 |
+
"enable_checkpoint": true,
|
| 14 |
+
"exclude_from_loading": [],
|
| 15 |
+
"export_dtype": "float32",
|
| 16 |
+
"folder": "checkpoint",
|
| 17 |
+
"initial_load_model_weights_only": true,
|
| 18 |
+
"initial_load_path": null,
|
| 19 |
+
"interval": 16,
|
| 20 |
+
"interval_type": "steps",
|
| 21 |
+
"keep_latest_k": 0,
|
| 22 |
+
"last_save_model_weights_only": false,
|
| 23 |
+
"load_step": -1,
|
| 24 |
+
"model_weights_only": false
|
| 25 |
+
},
|
| 26 |
+
"comm": {
|
| 27 |
+
"init_timeout_seconds": 300,
|
| 28 |
+
"trace_buf_size": 20000,
|
| 29 |
+
"train_timeout_seconds": 100
|
| 30 |
+
},
|
| 31 |
+
"experimental": {
|
| 32 |
+
"context_parallel_degree": 1,
|
| 33 |
+
"context_parallel_rotate_method": "allgather",
|
| 34 |
+
"custom_model_path": "",
|
| 35 |
+
"enable_async_tensor_parallel": false,
|
| 36 |
+
"enable_compiled_autograd": false,
|
| 37 |
+
"pipeline_parallel_degree": 1,
|
| 38 |
+
"pipeline_parallel_microbatches": null,
|
| 39 |
+
"pipeline_parallel_schedule": "1F1B",
|
| 40 |
+
"pipeline_parallel_schedule_csv": "",
|
| 41 |
+
"pipeline_parallel_split_points": []
|
| 42 |
+
},
|
| 43 |
+
"fault_tolerance": {
|
| 44 |
+
"enable": false,
|
| 45 |
+
"group_size": 0,
|
| 46 |
+
"min_replica_size": 1,
|
| 47 |
+
"replica_id": 0
|
| 48 |
+
},
|
| 49 |
+
"float8": {
|
| 50 |
+
"enable_fsdp_float8_all_gather": false,
|
| 51 |
+
"force_recompute_fp8_weight_in_bwd": false,
|
| 52 |
+
"precompute_float8_dynamic_scale_for_fsdp": false,
|
| 53 |
+
"recipe_name": null
|
| 54 |
+
},
|
| 55 |
+
"job": {
|
| 56 |
+
"config_file": "flame/models/fla.toml",
|
| 57 |
+
"description": "default job",
|
| 58 |
+
"dump_folder": "/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B",
|
| 59 |
+
"print_args": true,
|
| 60 |
+
"use_for_integration_test": false
|
| 61 |
+
},
|
| 62 |
+
"lr_scheduler": {
|
| 63 |
+
"decay_ratio": null,
|
| 64 |
+
"decay_type": "cosine",
|
| 65 |
+
"lr_min": 0.1,
|
| 66 |
+
"warmup_steps": 1024
|
| 67 |
+
},
|
| 68 |
+
"memory_estimation": {
|
| 69 |
+
"disable_fake_mode": false,
|
| 70 |
+
"enabled": false
|
| 71 |
+
},
|
| 72 |
+
"metrics": {
|
| 73 |
+
"disable_color_printing": false,
|
| 74 |
+
"enable_tensorboard": false,
|
| 75 |
+
"enable_wandb": true,
|
| 76 |
+
"log_freq": 1,
|
| 77 |
+
"save_for_all_ranks": false,
|
| 78 |
+
"save_tb_folder": "tb"
|
| 79 |
+
},
|
| 80 |
+
"model": {
|
| 81 |
+
"config": "/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/config.json",
|
| 82 |
+
"converters": [],
|
| 83 |
+
"name": "fla",
|
| 84 |
+
"print_after_conversion": false,
|
| 85 |
+
"tokenizer_path": "/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B"
|
| 86 |
+
},
|
| 87 |
+
"optimizer": {
|
| 88 |
+
"beta1": 0.9,
|
| 89 |
+
"beta2": 0.95,
|
| 90 |
+
"early_step_in_backward": false,
|
| 91 |
+
"eps": 1e-15,
|
| 92 |
+
"implementation": "fused",
|
| 93 |
+
"lr": 0.0004,
|
| 94 |
+
"name": "AdamW",
|
| 95 |
+
"weight_decay": 0.1
|
| 96 |
+
},
|
| 97 |
+
"profiling": {
|
| 98 |
+
"enable_memory_snapshot": false,
|
| 99 |
+
"enable_profiling": true,
|
| 100 |
+
"profile_freq": 512,
|
| 101 |
+
"save_memory_snapshot_folder": "memory_snapshot",
|
| 102 |
+
"save_traces_folder": "profile_trace"
|
| 103 |
+
},
|
| 104 |
+
"training": {
|
| 105 |
+
"batch_size": 2,
|
| 106 |
+
"compile": true,
|
| 107 |
+
"context_len": 2048,
|
| 108 |
+
"data_dir": null,
|
| 109 |
+
"data_files": null,
|
| 110 |
+
"data_parallel_replicate_degree": 1,
|
| 111 |
+
"data_parallel_shard_degree": 8,
|
| 112 |
+
"data_probs": null,
|
| 113 |
+
"dataset": "/mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu",
|
| 114 |
+
"dataset_name": "default",
|
| 115 |
+
"dataset_split": "train",
|
| 116 |
+
"deterministic": false,
|
| 117 |
+
"disable_loss_parallel": true,
|
| 118 |
+
"enable_cpu_offload": false,
|
| 119 |
+
"fsdp_reshard_after_forward": "default",
|
| 120 |
+
"gc_freq": 50,
|
| 121 |
+
"gradient_accumulation_steps": 16,
|
| 122 |
+
"max_norm": 1.0,
|
| 123 |
+
"mixed_precision_param": "bfloat16",
|
| 124 |
+
"mixed_precision_reduce": "float32",
|
| 125 |
+
"num_workers": 8,
|
| 126 |
+
"persistent_workers": false,
|
| 127 |
+
"pin_memory": false,
|
| 128 |
+
"prefetch_factor": 2,
|
| 129 |
+
"seed": 42,
|
| 130 |
+
"seq_len": 2048,
|
| 131 |
+
"skip_nan_inf": true,
|
| 132 |
+
"steps": 3072,
|
| 133 |
+
"streaming": true,
|
| 134 |
+
"tensor_parallel_degree": 1,
|
| 135 |
+
"varlen": false
|
| 136 |
+
}
|
| 137 |
+
}[39m
|
| 138 |
+
[titan] 2026-01-06 20:23:28,614 - root - INFO - [GC] Initial GC collection. 0.00 seconds.
|
| 139 |
+
[titan] 2026-01-06 20:23:29,998 - root - WARNING - ENV[TORCH_NCCL_ASYNC_ERROR_HANDLING] = 1 will be overridden to 3 based on job config
|
| 140 |
+
[titan] 2026-01-06 20:23:30,002 - root - INFO - CUDA capacity: NVIDIA A100-SXM4-80GB with 79.25GiB memory
|
| 141 |
+
[titan] 2026-01-06 20:23:30,004 - root - WARNING - Error running lspci: [Errno 2] No such file or directory: 'lspci', fallback to use device_name
|
| 142 |
+
[titan] 2026-01-06 20:23:30,004 - root - INFO - Peak FLOPS used for computing MFU: 3.120e+14
|
| 143 |
+
[titan] 2026-01-06 20:23:30,004 - root - INFO - Building 1-D device mesh with ['dp_shard'], [8]
|
| 144 |
+
[titan] 2026-01-06 20:23:30,051 - root - INFO - Loading tokenizer...
|
| 145 |
+
The tokenizer you are loading from '/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.
|
| 146 |
+
[titan] 2026-01-06 20:23:30,412 - root - INFO - Qwen2TokenizerFast(name_or_path='/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B', vocab_size=151643, model_max_length=10000000000, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'eos_token': '<|im_end|>', 'pad_token': '<|endoftext|>', 'additional_special_tokens': ['<|im_start|>', '<|im_end|>', '<|object_ref_start|>', '<|object_ref_end|>', '<|box_start|>', '<|box_end|>', '<|quad_start|>', '<|quad_end|>', '<|vision_start|>', '<|vision_end|>', '<|vision_pad|>', '<|image_pad|>', '<|video_pad|>']}, clean_up_tokenization_spaces=False, added_tokens_decoder={
|
| 147 |
+
151643: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 148 |
+
151644: AddedToken("<|im_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 149 |
+
151645: AddedToken("<|im_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 150 |
+
151646: AddedToken("<|object_ref_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 151 |
+
151647: AddedToken("<|object_ref_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 152 |
+
151648: AddedToken("<|box_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 153 |
+
151649: AddedToken("<|box_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 154 |
+
151650: AddedToken("<|quad_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 155 |
+
151651: AddedToken("<|quad_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 156 |
+
151652: AddedToken("<|vision_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 157 |
+
151653: AddedToken("<|vision_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 158 |
+
151654: AddedToken("<|vision_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 159 |
+
151655: AddedToken("<|image_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 160 |
+
151656: AddedToken("<|video_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 161 |
+
151657: AddedToken("<tool_call>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 162 |
+
151658: AddedToken("</tool_call>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 163 |
+
151659: AddedToken("<|fim_prefix|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 164 |
+
151660: AddedToken("<|fim_middle|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 165 |
+
151661: AddedToken("<|fim_suffix|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 166 |
+
151662: AddedToken("<|fim_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 167 |
+
151663: AddedToken("<|repo_name|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 168 |
+
151664: AddedToken("<|file_sep|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 169 |
+
151665: AddedToken("<tool_response>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 170 |
+
151666: AddedToken("</tool_response>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 171 |
+
151667: AddedToken("<think>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 172 |
+
151668: AddedToken("</think>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 173 |
+
}
|
| 174 |
+
)
|
| 175 |
+
[titan] 2026-01-06 20:23:30,412 - root - INFO - Loading dataset /mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu:default
|
| 176 |
+
`trust_remote_code` is not supported anymore.
|
| 177 |
+
Please check that the Hugging Face dataset '/mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu' isn't based on a loading script and remove `trust_remote_code`.
|
| 178 |
+
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
|
| 179 |
+
[titan] 2026-01-06 20:23:30,412 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
|
| 180 |
+
Please check that the Hugging Face dataset '/mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu' isn't based on a loading script and remove `trust_remote_code`.
|
| 181 |
+
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
|
| 182 |
+
[titan] 2026-01-06 20:23:30,977 - root - INFO - Shuffling the dataset with seed 42
|
| 183 |
+
[titan] 2026-01-06 20:23:30,978 - root - INFO - IterableDataset({
|
| 184 |
+
features: ['text', 'id', 'dump', 'url', 'file_path', 'language', 'language_score', 'token_count', 'score', 'int_score'],
|
| 185 |
+
num_shards: 360
|
| 186 |
+
})
|
| 187 |
+
[titan] 2026-01-06 20:23:30,978 - root - INFO - Building dataloader...
|
| 188 |
+
[titan] 2026-01-06 20:23:30,980 - root - INFO - Loading model config from /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/config.json
|
| 189 |
+
[titan] 2026-01-06 20:23:30,981 - root - INFO - Building model from the config
|
| 190 |
+
[32mGSAConfig {
|
| 191 |
+
"architectures": [
|
| 192 |
+
"GSAForCausalLM"
|
| 193 |
+
],
|
| 194 |
+
"attn": null,
|
| 195 |
+
"bos_token_id": 151643,
|
| 196 |
+
"clamp_max": null,
|
| 197 |
+
"clamp_min": null,
|
| 198 |
+
"conv_size": 4,
|
| 199 |
+
"dtype": "bfloat16",
|
| 200 |
+
"elementwise_affine": false,
|
| 201 |
+
"eos_token_id": 151645,
|
| 202 |
+
"expand_k": 1,
|
| 203 |
+
"expand_v": 1,
|
| 204 |
+
"feature_map": "swish",
|
| 205 |
+
"fuse_cross_entropy": true,
|
| 206 |
+
"fuse_linear_cross_entropy": false,
|
| 207 |
+
"fuse_norm": true,
|
| 208 |
+
"fuse_swiglu": true,
|
| 209 |
+
"gate_logit_normalizer": 8,
|
| 210 |
+
"hidden_act": "swish",
|
| 211 |
+
"hidden_ratio": 4,
|
| 212 |
+
"hidden_size": 5120,
|
| 213 |
+
"initializer_range": 0.02,
|
| 214 |
+
"intermediate_size": 17408,
|
| 215 |
+
"max_position_embeddings": 40960,
|
| 216 |
+
"model_type": "gsa",
|
| 217 |
+
"norm_eps": 1e-06,
|
| 218 |
+
"num_heads": 40,
|
| 219 |
+
"num_hidden_layers": 40,
|
| 220 |
+
"num_kv_heads": 8,
|
| 221 |
+
"num_slots": 256,
|
| 222 |
+
"rope_theta": 1000000,
|
| 223 |
+
"share_conv_kernel": true,
|
| 224 |
+
"tie_word_embeddings": true,
|
| 225 |
+
"transformers_version": "4.57.3",
|
| 226 |
+
"use_cache": true,
|
| 227 |
+
"use_l2warp": false,
|
| 228 |
+
"use_norm": true,
|
| 229 |
+
"use_output_gate": true,
|
| 230 |
+
"use_rope": false,
|
| 231 |
+
"use_short_conv": false,
|
| 232 |
+
"vocab_size": 151936
|
| 233 |
+
}
|
| 234 |
+
[39m
|
| 235 |
+
[titan] 2026-01-06 20:23:31,129 - root - INFO - [34m
|
| 236 |
+
GSAForCausalLM(
|
| 237 |
+
(model): GSAModel(
|
| 238 |
+
(embeddings): Embedding(151936, 5120)
|
| 239 |
+
(layers): ModuleList(
|
| 240 |
+
(0-39): 40 x GSABlock(
|
| 241 |
+
(attn_norm): RMSNorm(5120, eps=1e-06)
|
| 242 |
+
(attn): GatedSlotAttention(
|
| 243 |
+
(feature_map): SwishFeatureMap()
|
| 244 |
+
(q_proj): Linear(in_features=5120, out_features=5120, bias=False)
|
| 245 |
+
(k_proj): Linear(in_features=5120, out_features=1024, bias=False)
|
| 246 |
+
(v_proj): Linear(in_features=5120, out_features=1024, bias=False)
|
| 247 |
+
(f_proj): Linear(in_features=5120, out_features=2048, bias=False)
|
| 248 |
+
(g_norm): RMSNorm(5120, elementwise_affine=False, eps=1e-06)
|
| 249 |
+
(o_proj): Linear(in_features=5120, out_features=5120, bias=False)
|
| 250 |
+
)
|
| 251 |
+
(mlp_norm): RMSNorm(5120, eps=1e-06)
|
| 252 |
+
(mlp): GatedMLP(
|
| 253 |
+
(gate_proj): Linear(in_features=5120, out_features=17408, bias=False)
|
| 254 |
+
(up_proj): Linear(in_features=5120, out_features=17408, bias=False)
|
| 255 |
+
(down_proj): Linear(in_features=17408, out_features=5120, bias=False)
|
| 256 |
+
(swiglu_linear): SwiGLULinear()
|
| 257 |
+
)
|
| 258 |
+
)
|
| 259 |
+
)
|
| 260 |
+
(norm): RMSNorm(5120, eps=1e-06)
|
| 261 |
+
)
|
| 262 |
+
(lm_head): Linear(in_features=5120, out_features=151936, bias=False)
|
| 263 |
+
)[39m
|
| 264 |
+
|
| 265 |
+
[titan] 2026-01-06 20:23:31,186 - root - INFO - Compiling each block with torch.compile
|
| 266 |
+
[titan] 2026-01-06 20:23:31,186 - root - INFO - Compiling the embedding, norm, and lm_head layers with torch.compile
|
| 267 |
+
[titan] 2026-01-06 20:23:31,187 - root - INFO - Compiling the entire model with torch.compile
|
| 268 |
+
[titan] 2026-01-06 20:23:31,334 - root - INFO - Applied FSDP to the model
|
| 269 |
+
[titan] 2026-01-06 20:23:31,714 - root - INFO - CUDA memory usage for model: 3.56GiB(4.49%)
|
| 270 |
+
[titan] 2026-01-06 20:23:31,761 - root - INFO - Checkpointing active. Checkpoints will be loaded from and saved to /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/checkpoint
|
| 271 |
+
[titan] 2026-01-06 20:23:31,763 - root - INFO - Loading the checkpoint from /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/checkpoint/step-1.
|
| 272 |
+
[titan] 2026-01-06 20:24:20,456 - root - INFO - [GC] GC collection for checkpoint loading. 0.02 seconds.
|
| 273 |
+
[titan] 2026-01-06 20:24:20,457 - root - INFO - Finished loading the checkpoint in 48.69 seconds.
|
| 274 |
+
[titan] 2026-01-06 20:24:20,674 - root - INFO - CUDA capacity: NVIDIA A100-SXM4-80GB with 79.25GiB memory
|
| 275 |
+
[titan] 2026-01-06 20:24:20,676 - root - WARNING - Error running lspci: [Errno 2] No such file or directory: 'lspci', fallback to use device_name
|
| 276 |
+
[titan] 2026-01-06 20:24:23,243 - root - INFO - Mixed precision training is handled by fully_shard
|
| 277 |
+
[titan] 2026-01-06 20:24:23,243 - root - INFO - [31m***** Running training *****[39m
|
| 278 |
+
[titan] 2026-01-06 20:24:23,243 - root - INFO - [32m Training starts at step 2
|
| 279 |
+
[titan] 2026-01-06 20:24:23,243 - root - INFO - [32m Number of tokens per sequence = 2,048
|
| 280 |
+
[titan] 2026-01-06 20:24:23,243 - root - INFO - [32m Gradient Accumulation steps = 16
|
| 281 |
+
[titan] 2026-01-06 20:24:23,243 - root - INFO - [32m Instantaneous batch size (per device) = 2
|
| 282 |
+
[titan] 2026-01-06 20:24:23,243 - root - INFO - [32m Global batch size (w. parallel, distributed & accumulation) = 256 (524,288 tokens)
|
| 283 |
+
[titan] 2026-01-06 20:24:23,243 - root - INFO - [32m Total optimization steps = 3,072 (1,610,612,736 tokens)
|
| 284 |
+
[titan] 2026-01-06 20:24:23,243 - root - INFO - [32m Warmup steps = 1,024 (536,870,912 tokens)
|
| 285 |
+
[titan] 2026-01-06 20:24:23,243 - root - INFO - [32m Number of parameters = 14,409,815,040 [39m
|
| 286 |
+
[titan] 2026-01-06 20:24:23,243 - root - INFO - Profiling active. Traces will be saved at /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/profile_trace
|
| 287 |
+
/mnt/scratch/home/carlos/leman-labs/flame/.venv/lib/python3.13/site-packages/torch/_dynamo/variables/functions.py:1692: UserWarning: Dynamo detected a call to a `functools.lru_cache`-wrapped function. Dynamo ignores the cache wrapper and directly traces the wrapped function. Silent incorrectness is only a *potential* risk, not something we have observed. Enable TORCH_LOGS="+dynamo" for a DEBUG stack trace.
|
| 288 |
+
torch._dynamo.utils.warn_once(msg)
|
| 289 |
+
/mnt/scratch/home/carlos/leman-labs/flame/.venv/lib/python3.13/site-packages/torch/_dynamo/variables/functions.py:1598: UserWarning: Dynamo does not know how to trace the builtin `cuda_utils.get_device_properties.` This function is either a Python builtin (e.g. _warnings.warn) or a third-party C/C++ Python extension (perhaps created with pybind).
|
| 290 |
+
If it is a Python builtin, please file an issue on GitHub so the PyTorch team can add support for it and see the next case for a workaround.
|
| 291 |
+
If it is a third-party C/C++ Python extension, please either wrap it into a PyTorch-understood custom operator (see https://pytorch.org/tutorials/advanced/custom_ops_landing_page.html for more details) or, if it is traceable, use `torch.compiler.allow_in_graph`.
|
| 292 |
+
torch._dynamo.utils.warn_once(explanation + "\n" + "\n".join(hints))
|
| 293 |
+
[titan] 2026-01-06 20:31:17,558 - root - INFO - [31mstep: 2 [32mloss: 14.3989 [33mmemory: 71.94GiB(90.77%) [34mtps: 157 [36mtflops: 14.38 [35mmfu: 4.61%[39m
|
| 294 |
+
[titan] 2026-01-06 20:31:17,558 - root - INFO - [34mlr: 1.1719e-06 gnorm: 127.00 [35m[ 0:14:01<14 days, 22:49:11][39m
|
| 295 |
+
[titan] 2026-01-06 20:31:58,854 - root - INFO - [31mstep: 3 [32mloss: 14.3925 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,587 [36mtflops: 145.20 [35mmfu: 46.54%[39m
|
| 296 |
+
[titan] 2026-01-06 20:31:58,854 - root - INFO - [34mlr: 1.5625e-06 gnorm: 126.00 [35m[ 0:14:42<10 days, 10:52:12][39m
|
| 297 |
+
[titan] 2026-01-06 20:32:40,204 - root - INFO - [31mstep: 4 [32mloss: 14.2932 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,585 [36mtflops: 145.01 [35mmfu: 46.48%[39m
|
| 298 |
+
[titan] 2026-01-06 20:32:40,205 - root - INFO - [34mlr: 1.9531e-06 gnorm: 125.50 [35m[ 0:15:24<8 days, 4:54:04][39m
|
| 299 |
+
[titan] 2026-01-06 20:33:21,589 - root - INFO - [31mstep: 5 [32mloss: 14.2679 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,584 [36mtflops: 144.89 [35mmfu: 46.44%[39m
|
| 300 |
+
[titan] 2026-01-06 20:33:21,590 - root - INFO - [34mlr: 2.3438e-06 gnorm: 123.50 [35m[ 0:16:05<6 days, 20:31:15][39m
|
| 301 |
+
[titan] 2026-01-06 20:34:03,035 - root - INFO - [31mstep: 6 [32mloss: 13.9921 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,581 [36mtflops: 144.67 [35mmfu: 46.37%[39m
|
| 302 |
+
[titan] 2026-01-06 20:34:03,035 - root - INFO - [34mlr: 2.7344e-06 gnorm: 117.50 [35m[ 0:16:47<5 days, 22:56:20][39m
|
| 303 |
+
[titan] 2026-01-06 20:34:44,524 - root - INFO - [31mstep: 7 [32mloss: 13.8102 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,580 [36mtflops: 144.52 [35mmfu: 46.32%[39m
|
| 304 |
+
[titan] 2026-01-06 20:34:44,524 - root - INFO - [34mlr: 3.1250e-06 gnorm: 112.50 [35m[ 0:17:28<5 days, 7:31:31][39m
|
| 305 |
+
[titan] 2026-01-06 20:35:25,989 - root - INFO - [31mstep: 8 [32mloss: 13.5609 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,581 [36mtflops: 144.60 [35mmfu: 46.35%[39m
|
| 306 |
+
[titan] 2026-01-06 20:35:25,990 - root - INFO - [34mlr: 3.5156e-06 gnorm: 106.50 [35m[ 0:18:09<4 days, 19:57:35][39m
|
| 307 |
+
[titan] 2026-01-06 20:36:07,480 - root - INFO - [31mstep: 9 [32mloss: 13.3683 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,580 [36mtflops: 144.52 [35mmfu: 46.32%[39m
|
| 308 |
+
[titan] 2026-01-06 20:36:07,480 - root - INFO - [34mlr: 3.9063e-06 gnorm: 101.00 [35m[ 0:18:51<4 days, 10:57:50][39m
|
| 309 |
+
[titan] 2026-01-06 20:36:48,975 - root - INFO - [31mstep: 10 [32mloss: 13.1018 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.50 [35mmfu: 46.32%[39m
|
| 310 |
+
[titan] 2026-01-06 20:36:48,975 - root - INFO - [34mlr: 4.2969e-06 gnorm: 94.00 [35m[ 0:19:32<4 days, 3:45:55][39m
|
| 311 |
+
[titan] 2026-01-06 20:37:30,471 - root - INFO - [31mstep: 11 [32mloss: 12.5407 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.50 [35mmfu: 46.31%[39m
|
| 312 |
+
[titan] 2026-01-06 20:37:30,471 - root - INFO - [34mlr: 4.6875e-06 gnorm: 82.00 [35m[ 0:20:14<3 days, 21:52:25][39m
|
| 313 |
+
[titan] 2026-01-06 20:38:11,960 - root - INFO - [31mstep: 12 [32mloss: 12.0106 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,580 [36mtflops: 144.52 [35mmfu: 46.32%[39m
|
| 314 |
+
[titan] 2026-01-06 20:38:11,961 - root - INFO - [34mlr: 5.0781e-06 gnorm: 71.50 [35m[ 0:20:55<3 days, 16:57:42][39m
|
| 315 |
+
[titan] 2026-01-06 20:38:53,462 - root - INFO - [31mstep: 13 [32mloss: 11.5957 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.48 [35mmfu: 46.31%[39m
|
| 316 |
+
[titan] 2026-01-06 20:38:53,463 - root - INFO - [34mlr: 5.4687e-06 gnorm: 68.00 [35m[ 0:21:37<3 days, 12:48:15][39m
|
| 317 |
+
[titan] 2026-01-06 20:39:34,955 - root - INFO - [31mstep: 14 [32mloss: 11.2380 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.51 [35mmfu: 46.32%[39m
|
| 318 |
+
[titan] 2026-01-06 20:39:34,955 - root - INFO - [34mlr: 5.8594e-06 gnorm: 63.25 [35m[ 0:22:18<3 days, 9:14:19][39m
|
| 319 |
+
[titan] 2026-01-06 20:40:16,456 - root - INFO - [31mstep: 15 [32mloss: 10.9153 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.48 [35mmfu: 46.31%[39m
|
| 320 |
+
[titan] 2026-01-06 20:40:16,457 - root - INFO - [34mlr: 6.2500e-06 gnorm: 55.50 [35m[ 0:23:00<3 days, 6:08:50][39m
|
| 321 |
+
[titan] 2026-01-06 20:40:57,973 - root - INFO - [31mstep: 16 [32mloss: 10.6864 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.42 [35mmfu: 46.29%[39m
|
| 322 |
+
[titan] 2026-01-06 20:40:57,974 - root - INFO - [34mlr: 6.6406e-06 gnorm: 57.00 [35m[ 0:23:41<3 days, 3:26:30][39m
|
| 323 |
+
[titan] 2026-01-06 20:40:57,974 - root - INFO - Saving the checkpoint (or staging if async is enabled).
|
| 324 |
+
[titan] 2026-01-06 20:41:27,071 - root - INFO - [GC] GC collection invoked by checkpointer. 0.63 seconds.
|
| 325 |
+
[titan] 2026-01-06 20:41:27,071 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 29.10 seconds.
|
| 326 |
+
[titan] 2026-01-06 20:42:08,985 - root - INFO - [31mstep: 17 [32mloss: 10.3828 [33mmemory: 71.94GiB(90.77%) [34mtps: 923 [36mtflops: 84.44 [35mmfu: 27.06%[39m
|
| 327 |
+
[titan] 2026-01-06 20:42:08,986 - root - INFO - [34mlr: 7.0313e-06 gnorm: 42.50 [35m[ 0:24:52<3 days, 2:31:32][39m
|
| 328 |
+
[titan] 2026-01-06 20:42:50,422 - root - INFO - [31mstep: 18 [32mloss: 10.1659 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,582 [36mtflops: 144.71 [35mmfu: 46.38%[39m
|
| 329 |
+
[titan] 2026-01-06 20:42:50,422 - root - INFO - [34mlr: 7.4219e-06 gnorm: 32.50 [35m[ 0:25:34<3 days, 0:18:54][39m
|
| 330 |
+
[titan] 2026-01-06 20:43:31,924 - root - INFO - [31mstep: 19 [32mloss: 9.9749 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.48 [35mmfu: 46.31%[39m
|
| 331 |
+
[titan] 2026-01-06 20:43:31,925 - root - INFO - [34mlr: 7.8125e-06 gnorm: 26.88 [35m[ 0:26:15<2 days, 22:20:20][39m
|
| 332 |
+
[titan] 2026-01-06 20:44:13,451 - root - INFO - [31mstep: 20 [32mloss: 9.8084 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.39 [35mmfu: 46.28%[39m
|
| 333 |
+
[titan] 2026-01-06 20:44:13,451 - root - INFO - [34mlr: 8.2031e-06 gnorm: 25.62 [35m[ 0:26:57<2 days, 20:33:38][39m
|
| 334 |
+
[titan] 2026-01-06 20:44:54,968 - root - INFO - [31mstep: 21 [32mloss: 9.6201 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.43 [35mmfu: 46.29%[39m
|
| 335 |
+
[titan] 2026-01-06 20:44:54,968 - root - INFO - [34mlr: 8.5938e-06 gnorm: 26.88 [35m[ 0:27:38<2 days, 18:56:59][39m
|
| 336 |
+
[titan] 2026-01-06 20:45:36,491 - root - INFO - [31mstep: 22 [32mloss: 9.4905 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.40 [35mmfu: 46.28%[39m
|
| 337 |
+
[titan] 2026-01-06 20:45:36,491 - root - INFO - [34mlr: 8.9844e-06 gnorm: 25.50 [35m[ 0:28:20<2 days, 17:29:05][39m
|
| 338 |
+
[titan] 2026-01-06 20:46:18,035 - root - INFO - [31mstep: 23 [32mloss: 9.2526 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.33 [35mmfu: 46.26%[39m
|
| 339 |
+
[titan] 2026-01-06 20:46:18,035 - root - INFO - [34mlr: 9.3750e-06 gnorm: 19.12 [35m[ 0:29:02<2 days, 16:08:48][39m
|
| 340 |
+
[titan] 2026-01-06 20:46:59,563 - root - INFO - [31mstep: 24 [32mloss: 9.0528 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.39 [35mmfu: 46.28%[39m
|
| 341 |
+
[titan] 2026-01-06 20:46:59,563 - root - INFO - [34mlr: 9.7656e-06 gnorm: 17.00 [35m[ 0:29:43<2 days, 14:55:08][39m
|
| 342 |
+
[titan] 2026-01-06 20:47:41,099 - root - INFO - [31mstep: 25 [32mloss: 8.8601 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.36 [35mmfu: 46.27%[39m
|
| 343 |
+
[titan] 2026-01-06 20:47:41,099 - root - INFO - [34mlr: 1.0156e-05 gnorm: 14.06 [35m[ 0:30:25<2 days, 13:47:18][39m
|
| 344 |
+
[titan] 2026-01-06 20:48:22,630 - root - INFO - [31mstep: 26 [32mloss: 8.7360 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.38 [35mmfu: 46.27%[39m
|
| 345 |
+
[titan] 2026-01-06 20:48:22,630 - root - INFO - [34mlr: 1.0547e-05 gnorm: 15.44 [35m[ 0:31:06<2 days, 12:44:38][39m
|
| 346 |
+
[titan] 2026-01-06 20:49:04,178 - root - INFO - [31mstep: 27 [32mloss: 8.6182 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.31 [35mmfu: 46.25%[39m
|
| 347 |
+
[titan] 2026-01-06 20:49:04,179 - root - INFO - [34mlr: 1.0937e-05 gnorm: 10.25 [35m[ 0:31:48<2 days, 11:46:36][39m
|
| 348 |
+
[titan] 2026-01-06 20:49:45,725 - root - INFO - [31mstep: 28 [32mloss: 8.5142 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.32 [35mmfu: 46.26%[39m
|
| 349 |
+
[titan] 2026-01-06 20:49:45,725 - root - INFO - [34mlr: 1.1328e-05 gnorm: 9.00 [35m[ 0:32:29<2 days, 10:52:38][39m
|
| 350 |
+
[titan] 2026-01-06 20:50:27,274 - root - INFO - [31mstep: 29 [32mloss: 8.4770 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.31 [35mmfu: 46.25%[39m
|
| 351 |
+
[titan] 2026-01-06 20:50:27,274 - root - INFO - [34mlr: 1.1719e-05 gnorm: 9.44 [35m[ 0:33:11<2 days, 10:02:22][39m
|
| 352 |
+
[titan] 2026-01-06 20:51:08,813 - root - INFO - [31mstep: 30 [32mloss: 8.3888 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.35 [35mmfu: 46.27%[39m
|
| 353 |
+
[titan] 2026-01-06 20:51:08,813 - root - INFO - [34mlr: 1.2109e-05 gnorm: 7.06 [35m[ 0:33:52<2 days, 9:15:23][39m
|
| 354 |
+
[titan] 2026-01-06 20:51:50,370 - root - INFO - [31mstep: 31 [32mloss: 8.3098 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.29 [35mmfu: 46.25%[39m
|
| 355 |
+
[titan] 2026-01-06 20:51:50,370 - root - INFO - [34mlr: 1.2500e-05 gnorm: 5.38 [35m[ 0:34:34<2 days, 8:31:25][39m
|
| 356 |
+
[titan] 2026-01-06 20:52:31,910 - root - INFO - [31mstep: 32 [32mloss: 8.2507 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.35 [35mmfu: 46.26%[39m
|
| 357 |
+
[titan] 2026-01-06 20:52:31,910 - root - INFO - [34mlr: 1.2891e-05 gnorm: 6.97 [35m[ 0:35:15<2 days, 7:50:07][39m
|
| 358 |
+
[titan] 2026-01-06 20:52:31,910 - root - INFO - Saving the checkpoint (or staging if async is enabled).
|
| 359 |
+
[titan] 2026-01-06 20:52:52,190 - root - INFO - [GC] GC collection invoked by checkpointer. 0.20 seconds.
|
| 360 |
+
[titan] 2026-01-06 20:52:52,190 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 20.28 seconds.
|
| 361 |
+
[titan] 2026-01-06 20:53:33,590 - root - INFO - [31mstep: 33 [32mloss: 8.1782 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,063 [36mtflops: 97.21 [35mmfu: 31.16%[39m
|
| 362 |
+
[titan] 2026-01-06 20:53:33,591 - root - INFO - [34mlr: 1.3281e-05 gnorm: 4.94 [35m[ 0:36:17<2 days, 7:42:12][39m
|
| 363 |
+
[titan] 2026-01-06 20:54:15,059 - root - INFO - [31mstep: 34 [32mloss: 8.1399 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,580 [36mtflops: 144.59 [35mmfu: 46.34%[39m
|
| 364 |
+
[titan] 2026-01-06 20:54:15,059 - root - INFO - [34mlr: 1.3672e-05 gnorm: 4.62 [35m[ 0:36:59<2 days, 7:04:35][39m
|
| 365 |
+
[titan] 2026-01-06 20:54:56,546 - root - INFO - [31mstep: 35 [32mloss: 8.1046 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,580 [36mtflops: 144.53 [35mmfu: 46.32%[39m
|
| 366 |
+
[titan] 2026-01-06 20:54:56,546 - root - INFO - [34mlr: 1.4063e-05 gnorm: 4.69 [35m[ 0:37:40<2 days, 6:29:07][39m
|
| 367 |
+
[titan] 2026-01-06 20:55:38,070 - root - INFO - [31mstep: 36 [32mloss: 8.0122 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.40 [35mmfu: 46.28%[39m
|
| 368 |
+
[titan] 2026-01-06 20:55:38,070 - root - INFO - [34mlr: 1.4453e-05 gnorm: 2.75 [35m[ 0:38:22<2 days, 5:55:37][39m
|
| 369 |
+
[titan] 2026-01-06 20:56:19,603 - root - INFO - [31mstep: 37 [32mloss: 8.0874 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 370 |
+
[titan] 2026-01-06 20:56:19,603 - root - INFO - [34mlr: 1.4844e-05 gnorm: 4.84 [35m[ 0:39:03<2 days, 5:23:55][39m
|
| 371 |
+
[titan] 2026-01-06 20:57:01,138 - root - INFO - [31mstep: 38 [32mloss: 8.0173 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 372 |
+
[titan] 2026-01-06 20:57:01,138 - root - INFO - [34mlr: 1.5234e-05 gnorm: 3.98 [35m[ 0:39:45<2 days, 4:53:51][39m
|
| 373 |
+
[titan] 2026-01-06 20:57:42,670 - root - INFO - [31mstep: 39 [32mloss: 8.0002 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 374 |
+
[titan] 2026-01-06 20:57:42,671 - root - INFO - [34mlr: 1.5625e-05 gnorm: 3.81 [35m[ 0:40:26<2 days, 4:25:16][39m
|
| 375 |
+
[titan] 2026-01-06 20:58:24,204 - root - INFO - [31mstep: 40 [32mloss: 7.9606 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 376 |
+
[titan] 2026-01-06 20:58:24,204 - root - INFO - [34mlr: 1.6016e-05 gnorm: 2.86 [35m[ 0:41:08<2 days, 3:58:06][39m
|
| 377 |
+
[titan] 2026-01-06 20:59:05,739 - root - INFO - [31mstep: 41 [32mloss: 7.9773 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.36 [35mmfu: 46.27%[39m
|
| 378 |
+
[titan] 2026-01-06 20:59:05,739 - root - INFO - [34mlr: 1.6406e-05 gnorm: 3.56 [35m[ 0:41:49<2 days, 3:32:13][39m
|
| 379 |
+
[titan] 2026-01-06 20:59:47,255 - root - INFO - [31mstep: 42 [32mloss: 7.9890 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.43 [35mmfu: 46.29%[39m
|
| 380 |
+
[titan] 2026-01-06 20:59:47,256 - root - INFO - [34mlr: 1.6797e-05 gnorm: 4.75 [35m[ 0:42:31<2 days, 3:07:31][39m
|
| 381 |
+
[titan] 2026-01-06 21:00:28,788 - root - INFO - [31mstep: 43 [32mloss: 7.9018 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 382 |
+
[titan] 2026-01-06 21:00:28,789 - root - INFO - [34mlr: 1.7188e-05 gnorm: 3.48 [35m[ 0:43:12<2 days, 2:43:57][39m
|
| 383 |
+
[titan] 2026-01-06 21:01:10,328 - root - INFO - [31mstep: 44 [32mloss: 7.8441 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.35 [35mmfu: 46.26%[39m
|
| 384 |
+
[titan] 2026-01-06 21:01:10,328 - root - INFO - [34mlr: 1.7578e-05 gnorm: 3.89 [35m[ 0:43:54<2 days, 2:21:26][39m
|
| 385 |
+
[titan] 2026-01-06 21:01:51,869 - root - INFO - [31mstep: 45 [32mloss: 7.8679 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.34 [35mmfu: 46.26%[39m
|
| 386 |
+
[titan] 2026-01-06 21:01:51,869 - root - INFO - [34mlr: 1.7969e-05 gnorm: 6.41 [35m[ 0:44:35<2 days, 1:59:53][39m
|
| 387 |
+
[titan] 2026-01-06 21:02:33,408 - root - INFO - [31mstep: 46 [32mloss: 7.7830 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.35 [35mmfu: 46.26%[39m
|
| 388 |
+
[titan] 2026-01-06 21:02:33,408 - root - INFO - [34mlr: 1.8359e-05 gnorm: 3.52 [35m[ 0:45:17<2 days, 1:39:15][39m
|
| 389 |
+
[titan] 2026-01-06 21:03:14,961 - root - INFO - [31mstep: 47 [32mloss: 7.8372 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.30 [35mmfu: 46.25%[39m
|
| 390 |
+
[titan] 2026-01-06 21:03:14,961 - root - INFO - [34mlr: 1.8750e-05 gnorm: 2.22 [35m[ 0:45:58<2 days, 1:19:28][39m
|
| 391 |
+
[titan] 2026-01-06 21:03:56,497 - root - INFO - [31mstep: 48 [32mloss: 7.8147 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.36 [35mmfu: 46.27%[39m
|
| 392 |
+
[titan] 2026-01-06 21:03:56,497 - root - INFO - [34mlr: 1.9141e-05 gnorm: 3.70 [35m[ 0:46:40<2 days, 1:00:28][39m
|
| 393 |
+
[titan] 2026-01-06 21:03:56,497 - root - INFO - Saving the checkpoint (or staging if async is enabled).
|
| 394 |
+
[titan] 2026-01-06 21:04:16,564 - root - INFO - [GC] GC collection invoked by checkpointer. 0.18 seconds.
|
| 395 |
+
[titan] 2026-01-06 21:04:16,564 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 20.07 seconds.
|
| 396 |
+
[titan] 2026-01-06 21:04:57,970 - root - INFO - [31mstep: 49 [32mloss: 7.6970 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,066 [36mtflops: 97.54 [35mmfu: 31.26%[39m
|
| 397 |
+
[titan] 2026-01-06 21:04:57,970 - root - INFO - [34mlr: 1.9531e-05 gnorm: 5.28 [35m[ 0:47:41<2 days, 1:02:43][39m
|
| 398 |
+
[titan] 2026-01-06 21:04:57,981 - root - INFO - [GC] Peforming periodical GC collection. 0.01 seconds.
|
| 399 |
+
[titan] 2026-01-06 21:05:39,421 - root - INFO - [31mstep: 50 [32mloss: 7.7536 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,581 [36mtflops: 144.65 [35mmfu: 46.36%[39m
|
| 400 |
+
[titan] 2026-01-06 21:05:39,421 - root - INFO - [34mlr: 1.9922e-05 gnorm: 4.06 [35m[ 0:48:23<2 days, 0:44:39][39m
|
| 401 |
+
[titan] 2026-01-06 21:06:20,891 - root - INFO - [31mstep: 51 [32mloss: 7.7578 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,580 [36mtflops: 144.59 [35mmfu: 46.34%[39m
|
| 402 |
+
[titan] 2026-01-06 21:06:20,891 - root - INFO - [34mlr: 2.0313e-05 gnorm: 5.03 [35m[ 0:49:04<2 days, 0:27:18][39m
|
| 403 |
+
[titan] 2026-01-06 21:07:02,402 - root - INFO - [31mstep: 52 [32mloss: 7.7586 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.45 [35mmfu: 46.30%[39m
|
| 404 |
+
[titan] 2026-01-06 21:07:02,402 - root - INFO - [34mlr: 2.0703e-05 gnorm: 2.52 [35m[ 0:49:46<2 days, 0:10:38][39m
|
| 405 |
+
[titan] 2026-01-06 21:07:43,930 - root - INFO - [31mstep: 53 [32mloss: 7.7823 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.38 [35mmfu: 46.28%[39m
|
| 406 |
+
[titan] 2026-01-06 21:07:43,930 - root - INFO - [34mlr: 2.1094e-05 gnorm: 11.69 [35m[ 0:50:27<1 day, 23:54:34][39m
|
| 407 |
+
[titan] 2026-01-06 21:08:25,460 - root - INFO - [31mstep: 54 [32mloss: 7.7454 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.38 [35mmfu: 46.28%[39m
|
| 408 |
+
[titan] 2026-01-06 21:08:25,460 - root - INFO - [34mlr: 2.1484e-05 gnorm: 10.25 [35m[ 0:51:09<1 day, 23:39:05][39m
|
| 409 |
+
[titan] 2026-01-06 21:09:07,002 - root - INFO - [31mstep: 55 [32mloss: 7.6959 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.34 [35mmfu: 46.26%[39m
|
| 410 |
+
[titan] 2026-01-06 21:09:07,002 - root - INFO - [34mlr: 2.1875e-05 gnorm: 3.77 [35m[ 0:51:50<1 day, 23:24:09][39m
|
| 411 |
+
[titan] 2026-01-06 21:09:48,535 - root - INFO - [31mstep: 56 [32mloss: 7.7100 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 412 |
+
[titan] 2026-01-06 21:09:48,536 - root - INFO - [34mlr: 2.2266e-05 gnorm: 5.50 [35m[ 0:52:32<1 day, 23:09:43][39m
|
| 413 |
+
[titan] 2026-01-06 21:10:30,084 - root - INFO - [31mstep: 57 [32mloss: 7.6427 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.32 [35mmfu: 46.26%[39m
|
| 414 |
+
[titan] 2026-01-06 21:10:30,084 - root - INFO - [34mlr: 2.2656e-05 gnorm: 3.45 [35m[ 0:53:14<1 day, 22:55:47][39m
|
| 415 |
+
[titan] 2026-01-06 21:11:11,627 - root - INFO - [31mstep: 58 [32mloss: 7.7081 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.33 [35mmfu: 46.26%[39m
|
| 416 |
+
[titan] 2026-01-06 21:11:11,628 - root - INFO - [34mlr: 2.3047e-05 gnorm: 7.88 [35m[ 0:53:55<1 day, 22:42:18][39m
|
| 417 |
+
[titan] 2026-01-06 21:11:53,169 - root - INFO - [31mstep: 59 [32mloss: 7.6955 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.34 [35mmfu: 46.26%[39m
|
| 418 |
+
[titan] 2026-01-06 21:11:53,170 - root - INFO - [34mlr: 2.3438e-05 gnorm: 7.16 [35m[ 0:54:37<1 day, 22:29:15][39m
|
| 419 |
+
[titan] 2026-01-06 21:12:34,708 - root - INFO - [31mstep: 60 [32mloss: 7.6458 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.35 [35mmfu: 46.27%[39m
|
| 420 |
+
[titan] 2026-01-06 21:12:34,708 - root - INFO - [34mlr: 2.3828e-05 gnorm: 3.22 [35m[ 0:55:18<1 day, 22:16:36][39m
|
| 421 |
+
[titan] 2026-01-06 21:13:16,244 - root - INFO - [31mstep: 61 [32mloss: 7.6709 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.36 [35mmfu: 46.27%[39m
|
| 422 |
+
[titan] 2026-01-06 21:13:16,244 - root - INFO - [34mlr: 2.4219e-05 gnorm: 7.56 [35m[ 0:56:00<1 day, 22:04:21][39m
|
| 423 |
+
[titan] 2026-01-06 21:13:57,793 - root - INFO - [31mstep: 62 [32mloss: 7.6777 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.31 [35mmfu: 46.25%[39m
|
| 424 |
+
[titan] 2026-01-06 21:13:57,793 - root - INFO - [34mlr: 2.4609e-05 gnorm: 5.00 [35m[ 0:56:41<1 day, 21:52:28][39m
|
| 425 |
+
[titan] 2026-01-06 21:14:39,339 - root - INFO - [31mstep: 63 [32mloss: 7.6421 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.32 [35mmfu: 46.26%[39m
|
| 426 |
+
[titan] 2026-01-06 21:14:39,340 - root - INFO - [34mlr: 2.5000e-05 gnorm: 6.81 [35m[ 0:57:23<1 day, 21:40:57][39m
|
| 427 |
+
[titan] 2026-01-06 21:15:20,872 - root - INFO - [31mstep: 64 [32mloss: 7.6401 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 428 |
+
[titan] 2026-01-06 21:15:20,872 - root - INFO - [34mlr: 2.5391e-05 gnorm: 6.72 [35m[ 0:58:04<1 day, 21:29:46][39m
|
| 429 |
+
[titan] 2026-01-06 21:15:20,872 - root - INFO - Saving the checkpoint (or staging if async is enabled).
|
| 430 |
+
[titan] 2026-01-06 21:15:41,928 - root - INFO - [GC] GC collection invoked by checkpointer. 0.17 seconds.
|
| 431 |
+
[titan] 2026-01-06 21:15:41,928 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 21.06 seconds.
|
| 432 |
+
[titan] 2026-01-06 21:16:23,249 - root - INFO - [31mstep: 65 [32mloss: 7.6475 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,051 [36mtflops: 96.13 [35mmfu: 30.81%[39m
|
| 433 |
+
[titan] 2026-01-06 21:16:23,249 - root - INFO - [34mlr: 2.5781e-05 gnorm: 5.00 [35m[ 0:59:07<1 day, 21:34:58][39m
|
| 434 |
+
[titan] 2026-01-06 21:17:04,689 - root - INFO - [31mstep: 66 [32mloss: 7.7008 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,581 [36mtflops: 144.69 [35mmfu: 46.38%[39m
|
| 435 |
+
[titan] 2026-01-06 21:17:04,689 - root - INFO - [34mlr: 2.6172e-05 gnorm: 9.69 [35m[ 0:59:48<1 day, 21:24:05][39m
|
| 436 |
+
[titan] 2026-01-06 21:17:46,153 - root - INFO - [31mstep: 67 [32mloss: 7.6772 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,581 [36mtflops: 144.61 [35mmfu: 46.35%[39m
|
| 437 |
+
[titan] 2026-01-06 21:17:46,153 - root - INFO - [34mlr: 2.6563e-05 gnorm: 8.06 [35m[ 1:00:30<1 day, 21:13:32][39m
|
| 438 |
+
[titan] 2026-01-06 21:18:27,650 - root - INFO - [31mstep: 68 [32mloss: 7.6251 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.49 [35mmfu: 46.31%[39m
|
| 439 |
+
[titan] 2026-01-06 21:18:27,651 - root - INFO - [34mlr: 2.6953e-05 gnorm: 7.88 [35m[ 1:01:11<1 day, 21:03:18][39m
|
| 440 |
+
[titan] 2026-01-06 21:19:09,166 - root - INFO - [31mstep: 69 [32mloss: 7.6183 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.43 [35mmfu: 46.29%[39m
|
| 441 |
+
[titan] 2026-01-06 21:19:09,166 - root - INFO - [34mlr: 2.7344e-05 gnorm: 4.00 [35m[ 1:01:53<1 day, 20:53:20][39m
|
| 442 |
+
[titan] 2026-01-06 21:19:50,686 - root - INFO - [31mstep: 70 [32mloss: 7.6535 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.41 [35mmfu: 46.29%[39m
|
| 443 |
+
[titan] 2026-01-06 21:19:50,686 - root - INFO - [34mlr: 2.7734e-05 gnorm: 17.75 [35m[ 1:02:34<1 day, 20:43:39][39m
|
| 444 |
+
[titan] 2026-01-06 21:20:32,220 - root - INFO - [31mstep: 71 [32mloss: 7.6713 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 445 |
+
[titan] 2026-01-06 21:20:32,221 - root - INFO - [34mlr: 2.8125e-05 gnorm: 15.69 [35m[ 1:03:16<1 day, 20:34:14][39m
|
| 446 |
+
[titan] 2026-01-06 21:21:13,759 - root - INFO - [31mstep: 72 [32mloss: 7.5969 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.35 [35mmfu: 46.27%[39m
|
| 447 |
+
[titan] 2026-01-06 21:21:13,759 - root - INFO - [34mlr: 2.8516e-05 gnorm: 5.00 [35m[ 1:03:57<1 day, 20:25:04][39m
|
| 448 |
+
[titan] 2026-01-06 21:21:55,296 - root - INFO - [31mstep: 73 [32mloss: 7.6514 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.35 [35mmfu: 46.27%[39m
|
| 449 |
+
[titan] 2026-01-06 21:21:55,296 - root - INFO - [34mlr: 2.8906e-05 gnorm: 7.84 [35m[ 1:04:39<1 day, 20:16:07][39m
|
| 450 |
+
[titan] 2026-01-06 21:22:36,834 - root - INFO - [31mstep: 74 [32mloss: 7.6118 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.35 [35mmfu: 46.27%[39m
|
| 451 |
+
[titan] 2026-01-06 21:22:36,834 - root - INFO - [34mlr: 2.9297e-05 gnorm: 5.53 [35m[ 1:05:20<1 day, 20:07:24][39m
|
| 452 |
+
[titan] 2026-01-06 21:23:18,373 - root - INFO - [31mstep: 75 [32mloss: 7.6545 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.35 [35mmfu: 46.26%[39m
|
| 453 |
+
[titan] 2026-01-06 21:23:18,373 - root - INFO - [34mlr: 2.9687e-05 gnorm: 14.88 [35m[ 1:06:02<1 day, 19:58:54][39m
|
| 454 |
+
[titan] 2026-01-06 21:23:59,908 - root - INFO - [31mstep: 76 [32mloss: 7.6091 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.36 [35mmfu: 46.27%[39m
|
| 455 |
+
[titan] 2026-01-06 21:23:59,909 - root - INFO - [34mlr: 3.0078e-05 gnorm: 15.25 [35m[ 1:06:43<1 day, 19:50:35][39m
|
| 456 |
+
[titan] 2026-01-06 21:24:41,441 - root - INFO - [31mstep: 77 [32mloss: 7.5815 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 457 |
+
[titan] 2026-01-06 21:24:41,441 - root - INFO - [34mlr: 3.0469e-05 gnorm: 4.84 [35m[ 1:07:25<1 day, 19:42:29][39m
|
| 458 |
+
[titan] 2026-01-06 21:25:22,982 - root - INFO - [31mstep: 78 [32mloss: 7.6119 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.34 [35mmfu: 46.26%[39m
|
| 459 |
+
[titan] 2026-01-06 21:25:22,983 - root - INFO - [34mlr: 3.0859e-05 gnorm: 9.06 [35m[ 1:08:06<1 day, 19:34:34][39m
|
| 460 |
+
[titan] 2026-01-06 21:26:04,516 - root - INFO - [31mstep: 79 [32mloss: 7.6418 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 461 |
+
[titan] 2026-01-06 21:26:04,516 - root - INFO - [34mlr: 3.1250e-05 gnorm: 8.25 [35m[ 1:08:48<1 day, 19:26:50][39m
|
| 462 |
+
[titan] 2026-01-06 21:26:46,049 - root - INFO - [31mstep: 80 [32mloss: 7.5575 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 463 |
+
[titan] 2026-01-06 21:26:46,050 - root - INFO - [34mlr: 3.1641e-05 gnorm: 6.97 [35m[ 1:09:29<1 day, 19:19:17][39m
|
| 464 |
+
[titan] 2026-01-06 21:26:46,050 - root - INFO - Saving the checkpoint (or staging if async is enabled).
|
| 465 |
+
[titan] 2026-01-06 21:27:08,316 - root - INFO - [GC] GC collection invoked by checkpointer. 0.19 seconds.
|
| 466 |
+
[titan] 2026-01-06 21:27:08,316 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 22.27 seconds.
|
| 467 |
+
[titan] 2026-01-06 21:27:49,686 - root - INFO - [31mstep: 81 [32mloss: 7.6005 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,030 [36mtflops: 94.22 [35mmfu: 30.20%[39m
|
| 468 |
+
[titan] 2026-01-06 21:27:49,686 - root - INFO - [34mlr: 3.2031e-05 gnorm: 7.19 [35m[ 1:10:33<1 day, 19:25:30][39m
|
| 469 |
+
[titan] 2026-01-06 21:28:31,108 - root - INFO - [31mstep: 82 [32mloss: 7.5774 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,582 [36mtflops: 144.76 [35mmfu: 46.40%[39m
|
| 470 |
+
[titan] 2026-01-06 21:28:31,108 - root - INFO - [34mlr: 3.2422e-05 gnorm: 5.62 [35m[ 1:11:15<1 day, 19:18:02][39m
|
| 471 |
+
[titan] 2026-01-06 21:29:12,555 - root - INFO - [31mstep: 83 [32mloss: 7.6207 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,581 [36mtflops: 144.67 [35mmfu: 46.37%[39m
|
| 472 |
+
[titan] 2026-01-06 21:29:12,555 - root - INFO - [34mlr: 3.2813e-05 gnorm: 4.69 [35m[ 1:11:56<1 day, 19:10:45][39m
|
| 473 |
+
[titan] 2026-01-06 21:29:54,023 - root - INFO - [31mstep: 84 [32mloss: 7.5734 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,580 [36mtflops: 144.59 [35mmfu: 46.34%[39m
|
| 474 |
+
[titan] 2026-01-06 21:29:54,024 - root - INFO - [34mlr: 3.3203e-05 gnorm: 10.75 [35m[ 1:12:37<1 day, 19:03:38][39m
|
| 475 |
+
[titan] 2026-01-06 21:30:35,519 - root - INFO - [31mstep: 85 [32mloss: 7.5241 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.50 [35mmfu: 46.31%[39m
|
| 476 |
+
[titan] 2026-01-06 21:30:35,520 - root - INFO - [34mlr: 3.3594e-05 gnorm: 8.69 [35m[ 1:13:19<1 day, 18:56:42][39m
|
| 477 |
+
[titan] 2026-01-06 21:31:17,030 - root - INFO - [31mstep: 86 [32mloss: 7.5827 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.45 [35mmfu: 46.30%[39m
|
| 478 |
+
[titan] 2026-01-06 21:31:17,030 - root - INFO - [34mlr: 3.3984e-05 gnorm: 7.22 [35m[ 1:14:00<1 day, 18:49:54][39m
|
| 479 |
+
[titan] 2026-01-06 21:31:58,543 - root - INFO - [31mstep: 87 [32mloss: 7.5505 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.44 [35mmfu: 46.29%[39m
|
| 480 |
+
[titan] 2026-01-06 21:31:58,543 - root - INFO - [34mlr: 3.4375e-05 gnorm: 7.91 [35m[ 1:14:42<1 day, 18:43:15][39m
|
| 481 |
+
[titan] 2026-01-06 21:32:40,071 - root - INFO - [31mstep: 88 [32mloss: 7.5143 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.39 [35mmfu: 46.28%[39m
|
| 482 |
+
[titan] 2026-01-06 21:32:40,071 - root - INFO - [34mlr: 3.4766e-05 gnorm: 8.00 [35m[ 1:15:24<1 day, 18:36:44][39m
|
| 483 |
+
[titan] 2026-01-06 21:33:21,599 - root - INFO - [31mstep: 89 [32mloss: 7.5199 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.39 [35mmfu: 46.28%[39m
|
| 484 |
+
[titan] 2026-01-06 21:33:21,599 - root - INFO - [34mlr: 3.5156e-05 gnorm: 8.62 [35m[ 1:16:05<1 day, 18:30:22][39m
|
| 485 |
+
[titan] 2026-01-06 21:34:03,122 - root - INFO - [31mstep: 90 [32mloss: 7.4785 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.40 [35mmfu: 46.28%[39m
|
| 486 |
+
[titan] 2026-01-06 21:34:03,122 - root - INFO - [34mlr: 3.5547e-05 gnorm: 8.12 [35m[ 1:16:47<1 day, 18:24:07][39m
|
| 487 |
+
[titan] 2026-01-06 21:34:44,655 - root - INFO - [31mstep: 91 [32mloss: 7.5003 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 488 |
+
[titan] 2026-01-06 21:34:44,655 - root - INFO - [34mlr: 3.5937e-05 gnorm: 6.97 [35m[ 1:17:28<1 day, 18:17:59][39m
|
| 489 |
+
[titan] 2026-01-06 21:35:26,183 - root - INFO - [31mstep: 92 [32mloss: 7.5113 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.39 [35mmfu: 46.28%[39m
|
| 490 |
+
[titan] 2026-01-06 21:35:26,183 - root - INFO - [34mlr: 3.6328e-05 gnorm: 10.19 [35m[ 1:18:10<1 day, 18:11:58][39m
|
| 491 |
+
[titan] 2026-01-06 21:36:07,712 - root - INFO - [31mstep: 93 [32mloss: 7.4875 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.38 [35mmfu: 46.28%[39m
|
| 492 |
+
[titan] 2026-01-06 21:36:07,712 - root - INFO - [34mlr: 3.6719e-05 gnorm: 4.59 [35m[ 1:18:51<1 day, 18:06:05][39m
|
| 493 |
+
[titan] 2026-01-06 21:36:49,202 - root - INFO - [31mstep: 94 [32mloss: 7.8691 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,580 [36mtflops: 144.52 [35mmfu: 46.32%[39m
|
| 494 |
+
[titan] 2026-01-06 21:36:49,202 - root - INFO - [34mlr: 3.7109e-05 gnorm: 86.50 [35m[ 1:19:33<1 day, 18:00:16][39m
|
| 495 |
+
[titan] 2026-01-06 21:37:30,710 - root - INFO - [31mstep: 95 [32mloss: 7.7993 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.45 [35mmfu: 46.30%[39m
|
| 496 |
+
[titan] 2026-01-06 21:37:30,710 - root - INFO - [34mlr: 3.7500e-05 gnorm: 62.50 [35m[ 1:20:14<1 day, 17:54:35][39m
|
| 497 |
+
[titan] 2026-01-06 21:38:12,247 - root - INFO - [31mstep: 96 [32mloss: 7.6230 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.35 [35mmfu: 46.27%[39m
|
| 498 |
+
[titan] 2026-01-06 21:38:12,248 - root - INFO - [34mlr: 3.7891e-05 gnorm: 17.38 [35m[ 1:20:56<1 day, 17:49:01][39m
|
| 499 |
+
[titan] 2026-01-06 21:38:12,248 - root - INFO - Saving the checkpoint (or staging if async is enabled).
|
| 500 |
+
[titan] 2026-01-06 21:38:32,933 - root - INFO - [GC] GC collection invoked by checkpointer. 0.19 seconds.
|
| 501 |
+
[titan] 2026-01-06 21:38:32,933 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 20.69 seconds.
|
| 502 |
+
[titan] 2026-01-06 21:39:14,269 - root - INFO - [31mstep: 97 [32mloss: 7.5778 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,057 [36mtflops: 96.68 [35mmfu: 30.99%[39m
|
| 503 |
+
[titan] 2026-01-06 21:39:14,269 - root - INFO - [34mlr: 3.8281e-05 gnorm: 17.75 [35m[ 1:21:58<1 day, 17:54:01][39m
|
| 504 |
+
[titan] 2026-01-06 21:39:55,690 - root - INFO - [31mstep: 98 [32mloss: 7.5438 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,582 [36mtflops: 144.76 [35mmfu: 46.40%[39m
|
| 505 |
+
[titan] 2026-01-06 21:39:55,690 - root - INFO - [34mlr: 3.8672e-05 gnorm: 11.75 [35m[ 1:22:39<1 day, 17:48:29][39m
|
| 506 |
+
[titan] 2026-01-06 21:40:37,179 - root - INFO - [31mstep: 99 [32mloss: 7.5091 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,580 [36mtflops: 144.52 [35mmfu: 46.32%[39m
|
| 507 |
+
[titan] 2026-01-06 21:40:37,180 - root - INFO - [34mlr: 3.9063e-05 gnorm: 7.81 [35m[ 1:23:21<1 day, 17:43:04][39m
|
| 508 |
+
[titan] 2026-01-06 21:40:37,205 - root - INFO - [GC] Peforming periodical GC collection. 0.02 seconds.
|
| 509 |
+
[titan] 2026-01-06 21:41:18,706 - root - INFO - [31mstep: 100 [32mloss: 7.4961 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.39 [35mmfu: 46.28%[39m
|
| 510 |
+
[titan] 2026-01-06 21:41:18,706 - root - INFO - [34mlr: 3.9453e-05 gnorm: 7.59 [35m[ 1:24:02<1 day, 17:37:47][39m
|
| 511 |
+
[titan] 2026-01-06 21:42:00,228 - root - INFO - [31mstep: 101 [32mloss: 7.4848 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.41 [35mmfu: 46.28%[39m
|
| 512 |
+
[titan] 2026-01-06 21:42:00,228 - root - INFO - [34mlr: 3.9844e-05 gnorm: 5.97 [35m[ 1:24:44<1 day, 17:32:34][39m
|
| 513 |
+
[titan] 2026-01-06 21:42:41,739 - root - INFO - [31mstep: 102 [32mloss: 7.5118 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.45 [35mmfu: 46.30%[39m
|
| 514 |
+
[titan] 2026-01-06 21:42:41,739 - root - INFO - [34mlr: 4.0234e-05 gnorm: 8.06 [35m[ 1:25:25<1 day, 17:27:27][39m
|
| 515 |
+
[titan] 2026-01-06 21:43:23,265 - root - INFO - [31mstep: 103 [32mloss: 7.4788 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.39 [35mmfu: 46.28%[39m
|
| 516 |
+
[titan] 2026-01-06 21:43:23,265 - root - INFO - [34mlr: 4.0625e-05 gnorm: 10.06 [35m[ 1:26:07<1 day, 17:22:25][39m
|
| 517 |
+
[titan] 2026-01-06 21:44:04,786 - root - INFO - [31mstep: 104 [32mloss: 7.4560 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.41 [35mmfu: 46.29%[39m
|
| 518 |
+
[titan] 2026-01-06 21:44:04,786 - root - INFO - [34mlr: 4.1016e-05 gnorm: 9.50 [35m[ 1:26:48<1 day, 17:17:28][39m
|
| 519 |
+
[titan] 2026-01-06 21:44:46,319 - root - INFO - [31mstep: 105 [32mloss: 7.4534 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 520 |
+
[titan] 2026-01-06 21:44:46,319 - root - INFO - [34mlr: 4.1406e-05 gnorm: 8.44 [35m[ 1:27:30<1 day, 17:12:36][39m
|
| 521 |
+
[titan] 2026-01-06 21:45:27,838 - root - INFO - [31mstep: 106 [32mloss: 7.4770 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.42 [35mmfu: 46.29%[39m
|
| 522 |
+
[titan] 2026-01-06 21:45:27,838 - root - INFO - [34mlr: 4.1797e-05 gnorm: 10.56 [35m[ 1:28:11<1 day, 17:07:49][39m
|
| 523 |
+
[titan] 2026-01-06 21:46:09,374 - root - INFO - [31mstep: 107 [32mloss: 7.4382 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.36 [35mmfu: 46.27%[39m
|
| 524 |
+
[titan] 2026-01-06 21:46:09,374 - root - INFO - [34mlr: 4.2188e-05 gnorm: 13.69 [35m[ 1:28:53<1 day, 17:03:07][39m
|
| 525 |
+
[titan] 2026-01-06 21:46:50,902 - root - INFO - [31mstep: 108 [32mloss: 7.4561 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.39 [35mmfu: 46.28%[39m
|
| 526 |
+
[titan] 2026-01-06 21:46:50,902 - root - INFO - [34mlr: 4.2578e-05 gnorm: 8.69 [35m[ 1:29:34<1 day, 16:58:29][39m
|
| 527 |
+
[titan] 2026-01-06 21:47:32,443 - root - INFO - [31mstep: 109 [32mloss: 7.3967 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.34 [35mmfu: 46.26%[39m
|
| 528 |
+
[titan] 2026-01-06 21:47:32,443 - root - INFO - [34mlr: 4.2969e-05 gnorm: 7.31 [35m[ 1:30:16<1 day, 16:53:55][39m
|
| 529 |
+
[titan] 2026-01-06 21:48:13,976 - root - INFO - [31mstep: 110 [32mloss: 7.4334 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 530 |
+
[titan] 2026-01-06 21:48:13,976 - root - INFO - [34mlr: 4.3359e-05 gnorm: 25.38 [35m[ 1:30:57<1 day, 16:49:26][39m
|
| 531 |
+
[titan] 2026-01-06 21:48:55,511 - root - INFO - [31mstep: 111 [32mloss: 7.4360 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.36 [35mmfu: 46.27%[39m
|
| 532 |
+
[titan] 2026-01-06 21:48:55,511 - root - INFO - [34mlr: 4.3750e-05 gnorm: 10.44 [35m[ 1:31:39<1 day, 16:45:01][39m
|
| 533 |
+
[titan] 2026-01-06 21:49:37,059 - root - INFO - [31mstep: 112 [32mloss: 7.5123 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.32 [35mmfu: 46.26%[39m
|
| 534 |
+
[titan] 2026-01-06 21:49:37,059 - root - INFO - [34mlr: 4.4141e-05 gnorm: 16.88 [35m[ 1:32:20<1 day, 16:40:40][39m
|
| 535 |
+
[titan] 2026-01-06 21:49:37,059 - root - INFO - Saving the checkpoint (or staging if async is enabled).
|
| 536 |
+
[titan] 2026-01-06 21:49:59,574 - root - INFO - [GC] GC collection invoked by checkpointer. 0.14 seconds.
|
| 537 |
+
[titan] 2026-01-06 21:49:59,574 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 22.51 seconds.
|
| 538 |
+
[titan] 2026-01-06 21:50:40,891 - root - INFO - [31mstep: 113 [32mloss: 7.4803 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,027 [36mtflops: 93.93 [35mmfu: 30.11%[39m
|
| 539 |
+
[titan] 2026-01-06 21:50:40,892 - root - INFO - [34mlr: 4.4531e-05 gnorm: 13.06 [35m[ 1:33:24<1 day, 16:46:06][39m
|
| 540 |
+
[titan] 2026-01-06 21:51:22,305 - root - INFO - [31mstep: 114 [32mloss: 7.4859 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,582 [36mtflops: 144.79 [35mmfu: 46.41%[39m
|
| 541 |
+
[titan] 2026-01-06 21:51:22,305 - root - INFO - [34mlr: 4.4922e-05 gnorm: 16.50 [35m[ 1:34:06<1 day, 16:41:44][39m
|
| 542 |
+
[titan] 2026-01-06 21:52:03,747 - root - INFO - [31mstep: 115 [32mloss: 7.4151 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,581 [36mtflops: 144.68 [35mmfu: 46.37%[39m
|
| 543 |
+
[titan] 2026-01-06 21:52:03,748 - root - INFO - [34mlr: 4.5313e-05 gnorm: 13.94 [35m[ 1:34:47<1 day, 16:37:27][39m
|
| 544 |
+
[titan] 2026-01-06 21:52:45,252 - root - INFO - [31mstep: 116 [32mloss: 7.3814 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.47 [35mmfu: 46.30%[39m
|
| 545 |
+
[titan] 2026-01-06 21:52:45,252 - root - INFO - [34mlr: 4.5703e-05 gnorm: 11.69 [35m[ 1:35:29<1 day, 16:33:15][39m
|
| 546 |
+
[titan] 2026-01-06 21:53:26,760 - root - INFO - [31mstep: 117 [32mloss: 7.4033 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.46 [35mmfu: 46.30%[39m
|
| 547 |
+
[titan] 2026-01-06 21:53:26,760 - root - INFO - [34mlr: 4.6094e-05 gnorm: 9.31 [35m[ 1:36:10<1 day, 16:29:06][39m
|
| 548 |
+
[titan] 2026-01-06 21:54:08,279 - root - INFO - [31mstep: 118 [32mloss: 7.4721 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.42 [35mmfu: 46.29%[39m
|
| 549 |
+
[titan] 2026-01-06 21:54:08,279 - root - INFO - [34mlr: 4.6484e-05 gnorm: 20.88 [35m[ 1:36:52<1 day, 16:25:02][39m
|
| 550 |
+
[titan] 2026-01-06 21:54:49,813 - root - INFO - [31mstep: 119 [32mloss: 7.4258 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 551 |
+
[titan] 2026-01-06 21:54:49,813 - root - INFO - [34mlr: 4.6875e-05 gnorm: 16.62 [35m[ 1:37:33<1 day, 16:21:01][39m
|
| 552 |
+
[titan] 2026-01-06 21:55:31,360 - root - INFO - [31mstep: 120 [32mloss: 7.3951 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.32 [35mmfu: 46.26%[39m
|
| 553 |
+
[titan] 2026-01-06 21:55:31,360 - root - INFO - [34mlr: 4.7266e-05 gnorm: 11.38 [35m[ 1:38:15<1 day, 16:17:03][39m
|
| 554 |
+
[titan] 2026-01-06 21:56:12,904 - root - INFO - [31mstep: 121 [32mloss: 7.3984 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.33 [35mmfu: 46.26%[39m
|
| 555 |
+
[titan] 2026-01-06 21:56:12,904 - root - INFO - [34mlr: 4.7656e-05 gnorm: 10.19 [35m[ 1:38:56<1 day, 16:13:09][39m
|
| 556 |
+
[titan] 2026-01-06 21:56:54,444 - root - INFO - [31mstep: 122 [32mloss: 7.5098 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.34 [35mmfu: 46.26%[39m
|
| 557 |
+
[titan] 2026-01-06 21:56:54,444 - root - INFO - [34mlr: 4.8047e-05 gnorm: 19.38 [35m[ 1:39:38<1 day, 16:09:18][39m
|
| 558 |
+
[titan] 2026-01-06 21:57:35,983 - root - INFO - [31mstep: 123 [32mloss: 7.4071 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.35 [35mmfu: 46.27%[39m
|
| 559 |
+
[titan] 2026-01-06 21:57:35,983 - root - INFO - [34mlr: 4.8438e-05 gnorm: 13.25 [35m[ 1:40:19<1 day, 16:05:30][39m
|
| 560 |
+
[titan] 2026-01-06 21:58:17,525 - root - INFO - [31mstep: 124 [32mloss: 7.4271 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.34 [35mmfu: 46.26%[39m
|
| 561 |
+
[titan] 2026-01-06 21:58:17,525 - root - INFO - [34mlr: 4.8828e-05 gnorm: 11.88 [35m[ 1:41:01<1 day, 16:01:45][39m
|
| 562 |
+
[titan] 2026-01-06 21:58:59,075 - root - INFO - [31mstep: 125 [32mloss: 7.3603 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.31 [35mmfu: 46.25%[39m
|
| 563 |
+
[titan] 2026-01-06 21:58:59,075 - root - INFO - [34mlr: 4.9219e-05 gnorm: 11.50 [35m[ 1:41:42<1 day, 15:58:04][39m
|
| 564 |
+
[titan] 2026-01-06 21:59:40,618 - root - INFO - [31mstep: 126 [32mloss: 7.3625 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.33 [35mmfu: 46.26%[39m
|
| 565 |
+
[titan] 2026-01-06 21:59:40,619 - root - INFO - [34mlr: 4.9609e-05 gnorm: 9.88 [35m[ 1:42:24<1 day, 15:54:25][39m
|
| 566 |
+
[titan] 2026-01-06 22:00:22,155 - root - INFO - [31mstep: 127 [32mloss: 7.3691 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.36 [35mmfu: 46.27%[39m
|
| 567 |
+
[titan] 2026-01-06 22:00:22,156 - root - INFO - [34mlr: 5.0000e-05 gnorm: 11.88 [35m[ 1:43:06<1 day, 15:50:48][39m
|
| 568 |
+
[titan] 2026-01-06 22:01:03,694 - root - INFO - [31mstep: 128 [32mloss: 7.3331 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.35 [35mmfu: 46.27%[39m
|
| 569 |
+
[titan] 2026-01-06 22:01:03,694 - root - INFO - [34mlr: 5.0391e-05 gnorm: 11.56 [35m[ 1:43:47<1 day, 15:47:15][39m
|
| 570 |
+
[titan] 2026-01-06 22:01:03,694 - root - INFO - Saving the checkpoint (or staging if async is enabled).
|
| 571 |
+
[titan] 2026-01-06 22:01:24,075 - root - INFO - [GC] GC collection invoked by checkpointer. 0.21 seconds.
|
| 572 |
+
[titan] 2026-01-06 22:01:24,076 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 20.38 seconds.
|
| 573 |
+
[titan] 2026-01-06 22:02:05,453 - root - INFO - [31mstep: 129 [32mloss: 7.2878 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,061 [36mtflops: 97.09 [35mmfu: 31.12%[39m
|
| 574 |
+
[titan] 2026-01-06 22:02:05,454 - root - INFO - [34mlr: 5.0781e-05 gnorm: 6.16 [35m[ 1:44:49<1 day, 15:51:25][39m
|
| 575 |
+
[titan] 2026-01-06 22:02:46,875 - root - INFO - [31mstep: 130 [32mloss: 7.7017 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,582 [36mtflops: 144.76 [35mmfu: 46.40%[39m
|
| 576 |
+
[titan] 2026-01-06 22:02:46,876 - root - INFO - [34mlr: 5.1172e-05 gnorm: 70.00 [35m[ 1:45:30<1 day, 15:47:50][39m
|
| 577 |
+
[titan] 2026-01-06 22:03:28,339 - root - INFO - [31mstep: 131 [32mloss: 7.5220 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,581 [36mtflops: 144.61 [35mmfu: 46.35%[39m
|
| 578 |
+
[titan] 2026-01-06 22:03:28,339 - root - INFO - [34mlr: 5.1562e-05 gnorm: 44.75 [35m[ 1:46:12<1 day, 15:44:19][39m
|
| 579 |
+
[titan] 2026-01-06 22:04:09,859 - root - INFO - [31mstep: 132 [32mloss: 7.4566 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.42 [35mmfu: 46.29%[39m
|
| 580 |
+
[titan] 2026-01-06 22:04:09,859 - root - INFO - [34mlr: 5.1953e-05 gnorm: 13.50 [35m[ 1:46:53<1 day, 15:40:52][39m
|
| 581 |
+
[titan] 2026-01-06 22:04:51,387 - root - INFO - [31mstep: 133 [32mloss: 7.4026 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.39 [35mmfu: 46.28%[39m
|
| 582 |
+
[titan] 2026-01-06 22:04:51,387 - root - INFO - [34mlr: 5.2344e-05 gnorm: 10.12 [35m[ 1:47:35<1 day, 15:37:27][39m
|
| 583 |
+
[titan] 2026-01-06 22:05:32,919 - root - INFO - [31mstep: 134 [32mloss: 7.4092 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 584 |
+
[titan] 2026-01-06 22:05:32,920 - root - INFO - [34mlr: 5.2734e-05 gnorm: 14.88 [35m[ 1:48:16<1 day, 15:34:05][39m
|
| 585 |
+
[titan] 2026-01-06 22:06:14,471 - root - INFO - [31mstep: 135 [32mloss: 7.3827 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.30 [35mmfu: 46.25%[39m
|
| 586 |
+
[titan] 2026-01-06 22:06:14,471 - root - INFO - [34mlr: 5.3125e-05 gnorm: 18.88 [35m[ 1:48:58<1 day, 15:30:46][39m
|
| 587 |
+
[titan] 2026-01-06 22:06:56,027 - root - INFO - [31mstep: 136 [32mloss: 7.4021 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.29 [35mmfu: 46.25%[39m
|
| 588 |
+
[titan] 2026-01-06 22:06:56,027 - root - INFO - [34mlr: 5.3516e-05 gnorm: 12.81 [35m[ 1:49:39<1 day, 15:27:29][39m
|
| 589 |
+
[titan] 2026-01-06 22:07:37,581 - root - INFO - [31mstep: 137 [32mloss: 7.4064 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.30 [35mmfu: 46.25%[39m
|
| 590 |
+
[titan] 2026-01-06 22:07:37,581 - root - INFO - [34mlr: 5.3906e-05 gnorm: 7.19 [35m[ 1:50:21<1 day, 15:24:14][39m
|
| 591 |
+
[titan] 2026-01-06 22:08:19,129 - root - INFO - [31mstep: 138 [32mloss: 7.4774 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.32 [35mmfu: 46.26%[39m
|
| 592 |
+
[titan] 2026-01-06 22:08:19,129 - root - INFO - [34mlr: 5.4297e-05 gnorm: 22.62 [35m[ 1:51:03<1 day, 15:21:02][39m
|
| 593 |
+
[titan] 2026-01-06 22:09:00,688 - root - INFO - [31mstep: 139 [32mloss: 7.4281 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.28 [35mmfu: 46.24%[39m
|
| 594 |
+
[titan] 2026-01-06 22:09:00,688 - root - INFO - [34mlr: 5.4688e-05 gnorm: 11.00 [35m[ 1:51:44<1 day, 15:17:51][39m
|
| 595 |
+
[titan] 2026-01-06 22:09:42,228 - root - INFO - [31mstep: 140 [32mloss: 7.5633 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.34 [35mmfu: 46.26%[39m
|
| 596 |
+
[titan] 2026-01-06 22:09:42,228 - root - INFO - [34mlr: 5.5078e-05 gnorm: 19.75 [35m[ 1:52:26<1 day, 15:14:43][39m
|
| 597 |
+
[titan] 2026-01-06 22:10:23,790 - root - INFO - [31mstep: 141 [32mloss: 7.5423 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.27 [35mmfu: 46.24%[39m
|
| 598 |
+
[titan] 2026-01-06 22:10:23,790 - root - INFO - [34mlr: 5.5469e-05 gnorm: 17.25 [35m[ 1:53:07<1 day, 15:11:37][39m
|
| 599 |
+
[titan] 2026-01-06 22:11:05,349 - root - INFO - [31mstep: 142 [32mloss: 7.4047 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.28 [35mmfu: 46.24%[39m
|
| 600 |
+
[titan] 2026-01-06 22:11:05,350 - root - INFO - [34mlr: 5.5859e-05 gnorm: 9.94 [35m[ 1:53:49<1 day, 15:08:33][39m
|
| 601 |
+
[titan] 2026-01-06 22:11:46,904 - root - INFO - [31mstep: 143 [32mloss: 7.5261 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.29 [35mmfu: 46.25%[39m
|
| 602 |
+
[titan] 2026-01-06 22:11:46,905 - root - INFO - [34mlr: 5.6250e-05 gnorm: 25.75 [35m[ 1:54:30<1 day, 15:05:31][39m
|
| 603 |
+
[titan] 2026-01-06 22:12:28,460 - root - INFO - [31mstep: 144 [32mloss: 7.4217 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.29 [35mmfu: 46.25%[39m
|
| 604 |
+
[titan] 2026-01-06 22:12:28,461 - root - INFO - [34mlr: 5.6641e-05 gnorm: 18.00 [35m[ 1:55:12<1 day, 15:02:31][39m
|
| 605 |
+
[titan] 2026-01-06 22:12:28,461 - root - INFO - Saving the checkpoint (or staging if async is enabled).
|
| 606 |
+
[titan] 2026-01-06 22:12:50,173 - root - INFO - [GC] GC collection invoked by checkpointer. 0.19 seconds.
|
| 607 |
+
[titan] 2026-01-06 22:12:50,174 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 21.71 seconds.
|
| 608 |
+
[titan] 2026-01-06 22:13:31,510 - root - INFO - [31mstep: 145 [32mloss: 7.3958 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,039 [36mtflops: 95.10 [35mmfu: 30.48%[39m
|
| 609 |
+
[titan] 2026-01-06 22:13:31,510 - root - INFO - [34mlr: 5.7031e-05 gnorm: 11.69 [35m[ 1:56:15<1 day, 15:06:47][39m
|
| 610 |
+
[titan] 2026-01-06 22:14:12,944 - root - INFO - [31mstep: 146 [32mloss: 7.4073 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,582 [36mtflops: 144.71 [35mmfu: 46.38%[39m
|
| 611 |
+
[titan] 2026-01-06 22:14:12,944 - root - INFO - [34mlr: 5.7422e-05 gnorm: 11.25 [35m[ 1:56:56<1 day, 15:03:45][39m
|
| 612 |
+
[titan] 2026-01-06 22:14:54,370 - root - INFO - [31mstep: 147 [32mloss: 7.3301 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,582 [36mtflops: 144.74 [35mmfu: 46.39%[39m
|
| 613 |
+
[titan] 2026-01-06 22:14:54,371 - root - INFO - [34mlr: 5.7813e-05 gnorm: 7.34 [35m[ 1:57:38<1 day, 15:00:45][39m
|
| 614 |
+
[titan] 2026-01-06 22:15:35,825 - root - INFO - [31mstep: 148 [32mloss: 7.3624 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,581 [36mtflops: 144.64 [35mmfu: 46.36%[39m
|
| 615 |
+
[titan] 2026-01-06 22:15:35,825 - root - INFO - [34mlr: 5.8203e-05 gnorm: 17.38 [35m[ 1:58:19<1 day, 14:57:47][39m
|
| 616 |
+
[titan] 2026-01-06 22:16:17,356 - root - INFO - [31mstep: 149 [32mloss: 7.2913 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 617 |
+
[titan] 2026-01-06 22:16:17,357 - root - INFO - [34mlr: 5.8594e-05 gnorm: 3.80 [35m[ 1:59:01<1 day, 14:54:53][39m
|
| 618 |
+
[titan] 2026-01-06 22:16:17,394 - root - INFO - [GC] Peforming periodical GC collection. 0.04 seconds.
|
| 619 |
+
[titan] 2026-01-06 22:16:58,923 - root - INFO - [31mstep: 150 [32mloss: 7.3146 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.25 [35mmfu: 46.23%[39m
|
| 620 |
+
[titan] 2026-01-06 22:16:58,923 - root - INFO - [34mlr: 5.8984e-05 gnorm: 7.06 [35m[ 1:59:42<1 day, 14:52:01][39m
|
logs/none_4cvjdbqa/attempt_0/3/stdout.log
ADDED
|
File without changes
|
logs/none_4cvjdbqa/attempt_0/4/stderr.log
ADDED
|
@@ -0,0 +1,620 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[titan] 2026-01-06 20:23:28,613 - root - INFO - Starting job: default job
|
| 2 |
+
[titan] 2026-01-06 20:23:28,613 - root - INFO - [32m{
|
| 3 |
+
"activation_checkpoint": {
|
| 4 |
+
"mode": "none",
|
| 5 |
+
"selective_ac_option": "2"
|
| 6 |
+
},
|
| 7 |
+
"activation_offload": {
|
| 8 |
+
"mode": "none"
|
| 9 |
+
},
|
| 10 |
+
"checkpoint": {
|
| 11 |
+
"async_mode": "disabled",
|
| 12 |
+
"create_seed_checkpoint": false,
|
| 13 |
+
"enable_checkpoint": true,
|
| 14 |
+
"exclude_from_loading": [],
|
| 15 |
+
"export_dtype": "float32",
|
| 16 |
+
"folder": "checkpoint",
|
| 17 |
+
"initial_load_model_weights_only": true,
|
| 18 |
+
"initial_load_path": null,
|
| 19 |
+
"interval": 16,
|
| 20 |
+
"interval_type": "steps",
|
| 21 |
+
"keep_latest_k": 0,
|
| 22 |
+
"last_save_model_weights_only": false,
|
| 23 |
+
"load_step": -1,
|
| 24 |
+
"model_weights_only": false
|
| 25 |
+
},
|
| 26 |
+
"comm": {
|
| 27 |
+
"init_timeout_seconds": 300,
|
| 28 |
+
"trace_buf_size": 20000,
|
| 29 |
+
"train_timeout_seconds": 100
|
| 30 |
+
},
|
| 31 |
+
"experimental": {
|
| 32 |
+
"context_parallel_degree": 1,
|
| 33 |
+
"context_parallel_rotate_method": "allgather",
|
| 34 |
+
"custom_model_path": "",
|
| 35 |
+
"enable_async_tensor_parallel": false,
|
| 36 |
+
"enable_compiled_autograd": false,
|
| 37 |
+
"pipeline_parallel_degree": 1,
|
| 38 |
+
"pipeline_parallel_microbatches": null,
|
| 39 |
+
"pipeline_parallel_schedule": "1F1B",
|
| 40 |
+
"pipeline_parallel_schedule_csv": "",
|
| 41 |
+
"pipeline_parallel_split_points": []
|
| 42 |
+
},
|
| 43 |
+
"fault_tolerance": {
|
| 44 |
+
"enable": false,
|
| 45 |
+
"group_size": 0,
|
| 46 |
+
"min_replica_size": 1,
|
| 47 |
+
"replica_id": 0
|
| 48 |
+
},
|
| 49 |
+
"float8": {
|
| 50 |
+
"enable_fsdp_float8_all_gather": false,
|
| 51 |
+
"force_recompute_fp8_weight_in_bwd": false,
|
| 52 |
+
"precompute_float8_dynamic_scale_for_fsdp": false,
|
| 53 |
+
"recipe_name": null
|
| 54 |
+
},
|
| 55 |
+
"job": {
|
| 56 |
+
"config_file": "flame/models/fla.toml",
|
| 57 |
+
"description": "default job",
|
| 58 |
+
"dump_folder": "/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B",
|
| 59 |
+
"print_args": true,
|
| 60 |
+
"use_for_integration_test": false
|
| 61 |
+
},
|
| 62 |
+
"lr_scheduler": {
|
| 63 |
+
"decay_ratio": null,
|
| 64 |
+
"decay_type": "cosine",
|
| 65 |
+
"lr_min": 0.1,
|
| 66 |
+
"warmup_steps": 1024
|
| 67 |
+
},
|
| 68 |
+
"memory_estimation": {
|
| 69 |
+
"disable_fake_mode": false,
|
| 70 |
+
"enabled": false
|
| 71 |
+
},
|
| 72 |
+
"metrics": {
|
| 73 |
+
"disable_color_printing": false,
|
| 74 |
+
"enable_tensorboard": false,
|
| 75 |
+
"enable_wandb": true,
|
| 76 |
+
"log_freq": 1,
|
| 77 |
+
"save_for_all_ranks": false,
|
| 78 |
+
"save_tb_folder": "tb"
|
| 79 |
+
},
|
| 80 |
+
"model": {
|
| 81 |
+
"config": "/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/config.json",
|
| 82 |
+
"converters": [],
|
| 83 |
+
"name": "fla",
|
| 84 |
+
"print_after_conversion": false,
|
| 85 |
+
"tokenizer_path": "/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B"
|
| 86 |
+
},
|
| 87 |
+
"optimizer": {
|
| 88 |
+
"beta1": 0.9,
|
| 89 |
+
"beta2": 0.95,
|
| 90 |
+
"early_step_in_backward": false,
|
| 91 |
+
"eps": 1e-15,
|
| 92 |
+
"implementation": "fused",
|
| 93 |
+
"lr": 0.0004,
|
| 94 |
+
"name": "AdamW",
|
| 95 |
+
"weight_decay": 0.1
|
| 96 |
+
},
|
| 97 |
+
"profiling": {
|
| 98 |
+
"enable_memory_snapshot": false,
|
| 99 |
+
"enable_profiling": true,
|
| 100 |
+
"profile_freq": 512,
|
| 101 |
+
"save_memory_snapshot_folder": "memory_snapshot",
|
| 102 |
+
"save_traces_folder": "profile_trace"
|
| 103 |
+
},
|
| 104 |
+
"training": {
|
| 105 |
+
"batch_size": 2,
|
| 106 |
+
"compile": true,
|
| 107 |
+
"context_len": 2048,
|
| 108 |
+
"data_dir": null,
|
| 109 |
+
"data_files": null,
|
| 110 |
+
"data_parallel_replicate_degree": 1,
|
| 111 |
+
"data_parallel_shard_degree": 8,
|
| 112 |
+
"data_probs": null,
|
| 113 |
+
"dataset": "/mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu",
|
| 114 |
+
"dataset_name": "default",
|
| 115 |
+
"dataset_split": "train",
|
| 116 |
+
"deterministic": false,
|
| 117 |
+
"disable_loss_parallel": true,
|
| 118 |
+
"enable_cpu_offload": false,
|
| 119 |
+
"fsdp_reshard_after_forward": "default",
|
| 120 |
+
"gc_freq": 50,
|
| 121 |
+
"gradient_accumulation_steps": 16,
|
| 122 |
+
"max_norm": 1.0,
|
| 123 |
+
"mixed_precision_param": "bfloat16",
|
| 124 |
+
"mixed_precision_reduce": "float32",
|
| 125 |
+
"num_workers": 8,
|
| 126 |
+
"persistent_workers": false,
|
| 127 |
+
"pin_memory": false,
|
| 128 |
+
"prefetch_factor": 2,
|
| 129 |
+
"seed": 42,
|
| 130 |
+
"seq_len": 2048,
|
| 131 |
+
"skip_nan_inf": true,
|
| 132 |
+
"steps": 3072,
|
| 133 |
+
"streaming": true,
|
| 134 |
+
"tensor_parallel_degree": 1,
|
| 135 |
+
"varlen": false
|
| 136 |
+
}
|
| 137 |
+
}[39m
|
| 138 |
+
[titan] 2026-01-06 20:23:28,614 - root - INFO - [GC] Initial GC collection. 0.00 seconds.
|
| 139 |
+
[titan] 2026-01-06 20:23:29,971 - root - WARNING - ENV[TORCH_NCCL_ASYNC_ERROR_HANDLING] = 1 will be overridden to 3 based on job config
|
| 140 |
+
[titan] 2026-01-06 20:23:29,975 - root - INFO - CUDA capacity: NVIDIA A100-SXM4-80GB with 79.25GiB memory
|
| 141 |
+
[titan] 2026-01-06 20:23:29,977 - root - WARNING - Error running lspci: [Errno 2] No such file or directory: 'lspci', fallback to use device_name
|
| 142 |
+
[titan] 2026-01-06 20:23:29,977 - root - INFO - Peak FLOPS used for computing MFU: 3.120e+14
|
| 143 |
+
[titan] 2026-01-06 20:23:29,977 - root - INFO - Building 1-D device mesh with ['dp_shard'], [8]
|
| 144 |
+
[titan] 2026-01-06 20:23:30,052 - root - INFO - Loading tokenizer...
|
| 145 |
+
The tokenizer you are loading from '/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.
|
| 146 |
+
[titan] 2026-01-06 20:23:30,412 - root - INFO - Qwen2TokenizerFast(name_or_path='/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B', vocab_size=151643, model_max_length=10000000000, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'eos_token': '<|im_end|>', 'pad_token': '<|endoftext|>', 'additional_special_tokens': ['<|im_start|>', '<|im_end|>', '<|object_ref_start|>', '<|object_ref_end|>', '<|box_start|>', '<|box_end|>', '<|quad_start|>', '<|quad_end|>', '<|vision_start|>', '<|vision_end|>', '<|vision_pad|>', '<|image_pad|>', '<|video_pad|>']}, clean_up_tokenization_spaces=False, added_tokens_decoder={
|
| 147 |
+
151643: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 148 |
+
151644: AddedToken("<|im_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 149 |
+
151645: AddedToken("<|im_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 150 |
+
151646: AddedToken("<|object_ref_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 151 |
+
151647: AddedToken("<|object_ref_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 152 |
+
151648: AddedToken("<|box_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 153 |
+
151649: AddedToken("<|box_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 154 |
+
151650: AddedToken("<|quad_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 155 |
+
151651: AddedToken("<|quad_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 156 |
+
151652: AddedToken("<|vision_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 157 |
+
151653: AddedToken("<|vision_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 158 |
+
151654: AddedToken("<|vision_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 159 |
+
151655: AddedToken("<|image_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 160 |
+
151656: AddedToken("<|video_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 161 |
+
151657: AddedToken("<tool_call>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 162 |
+
151658: AddedToken("</tool_call>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 163 |
+
151659: AddedToken("<|fim_prefix|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 164 |
+
151660: AddedToken("<|fim_middle|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 165 |
+
151661: AddedToken("<|fim_suffix|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 166 |
+
151662: AddedToken("<|fim_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 167 |
+
151663: AddedToken("<|repo_name|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 168 |
+
151664: AddedToken("<|file_sep|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 169 |
+
151665: AddedToken("<tool_response>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 170 |
+
151666: AddedToken("</tool_response>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 171 |
+
151667: AddedToken("<think>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 172 |
+
151668: AddedToken("</think>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 173 |
+
}
|
| 174 |
+
)
|
| 175 |
+
[titan] 2026-01-06 20:23:30,412 - root - INFO - Loading dataset /mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu:default
|
| 176 |
+
`trust_remote_code` is not supported anymore.
|
| 177 |
+
Please check that the Hugging Face dataset '/mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu' isn't based on a loading script and remove `trust_remote_code`.
|
| 178 |
+
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
|
| 179 |
+
[titan] 2026-01-06 20:23:30,412 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
|
| 180 |
+
Please check that the Hugging Face dataset '/mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu' isn't based on a loading script and remove `trust_remote_code`.
|
| 181 |
+
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
|
| 182 |
+
[titan] 2026-01-06 20:23:30,977 - root - INFO - Shuffling the dataset with seed 42
|
| 183 |
+
[titan] 2026-01-06 20:23:30,978 - root - INFO - IterableDataset({
|
| 184 |
+
features: ['text', 'id', 'dump', 'url', 'file_path', 'language', 'language_score', 'token_count', 'score', 'int_score'],
|
| 185 |
+
num_shards: 360
|
| 186 |
+
})
|
| 187 |
+
[titan] 2026-01-06 20:23:30,978 - root - INFO - Building dataloader...
|
| 188 |
+
[titan] 2026-01-06 20:23:30,980 - root - INFO - Loading model config from /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/config.json
|
| 189 |
+
[titan] 2026-01-06 20:23:30,981 - root - INFO - Building model from the config
|
| 190 |
+
[32mGSAConfig {
|
| 191 |
+
"architectures": [
|
| 192 |
+
"GSAForCausalLM"
|
| 193 |
+
],
|
| 194 |
+
"attn": null,
|
| 195 |
+
"bos_token_id": 151643,
|
| 196 |
+
"clamp_max": null,
|
| 197 |
+
"clamp_min": null,
|
| 198 |
+
"conv_size": 4,
|
| 199 |
+
"dtype": "bfloat16",
|
| 200 |
+
"elementwise_affine": false,
|
| 201 |
+
"eos_token_id": 151645,
|
| 202 |
+
"expand_k": 1,
|
| 203 |
+
"expand_v": 1,
|
| 204 |
+
"feature_map": "swish",
|
| 205 |
+
"fuse_cross_entropy": true,
|
| 206 |
+
"fuse_linear_cross_entropy": false,
|
| 207 |
+
"fuse_norm": true,
|
| 208 |
+
"fuse_swiglu": true,
|
| 209 |
+
"gate_logit_normalizer": 8,
|
| 210 |
+
"hidden_act": "swish",
|
| 211 |
+
"hidden_ratio": 4,
|
| 212 |
+
"hidden_size": 5120,
|
| 213 |
+
"initializer_range": 0.02,
|
| 214 |
+
"intermediate_size": 17408,
|
| 215 |
+
"max_position_embeddings": 40960,
|
| 216 |
+
"model_type": "gsa",
|
| 217 |
+
"norm_eps": 1e-06,
|
| 218 |
+
"num_heads": 40,
|
| 219 |
+
"num_hidden_layers": 40,
|
| 220 |
+
"num_kv_heads": 8,
|
| 221 |
+
"num_slots": 256,
|
| 222 |
+
"rope_theta": 1000000,
|
| 223 |
+
"share_conv_kernel": true,
|
| 224 |
+
"tie_word_embeddings": true,
|
| 225 |
+
"transformers_version": "4.57.3",
|
| 226 |
+
"use_cache": true,
|
| 227 |
+
"use_l2warp": false,
|
| 228 |
+
"use_norm": true,
|
| 229 |
+
"use_output_gate": true,
|
| 230 |
+
"use_rope": false,
|
| 231 |
+
"use_short_conv": false,
|
| 232 |
+
"vocab_size": 151936
|
| 233 |
+
}
|
| 234 |
+
[39m
|
| 235 |
+
[titan] 2026-01-06 20:23:31,129 - root - INFO - [34m
|
| 236 |
+
GSAForCausalLM(
|
| 237 |
+
(model): GSAModel(
|
| 238 |
+
(embeddings): Embedding(151936, 5120)
|
| 239 |
+
(layers): ModuleList(
|
| 240 |
+
(0-39): 40 x GSABlock(
|
| 241 |
+
(attn_norm): RMSNorm(5120, eps=1e-06)
|
| 242 |
+
(attn): GatedSlotAttention(
|
| 243 |
+
(feature_map): SwishFeatureMap()
|
| 244 |
+
(q_proj): Linear(in_features=5120, out_features=5120, bias=False)
|
| 245 |
+
(k_proj): Linear(in_features=5120, out_features=1024, bias=False)
|
| 246 |
+
(v_proj): Linear(in_features=5120, out_features=1024, bias=False)
|
| 247 |
+
(f_proj): Linear(in_features=5120, out_features=2048, bias=False)
|
| 248 |
+
(g_norm): RMSNorm(5120, elementwise_affine=False, eps=1e-06)
|
| 249 |
+
(o_proj): Linear(in_features=5120, out_features=5120, bias=False)
|
| 250 |
+
)
|
| 251 |
+
(mlp_norm): RMSNorm(5120, eps=1e-06)
|
| 252 |
+
(mlp): GatedMLP(
|
| 253 |
+
(gate_proj): Linear(in_features=5120, out_features=17408, bias=False)
|
| 254 |
+
(up_proj): Linear(in_features=5120, out_features=17408, bias=False)
|
| 255 |
+
(down_proj): Linear(in_features=17408, out_features=5120, bias=False)
|
| 256 |
+
(swiglu_linear): SwiGLULinear()
|
| 257 |
+
)
|
| 258 |
+
)
|
| 259 |
+
)
|
| 260 |
+
(norm): RMSNorm(5120, eps=1e-06)
|
| 261 |
+
)
|
| 262 |
+
(lm_head): Linear(in_features=5120, out_features=151936, bias=False)
|
| 263 |
+
)[39m
|
| 264 |
+
|
| 265 |
+
[titan] 2026-01-06 20:23:31,184 - root - INFO - Compiling each block with torch.compile
|
| 266 |
+
[titan] 2026-01-06 20:23:31,184 - root - INFO - Compiling the embedding, norm, and lm_head layers with torch.compile
|
| 267 |
+
[titan] 2026-01-06 20:23:31,185 - root - INFO - Compiling the entire model with torch.compile
|
| 268 |
+
[titan] 2026-01-06 20:23:31,335 - root - INFO - Applied FSDP to the model
|
| 269 |
+
[titan] 2026-01-06 20:23:31,717 - root - INFO - CUDA memory usage for model: 3.56GiB(4.49%)
|
| 270 |
+
[titan] 2026-01-06 20:23:31,764 - root - INFO - Checkpointing active. Checkpoints will be loaded from and saved to /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/checkpoint
|
| 271 |
+
[titan] 2026-01-06 20:23:31,764 - root - INFO - Loading the checkpoint from /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/checkpoint/step-1.
|
| 272 |
+
[titan] 2026-01-06 20:24:20,451 - root - INFO - [GC] GC collection for checkpoint loading. 0.03 seconds.
|
| 273 |
+
[titan] 2026-01-06 20:24:20,451 - root - INFO - Finished loading the checkpoint in 48.69 seconds.
|
| 274 |
+
[titan] 2026-01-06 20:24:20,662 - root - INFO - CUDA capacity: NVIDIA A100-SXM4-80GB with 79.25GiB memory
|
| 275 |
+
[titan] 2026-01-06 20:24:20,664 - root - WARNING - Error running lspci: [Errno 2] No such file or directory: 'lspci', fallback to use device_name
|
| 276 |
+
[titan] 2026-01-06 20:24:23,325 - root - INFO - Mixed precision training is handled by fully_shard
|
| 277 |
+
[titan] 2026-01-06 20:24:23,325 - root - INFO - [31m***** Running training *****[39m
|
| 278 |
+
[titan] 2026-01-06 20:24:23,325 - root - INFO - [32m Training starts at step 2
|
| 279 |
+
[titan] 2026-01-06 20:24:23,325 - root - INFO - [32m Number of tokens per sequence = 2,048
|
| 280 |
+
[titan] 2026-01-06 20:24:23,325 - root - INFO - [32m Gradient Accumulation steps = 16
|
| 281 |
+
[titan] 2026-01-06 20:24:23,325 - root - INFO - [32m Instantaneous batch size (per device) = 2
|
| 282 |
+
[titan] 2026-01-06 20:24:23,325 - root - INFO - [32m Global batch size (w. parallel, distributed & accumulation) = 256 (524,288 tokens)
|
| 283 |
+
[titan] 2026-01-06 20:24:23,325 - root - INFO - [32m Total optimization steps = 3,072 (1,610,612,736 tokens)
|
| 284 |
+
[titan] 2026-01-06 20:24:23,325 - root - INFO - [32m Warmup steps = 1,024 (536,870,912 tokens)
|
| 285 |
+
[titan] 2026-01-06 20:24:23,326 - root - INFO - [32m Number of parameters = 14,409,815,040 [39m
|
| 286 |
+
[titan] 2026-01-06 20:24:23,326 - root - INFO - Profiling active. Traces will be saved at /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/profile_trace
|
| 287 |
+
/mnt/scratch/home/carlos/leman-labs/flame/.venv/lib/python3.13/site-packages/torch/_dynamo/variables/functions.py:1692: UserWarning: Dynamo detected a call to a `functools.lru_cache`-wrapped function. Dynamo ignores the cache wrapper and directly traces the wrapped function. Silent incorrectness is only a *potential* risk, not something we have observed. Enable TORCH_LOGS="+dynamo" for a DEBUG stack trace.
|
| 288 |
+
torch._dynamo.utils.warn_once(msg)
|
| 289 |
+
/mnt/scratch/home/carlos/leman-labs/flame/.venv/lib/python3.13/site-packages/torch/_dynamo/variables/functions.py:1598: UserWarning: Dynamo does not know how to trace the builtin `cuda_utils.get_device_properties.` This function is either a Python builtin (e.g. _warnings.warn) or a third-party C/C++ Python extension (perhaps created with pybind).
|
| 290 |
+
If it is a Python builtin, please file an issue on GitHub so the PyTorch team can add support for it and see the next case for a workaround.
|
| 291 |
+
If it is a third-party C/C++ Python extension, please either wrap it into a PyTorch-understood custom operator (see https://pytorch.org/tutorials/advanced/custom_ops_landing_page.html for more details) or, if it is traceable, use `torch.compiler.allow_in_graph`.
|
| 292 |
+
torch._dynamo.utils.warn_once(explanation + "\n" + "\n".join(hints))
|
| 293 |
+
[titan] 2026-01-06 20:31:17,558 - root - INFO - [31mstep: 2 [32mloss: 14.3989 [33mmemory: 71.94GiB(90.77%) [34mtps: 157 [36mtflops: 14.38 [35mmfu: 4.61%[39m
|
| 294 |
+
[titan] 2026-01-06 20:31:17,558 - root - INFO - [34mlr: 1.1719e-06 gnorm: 127.00 [35m[ 0:14:01<14 days, 22:49:30][39m
|
| 295 |
+
[titan] 2026-01-06 20:31:58,854 - root - INFO - [31mstep: 3 [32mloss: 14.3925 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,587 [36mtflops: 145.20 [35mmfu: 46.54%[39m
|
| 296 |
+
[titan] 2026-01-06 20:31:58,854 - root - INFO - [34mlr: 1.5625e-06 gnorm: 126.00 [35m[ 0:14:42<10 days, 10:52:24][39m
|
| 297 |
+
[titan] 2026-01-06 20:32:40,204 - root - INFO - [31mstep: 4 [32mloss: 14.2932 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,585 [36mtflops: 145.01 [35mmfu: 46.48%[39m
|
| 298 |
+
[titan] 2026-01-06 20:32:40,205 - root - INFO - [34mlr: 1.9531e-06 gnorm: 125.50 [35m[ 0:15:24<8 days, 4:54:13][39m
|
| 299 |
+
[titan] 2026-01-06 20:33:21,589 - root - INFO - [31mstep: 5 [32mloss: 14.2679 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,584 [36mtflops: 144.89 [35mmfu: 46.44%[39m
|
| 300 |
+
[titan] 2026-01-06 20:33:21,589 - root - INFO - [34mlr: 2.3438e-06 gnorm: 123.50 [35m[ 0:16:05<6 days, 20:31:23][39m
|
| 301 |
+
[titan] 2026-01-06 20:34:03,035 - root - INFO - [31mstep: 6 [32mloss: 13.9921 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,581 [36mtflops: 144.67 [35mmfu: 46.37%[39m
|
| 302 |
+
[titan] 2026-01-06 20:34:03,035 - root - INFO - [34mlr: 2.7344e-06 gnorm: 117.50 [35m[ 0:16:47<5 days, 22:56:26][39m
|
| 303 |
+
[titan] 2026-01-06 20:34:44,524 - root - INFO - [31mstep: 7 [32mloss: 13.8102 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,580 [36mtflops: 144.52 [35mmfu: 46.32%[39m
|
| 304 |
+
[titan] 2026-01-06 20:34:44,524 - root - INFO - [34mlr: 3.1250e-06 gnorm: 112.50 [35m[ 0:17:28<5 days, 7:31:36][39m
|
| 305 |
+
[titan] 2026-01-06 20:35:25,989 - root - INFO - [31mstep: 8 [32mloss: 13.5609 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,581 [36mtflops: 144.60 [35mmfu: 46.35%[39m
|
| 306 |
+
[titan] 2026-01-06 20:35:25,990 - root - INFO - [34mlr: 3.5156e-06 gnorm: 106.50 [35m[ 0:18:09<4 days, 19:57:39][39m
|
| 307 |
+
[titan] 2026-01-06 20:36:07,480 - root - INFO - [31mstep: 9 [32mloss: 13.3683 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,580 [36mtflops: 144.52 [35mmfu: 46.32%[39m
|
| 308 |
+
[titan] 2026-01-06 20:36:07,480 - root - INFO - [34mlr: 3.9063e-06 gnorm: 101.00 [35m[ 0:18:51<4 days, 10:57:54][39m
|
| 309 |
+
[titan] 2026-01-06 20:36:48,975 - root - INFO - [31mstep: 10 [32mloss: 13.1018 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.50 [35mmfu: 46.32%[39m
|
| 310 |
+
[titan] 2026-01-06 20:36:48,975 - root - INFO - [34mlr: 4.2969e-06 gnorm: 94.00 [35m[ 0:19:32<4 days, 3:45:59][39m
|
| 311 |
+
[titan] 2026-01-06 20:37:30,471 - root - INFO - [31mstep: 11 [32mloss: 12.5407 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.50 [35mmfu: 46.31%[39m
|
| 312 |
+
[titan] 2026-01-06 20:37:30,471 - root - INFO - [34mlr: 4.6875e-06 gnorm: 82.00 [35m[ 0:20:14<3 days, 21:52:29][39m
|
| 313 |
+
[titan] 2026-01-06 20:38:11,960 - root - INFO - [31mstep: 12 [32mloss: 12.0106 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,580 [36mtflops: 144.52 [35mmfu: 46.32%[39m
|
| 314 |
+
[titan] 2026-01-06 20:38:11,961 - root - INFO - [34mlr: 5.0781e-06 gnorm: 71.50 [35m[ 0:20:55<3 days, 16:57:45][39m
|
| 315 |
+
[titan] 2026-01-06 20:38:53,462 - root - INFO - [31mstep: 13 [32mloss: 11.5957 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.48 [35mmfu: 46.31%[39m
|
| 316 |
+
[titan] 2026-01-06 20:38:53,463 - root - INFO - [34mlr: 5.4687e-06 gnorm: 68.00 [35m[ 0:21:37<3 days, 12:48:18][39m
|
| 317 |
+
[titan] 2026-01-06 20:39:34,955 - root - INFO - [31mstep: 14 [32mloss: 11.2380 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.51 [35mmfu: 46.32%[39m
|
| 318 |
+
[titan] 2026-01-06 20:39:34,955 - root - INFO - [34mlr: 5.8594e-06 gnorm: 63.25 [35m[ 0:22:18<3 days, 9:14:21][39m
|
| 319 |
+
[titan] 2026-01-06 20:40:16,456 - root - INFO - [31mstep: 15 [32mloss: 10.9153 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.48 [35mmfu: 46.31%[39m
|
| 320 |
+
[titan] 2026-01-06 20:40:16,457 - root - INFO - [34mlr: 6.2500e-06 gnorm: 55.50 [35m[ 0:23:00<3 days, 6:08:53][39m
|
| 321 |
+
[titan] 2026-01-06 20:40:57,973 - root - INFO - [31mstep: 16 [32mloss: 10.6864 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.42 [35mmfu: 46.29%[39m
|
| 322 |
+
[titan] 2026-01-06 20:40:57,974 - root - INFO - [34mlr: 6.6406e-06 gnorm: 57.00 [35m[ 0:23:41<3 days, 3:26:33][39m
|
| 323 |
+
[titan] 2026-01-06 20:40:57,974 - root - INFO - Saving the checkpoint (or staging if async is enabled).
|
| 324 |
+
[titan] 2026-01-06 20:41:27,081 - root - INFO - [GC] GC collection invoked by checkpointer. 0.64 seconds.
|
| 325 |
+
[titan] 2026-01-06 20:41:27,081 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 29.11 seconds.
|
| 326 |
+
[titan] 2026-01-06 20:42:08,985 - root - INFO - [31mstep: 17 [32mloss: 10.3828 [33mmemory: 71.94GiB(90.77%) [34mtps: 923 [36mtflops: 84.44 [35mmfu: 27.06%[39m
|
| 327 |
+
[titan] 2026-01-06 20:42:08,986 - root - INFO - [34mlr: 7.0313e-06 gnorm: 42.50 [35m[ 0:24:52<3 days, 2:31:34][39m
|
| 328 |
+
[titan] 2026-01-06 20:42:50,422 - root - INFO - [31mstep: 18 [32mloss: 10.1659 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,582 [36mtflops: 144.71 [35mmfu: 46.38%[39m
|
| 329 |
+
[titan] 2026-01-06 20:42:50,422 - root - INFO - [34mlr: 7.4219e-06 gnorm: 32.50 [35m[ 0:25:34<3 days, 0:18:56][39m
|
| 330 |
+
[titan] 2026-01-06 20:43:31,924 - root - INFO - [31mstep: 19 [32mloss: 9.9749 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.48 [35mmfu: 46.31%[39m
|
| 331 |
+
[titan] 2026-01-06 20:43:31,925 - root - INFO - [34mlr: 7.8125e-06 gnorm: 26.88 [35m[ 0:26:15<2 days, 22:20:22][39m
|
| 332 |
+
[titan] 2026-01-06 20:44:13,451 - root - INFO - [31mstep: 20 [32mloss: 9.8084 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.39 [35mmfu: 46.28%[39m
|
| 333 |
+
[titan] 2026-01-06 20:44:13,451 - root - INFO - [34mlr: 8.2031e-06 gnorm: 25.62 [35m[ 0:26:57<2 days, 20:33:39][39m
|
| 334 |
+
[titan] 2026-01-06 20:44:54,968 - root - INFO - [31mstep: 21 [32mloss: 9.6201 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.43 [35mmfu: 46.29%[39m
|
| 335 |
+
[titan] 2026-01-06 20:44:54,968 - root - INFO - [34mlr: 8.5938e-06 gnorm: 26.88 [35m[ 0:27:38<2 days, 18:57:01][39m
|
| 336 |
+
[titan] 2026-01-06 20:45:36,491 - root - INFO - [31mstep: 22 [32mloss: 9.4905 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.40 [35mmfu: 46.28%[39m
|
| 337 |
+
[titan] 2026-01-06 20:45:36,491 - root - INFO - [34mlr: 8.9844e-06 gnorm: 25.50 [35m[ 0:28:20<2 days, 17:29:06][39m
|
| 338 |
+
[titan] 2026-01-06 20:46:18,035 - root - INFO - [31mstep: 23 [32mloss: 9.2526 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.33 [35mmfu: 46.26%[39m
|
| 339 |
+
[titan] 2026-01-06 20:46:18,035 - root - INFO - [34mlr: 9.3750e-06 gnorm: 19.12 [35m[ 0:29:02<2 days, 16:08:50][39m
|
| 340 |
+
[titan] 2026-01-06 20:46:59,563 - root - INFO - [31mstep: 24 [32mloss: 9.0528 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.39 [35mmfu: 46.28%[39m
|
| 341 |
+
[titan] 2026-01-06 20:46:59,563 - root - INFO - [34mlr: 9.7656e-06 gnorm: 17.00 [35m[ 0:29:43<2 days, 14:55:09][39m
|
| 342 |
+
[titan] 2026-01-06 20:47:41,099 - root - INFO - [31mstep: 25 [32mloss: 8.8601 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.36 [35mmfu: 46.27%[39m
|
| 343 |
+
[titan] 2026-01-06 20:47:41,099 - root - INFO - [34mlr: 1.0156e-05 gnorm: 14.06 [35m[ 0:30:25<2 days, 13:47:20][39m
|
| 344 |
+
[titan] 2026-01-06 20:48:22,630 - root - INFO - [31mstep: 26 [32mloss: 8.7360 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.38 [35mmfu: 46.27%[39m
|
| 345 |
+
[titan] 2026-01-06 20:48:22,630 - root - INFO - [34mlr: 1.0547e-05 gnorm: 15.44 [35m[ 0:31:06<2 days, 12:44:40][39m
|
| 346 |
+
[titan] 2026-01-06 20:49:04,178 - root - INFO - [31mstep: 27 [32mloss: 8.6182 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.31 [35mmfu: 46.25%[39m
|
| 347 |
+
[titan] 2026-01-06 20:49:04,179 - root - INFO - [34mlr: 1.0937e-05 gnorm: 10.25 [35m[ 0:31:48<2 days, 11:46:37][39m
|
| 348 |
+
[titan] 2026-01-06 20:49:45,725 - root - INFO - [31mstep: 28 [32mloss: 8.5142 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.32 [35mmfu: 46.26%[39m
|
| 349 |
+
[titan] 2026-01-06 20:49:45,725 - root - INFO - [34mlr: 1.1328e-05 gnorm: 9.00 [35m[ 0:32:29<2 days, 10:52:40][39m
|
| 350 |
+
[titan] 2026-01-06 20:50:27,274 - root - INFO - [31mstep: 29 [32mloss: 8.4770 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.31 [35mmfu: 46.25%[39m
|
| 351 |
+
[titan] 2026-01-06 20:50:27,274 - root - INFO - [34mlr: 1.1719e-05 gnorm: 9.44 [35m[ 0:33:11<2 days, 10:02:23][39m
|
| 352 |
+
[titan] 2026-01-06 20:51:08,813 - root - INFO - [31mstep: 30 [32mloss: 8.3888 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.35 [35mmfu: 46.27%[39m
|
| 353 |
+
[titan] 2026-01-06 20:51:08,813 - root - INFO - [34mlr: 1.2109e-05 gnorm: 7.06 [35m[ 0:33:52<2 days, 9:15:24][39m
|
| 354 |
+
[titan] 2026-01-06 20:51:50,370 - root - INFO - [31mstep: 31 [32mloss: 8.3098 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.29 [35mmfu: 46.25%[39m
|
| 355 |
+
[titan] 2026-01-06 20:51:50,370 - root - INFO - [34mlr: 1.2500e-05 gnorm: 5.38 [35m[ 0:34:34<2 days, 8:31:26][39m
|
| 356 |
+
[titan] 2026-01-06 20:52:31,910 - root - INFO - [31mstep: 32 [32mloss: 8.2507 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.35 [35mmfu: 46.26%[39m
|
| 357 |
+
[titan] 2026-01-06 20:52:31,910 - root - INFO - [34mlr: 1.2891e-05 gnorm: 6.97 [35m[ 0:35:15<2 days, 7:50:08][39m
|
| 358 |
+
[titan] 2026-01-06 20:52:31,910 - root - INFO - Saving the checkpoint (or staging if async is enabled).
|
| 359 |
+
[titan] 2026-01-06 20:52:52,195 - root - INFO - [GC] GC collection invoked by checkpointer. 0.20 seconds.
|
| 360 |
+
[titan] 2026-01-06 20:52:52,196 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 20.29 seconds.
|
| 361 |
+
[titan] 2026-01-06 20:53:33,590 - root - INFO - [31mstep: 33 [32mloss: 8.1782 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,063 [36mtflops: 97.21 [35mmfu: 31.16%[39m
|
| 362 |
+
[titan] 2026-01-06 20:53:33,590 - root - INFO - [34mlr: 1.3281e-05 gnorm: 4.94 [35m[ 0:36:17<2 days, 7:42:13][39m
|
| 363 |
+
[titan] 2026-01-06 20:54:15,059 - root - INFO - [31mstep: 34 [32mloss: 8.1399 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,580 [36mtflops: 144.59 [35mmfu: 46.34%[39m
|
| 364 |
+
[titan] 2026-01-06 20:54:15,059 - root - INFO - [34mlr: 1.3672e-05 gnorm: 4.62 [35m[ 0:36:59<2 days, 7:04:37][39m
|
| 365 |
+
[titan] 2026-01-06 20:54:56,546 - root - INFO - [31mstep: 35 [32mloss: 8.1046 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,580 [36mtflops: 144.53 [35mmfu: 46.32%[39m
|
| 366 |
+
[titan] 2026-01-06 20:54:56,546 - root - INFO - [34mlr: 1.4063e-05 gnorm: 4.69 [35m[ 0:37:40<2 days, 6:29:08][39m
|
| 367 |
+
[titan] 2026-01-06 20:55:38,070 - root - INFO - [31mstep: 36 [32mloss: 8.0122 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.40 [35mmfu: 46.28%[39m
|
| 368 |
+
[titan] 2026-01-06 20:55:38,070 - root - INFO - [34mlr: 1.4453e-05 gnorm: 2.75 [35m[ 0:38:22<2 days, 5:55:38][39m
|
| 369 |
+
[titan] 2026-01-06 20:56:19,603 - root - INFO - [31mstep: 37 [32mloss: 8.0874 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 370 |
+
[titan] 2026-01-06 20:56:19,603 - root - INFO - [34mlr: 1.4844e-05 gnorm: 4.84 [35m[ 0:39:03<2 days, 5:23:56][39m
|
| 371 |
+
[titan] 2026-01-06 20:57:01,137 - root - INFO - [31mstep: 38 [32mloss: 8.0173 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 372 |
+
[titan] 2026-01-06 20:57:01,137 - root - INFO - [34mlr: 1.5234e-05 gnorm: 3.98 [35m[ 0:39:45<2 days, 4:53:52][39m
|
| 373 |
+
[titan] 2026-01-06 20:57:42,670 - root - INFO - [31mstep: 39 [32mloss: 8.0002 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 374 |
+
[titan] 2026-01-06 20:57:42,671 - root - INFO - [34mlr: 1.5625e-05 gnorm: 3.81 [35m[ 0:40:26<2 days, 4:25:18][39m
|
| 375 |
+
[titan] 2026-01-06 20:58:24,204 - root - INFO - [31mstep: 40 [32mloss: 7.9606 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 376 |
+
[titan] 2026-01-06 20:58:24,204 - root - INFO - [34mlr: 1.6016e-05 gnorm: 2.86 [35m[ 0:41:08<2 days, 3:58:07][39m
|
| 377 |
+
[titan] 2026-01-06 20:59:05,739 - root - INFO - [31mstep: 41 [32mloss: 7.9773 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.36 [35mmfu: 46.27%[39m
|
| 378 |
+
[titan] 2026-01-06 20:59:05,739 - root - INFO - [34mlr: 1.6406e-05 gnorm: 3.56 [35m[ 0:41:49<2 days, 3:32:14][39m
|
| 379 |
+
[titan] 2026-01-06 20:59:47,255 - root - INFO - [31mstep: 42 [32mloss: 7.9890 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.43 [35mmfu: 46.29%[39m
|
| 380 |
+
[titan] 2026-01-06 20:59:47,256 - root - INFO - [34mlr: 1.6797e-05 gnorm: 4.75 [35m[ 0:42:31<2 days, 3:07:32][39m
|
| 381 |
+
[titan] 2026-01-06 21:00:28,788 - root - INFO - [31mstep: 43 [32mloss: 7.9018 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 382 |
+
[titan] 2026-01-06 21:00:28,789 - root - INFO - [34mlr: 1.7188e-05 gnorm: 3.48 [35m[ 0:43:12<2 days, 2:43:58][39m
|
| 383 |
+
[titan] 2026-01-06 21:01:10,328 - root - INFO - [31mstep: 44 [32mloss: 7.8441 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.35 [35mmfu: 46.26%[39m
|
| 384 |
+
[titan] 2026-01-06 21:01:10,328 - root - INFO - [34mlr: 1.7578e-05 gnorm: 3.89 [35m[ 0:43:54<2 days, 2:21:27][39m
|
| 385 |
+
[titan] 2026-01-06 21:01:51,869 - root - INFO - [31mstep: 45 [32mloss: 7.8679 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.34 [35mmfu: 46.26%[39m
|
| 386 |
+
[titan] 2026-01-06 21:01:51,869 - root - INFO - [34mlr: 1.7969e-05 gnorm: 6.41 [35m[ 0:44:35<2 days, 1:59:54][39m
|
| 387 |
+
[titan] 2026-01-06 21:02:33,408 - root - INFO - [31mstep: 46 [32mloss: 7.7830 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.35 [35mmfu: 46.26%[39m
|
| 388 |
+
[titan] 2026-01-06 21:02:33,408 - root - INFO - [34mlr: 1.8359e-05 gnorm: 3.52 [35m[ 0:45:17<2 days, 1:39:16][39m
|
| 389 |
+
[titan] 2026-01-06 21:03:14,961 - root - INFO - [31mstep: 47 [32mloss: 7.8372 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.30 [35mmfu: 46.25%[39m
|
| 390 |
+
[titan] 2026-01-06 21:03:14,961 - root - INFO - [34mlr: 1.8750e-05 gnorm: 2.22 [35m[ 0:45:58<2 days, 1:19:29][39m
|
| 391 |
+
[titan] 2026-01-06 21:03:56,497 - root - INFO - [31mstep: 48 [32mloss: 7.8147 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.36 [35mmfu: 46.27%[39m
|
| 392 |
+
[titan] 2026-01-06 21:03:56,497 - root - INFO - [34mlr: 1.9141e-05 gnorm: 3.70 [35m[ 0:46:40<2 days, 1:00:29][39m
|
| 393 |
+
[titan] 2026-01-06 21:03:56,497 - root - INFO - Saving the checkpoint (or staging if async is enabled).
|
| 394 |
+
[titan] 2026-01-06 21:04:16,566 - root - INFO - [GC] GC collection invoked by checkpointer. 0.18 seconds.
|
| 395 |
+
[titan] 2026-01-06 21:04:16,566 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 20.07 seconds.
|
| 396 |
+
[titan] 2026-01-06 21:04:57,970 - root - INFO - [31mstep: 49 [32mloss: 7.6970 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,066 [36mtflops: 97.54 [35mmfu: 31.26%[39m
|
| 397 |
+
[titan] 2026-01-06 21:04:57,970 - root - INFO - [34mlr: 1.9531e-05 gnorm: 5.28 [35m[ 0:47:41<2 days, 1:02:43][39m
|
| 398 |
+
[titan] 2026-01-06 21:04:57,981 - root - INFO - [GC] Peforming periodical GC collection. 0.01 seconds.
|
| 399 |
+
[titan] 2026-01-06 21:05:39,421 - root - INFO - [31mstep: 50 [32mloss: 7.7536 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,581 [36mtflops: 144.65 [35mmfu: 46.36%[39m
|
| 400 |
+
[titan] 2026-01-06 21:05:39,422 - root - INFO - [34mlr: 1.9922e-05 gnorm: 4.06 [35m[ 0:48:23<2 days, 0:44:40][39m
|
| 401 |
+
[titan] 2026-01-06 21:06:20,891 - root - INFO - [31mstep: 51 [32mloss: 7.7578 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,580 [36mtflops: 144.59 [35mmfu: 46.34%[39m
|
| 402 |
+
[titan] 2026-01-06 21:06:20,891 - root - INFO - [34mlr: 2.0313e-05 gnorm: 5.03 [35m[ 0:49:04<2 days, 0:27:19][39m
|
| 403 |
+
[titan] 2026-01-06 21:07:02,402 - root - INFO - [31mstep: 52 [32mloss: 7.7586 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.45 [35mmfu: 46.30%[39m
|
| 404 |
+
[titan] 2026-01-06 21:07:02,402 - root - INFO - [34mlr: 2.0703e-05 gnorm: 2.52 [35m[ 0:49:46<2 days, 0:10:39][39m
|
| 405 |
+
[titan] 2026-01-06 21:07:43,930 - root - INFO - [31mstep: 53 [32mloss: 7.7823 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.38 [35mmfu: 46.28%[39m
|
| 406 |
+
[titan] 2026-01-06 21:07:43,930 - root - INFO - [34mlr: 2.1094e-05 gnorm: 11.69 [35m[ 0:50:27<1 day, 23:54:35][39m
|
| 407 |
+
[titan] 2026-01-06 21:08:25,460 - root - INFO - [31mstep: 54 [32mloss: 7.7454 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.38 [35mmfu: 46.28%[39m
|
| 408 |
+
[titan] 2026-01-06 21:08:25,460 - root - INFO - [34mlr: 2.1484e-05 gnorm: 10.25 [35m[ 0:51:09<1 day, 23:39:06][39m
|
| 409 |
+
[titan] 2026-01-06 21:09:07,002 - root - INFO - [31mstep: 55 [32mloss: 7.6959 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.34 [35mmfu: 46.26%[39m
|
| 410 |
+
[titan] 2026-01-06 21:09:07,002 - root - INFO - [34mlr: 2.1875e-05 gnorm: 3.77 [35m[ 0:51:50<1 day, 23:24:10][39m
|
| 411 |
+
[titan] 2026-01-06 21:09:48,535 - root - INFO - [31mstep: 56 [32mloss: 7.7100 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 412 |
+
[titan] 2026-01-06 21:09:48,536 - root - INFO - [34mlr: 2.2266e-05 gnorm: 5.50 [35m[ 0:52:32<1 day, 23:09:44][39m
|
| 413 |
+
[titan] 2026-01-06 21:10:30,084 - root - INFO - [31mstep: 57 [32mloss: 7.6427 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.32 [35mmfu: 46.25%[39m
|
| 414 |
+
[titan] 2026-01-06 21:10:30,084 - root - INFO - [34mlr: 2.2656e-05 gnorm: 3.45 [35m[ 0:53:14<1 day, 22:55:48][39m
|
| 415 |
+
[titan] 2026-01-06 21:11:11,627 - root - INFO - [31mstep: 58 [32mloss: 7.7081 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.33 [35mmfu: 46.26%[39m
|
| 416 |
+
[titan] 2026-01-06 21:11:11,628 - root - INFO - [34mlr: 2.3047e-05 gnorm: 7.88 [35m[ 0:53:55<1 day, 22:42:19][39m
|
| 417 |
+
[titan] 2026-01-06 21:11:53,169 - root - INFO - [31mstep: 59 [32mloss: 7.6955 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.34 [35mmfu: 46.26%[39m
|
| 418 |
+
[titan] 2026-01-06 21:11:53,170 - root - INFO - [34mlr: 2.3438e-05 gnorm: 7.16 [35m[ 0:54:37<1 day, 22:29:15][39m
|
| 419 |
+
[titan] 2026-01-06 21:12:34,708 - root - INFO - [31mstep: 60 [32mloss: 7.6458 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.35 [35mmfu: 46.27%[39m
|
| 420 |
+
[titan] 2026-01-06 21:12:34,709 - root - INFO - [34mlr: 2.3828e-05 gnorm: 3.22 [35m[ 0:55:18<1 day, 22:16:37][39m
|
| 421 |
+
[titan] 2026-01-06 21:13:16,244 - root - INFO - [31mstep: 61 [32mloss: 7.6709 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.36 [35mmfu: 46.27%[39m
|
| 422 |
+
[titan] 2026-01-06 21:13:16,244 - root - INFO - [34mlr: 2.4219e-05 gnorm: 7.56 [35m[ 0:56:00<1 day, 22:04:21][39m
|
| 423 |
+
[titan] 2026-01-06 21:13:57,793 - root - INFO - [31mstep: 62 [32mloss: 7.6777 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.31 [35mmfu: 46.25%[39m
|
| 424 |
+
[titan] 2026-01-06 21:13:57,793 - root - INFO - [34mlr: 2.4609e-05 gnorm: 5.00 [35m[ 0:56:41<1 day, 21:52:29][39m
|
| 425 |
+
[titan] 2026-01-06 21:14:39,339 - root - INFO - [31mstep: 63 [32mloss: 7.6421 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.32 [35mmfu: 46.26%[39m
|
| 426 |
+
[titan] 2026-01-06 21:14:39,340 - root - INFO - [34mlr: 2.5000e-05 gnorm: 6.81 [35m[ 0:57:23<1 day, 21:40:58][39m
|
| 427 |
+
[titan] 2026-01-06 21:15:20,872 - root - INFO - [31mstep: 64 [32mloss: 7.6401 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 428 |
+
[titan] 2026-01-06 21:15:20,872 - root - INFO - [34mlr: 2.5391e-05 gnorm: 6.72 [35m[ 0:58:04<1 day, 21:29:47][39m
|
| 429 |
+
[titan] 2026-01-06 21:15:20,872 - root - INFO - Saving the checkpoint (or staging if async is enabled).
|
| 430 |
+
[titan] 2026-01-06 21:15:41,925 - root - INFO - [GC] GC collection invoked by checkpointer. 0.16 seconds.
|
| 431 |
+
[titan] 2026-01-06 21:15:41,925 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 21.05 seconds.
|
| 432 |
+
[titan] 2026-01-06 21:16:23,249 - root - INFO - [31mstep: 65 [32mloss: 7.6475 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,051 [36mtflops: 96.13 [35mmfu: 30.81%[39m
|
| 433 |
+
[titan] 2026-01-06 21:16:23,249 - root - INFO - [34mlr: 2.5781e-05 gnorm: 5.00 [35m[ 0:59:07<1 day, 21:34:59][39m
|
| 434 |
+
[titan] 2026-01-06 21:17:04,689 - root - INFO - [31mstep: 66 [32mloss: 7.7008 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,581 [36mtflops: 144.69 [35mmfu: 46.38%[39m
|
| 435 |
+
[titan] 2026-01-06 21:17:04,689 - root - INFO - [34mlr: 2.6172e-05 gnorm: 9.69 [35m[ 0:59:48<1 day, 21:24:06][39m
|
| 436 |
+
[titan] 2026-01-06 21:17:46,152 - root - INFO - [31mstep: 67 [32mloss: 7.6772 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,581 [36mtflops: 144.61 [35mmfu: 46.35%[39m
|
| 437 |
+
[titan] 2026-01-06 21:17:46,153 - root - INFO - [34mlr: 2.6563e-05 gnorm: 8.06 [35m[ 1:00:30<1 day, 21:13:33][39m
|
| 438 |
+
[titan] 2026-01-06 21:18:27,650 - root - INFO - [31mstep: 68 [32mloss: 7.6251 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.49 [35mmfu: 46.31%[39m
|
| 439 |
+
[titan] 2026-01-06 21:18:27,650 - root - INFO - [34mlr: 2.6953e-05 gnorm: 7.88 [35m[ 1:01:11<1 day, 21:03:18][39m
|
| 440 |
+
[titan] 2026-01-06 21:19:09,166 - root - INFO - [31mstep: 69 [32mloss: 7.6183 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.43 [35mmfu: 46.29%[39m
|
| 441 |
+
[titan] 2026-01-06 21:19:09,166 - root - INFO - [34mlr: 2.7344e-05 gnorm: 4.00 [35m[ 1:01:53<1 day, 20:53:21][39m
|
| 442 |
+
[titan] 2026-01-06 21:19:50,686 - root - INFO - [31mstep: 70 [32mloss: 7.6535 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.41 [35mmfu: 46.29%[39m
|
| 443 |
+
[titan] 2026-01-06 21:19:50,687 - root - INFO - [34mlr: 2.7734e-05 gnorm: 17.75 [35m[ 1:02:34<1 day, 20:43:40][39m
|
| 444 |
+
[titan] 2026-01-06 21:20:32,220 - root - INFO - [31mstep: 71 [32mloss: 7.6713 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 445 |
+
[titan] 2026-01-06 21:20:32,221 - root - INFO - [34mlr: 2.8125e-05 gnorm: 15.69 [35m[ 1:03:16<1 day, 20:34:15][39m
|
| 446 |
+
[titan] 2026-01-06 21:21:13,759 - root - INFO - [31mstep: 72 [32mloss: 7.5969 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.35 [35mmfu: 46.27%[39m
|
| 447 |
+
[titan] 2026-01-06 21:21:13,759 - root - INFO - [34mlr: 2.8516e-05 gnorm: 5.00 [35m[ 1:03:57<1 day, 20:25:04][39m
|
| 448 |
+
[titan] 2026-01-06 21:21:55,296 - root - INFO - [31mstep: 73 [32mloss: 7.6514 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.35 [35mmfu: 46.27%[39m
|
| 449 |
+
[titan] 2026-01-06 21:21:55,296 - root - INFO - [34mlr: 2.8906e-05 gnorm: 7.84 [35m[ 1:04:39<1 day, 20:16:08][39m
|
| 450 |
+
[titan] 2026-01-06 21:22:36,834 - root - INFO - [31mstep: 74 [32mloss: 7.6118 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.35 [35mmfu: 46.27%[39m
|
| 451 |
+
[titan] 2026-01-06 21:22:36,834 - root - INFO - [34mlr: 2.9297e-05 gnorm: 5.53 [35m[ 1:05:20<1 day, 20:07:24][39m
|
| 452 |
+
[titan] 2026-01-06 21:23:18,373 - root - INFO - [31mstep: 75 [32mloss: 7.6545 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.35 [35mmfu: 46.26%[39m
|
| 453 |
+
[titan] 2026-01-06 21:23:18,373 - root - INFO - [34mlr: 2.9687e-05 gnorm: 14.88 [35m[ 1:06:02<1 day, 19:58:54][39m
|
| 454 |
+
[titan] 2026-01-06 21:23:59,908 - root - INFO - [31mstep: 76 [32mloss: 7.6091 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.36 [35mmfu: 46.27%[39m
|
| 455 |
+
[titan] 2026-01-06 21:23:59,909 - root - INFO - [34mlr: 3.0078e-05 gnorm: 15.25 [35m[ 1:06:43<1 day, 19:50:36][39m
|
| 456 |
+
[titan] 2026-01-06 21:24:41,441 - root - INFO - [31mstep: 77 [32mloss: 7.5815 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 457 |
+
[titan] 2026-01-06 21:24:41,441 - root - INFO - [34mlr: 3.0469e-05 gnorm: 4.84 [35m[ 1:07:25<1 day, 19:42:30][39m
|
| 458 |
+
[titan] 2026-01-06 21:25:22,983 - root - INFO - [31mstep: 78 [32mloss: 7.6119 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.34 [35mmfu: 46.26%[39m
|
| 459 |
+
[titan] 2026-01-06 21:25:22,983 - root - INFO - [34mlr: 3.0859e-05 gnorm: 9.06 [35m[ 1:08:06<1 day, 19:34:35][39m
|
| 460 |
+
[titan] 2026-01-06 21:26:04,516 - root - INFO - [31mstep: 79 [32mloss: 7.6418 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 461 |
+
[titan] 2026-01-06 21:26:04,516 - root - INFO - [34mlr: 3.1250e-05 gnorm: 8.25 [35m[ 1:08:48<1 day, 19:26:51][39m
|
| 462 |
+
[titan] 2026-01-06 21:26:46,049 - root - INFO - [31mstep: 80 [32mloss: 7.5575 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 463 |
+
[titan] 2026-01-06 21:26:46,049 - root - INFO - [34mlr: 3.1641e-05 gnorm: 6.97 [35m[ 1:09:30<1 day, 19:19:18][39m
|
| 464 |
+
[titan] 2026-01-06 21:26:46,050 - root - INFO - Saving the checkpoint (or staging if async is enabled).
|
| 465 |
+
[titan] 2026-01-06 21:27:08,314 - root - INFO - [GC] GC collection invoked by checkpointer. 0.18 seconds.
|
| 466 |
+
[titan] 2026-01-06 21:27:08,314 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 22.26 seconds.
|
| 467 |
+
[titan] 2026-01-06 21:27:49,686 - root - INFO - [31mstep: 81 [32mloss: 7.6005 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,030 [36mtflops: 94.22 [35mmfu: 30.20%[39m
|
| 468 |
+
[titan] 2026-01-06 21:27:49,686 - root - INFO - [34mlr: 3.2031e-05 gnorm: 7.19 [35m[ 1:10:33<1 day, 19:25:31][39m
|
| 469 |
+
[titan] 2026-01-06 21:28:31,108 - root - INFO - [31mstep: 82 [32mloss: 7.5774 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,582 [36mtflops: 144.76 [35mmfu: 46.40%[39m
|
| 470 |
+
[titan] 2026-01-06 21:28:31,108 - root - INFO - [34mlr: 3.2422e-05 gnorm: 5.62 [35m[ 1:11:15<1 day, 19:18:03][39m
|
| 471 |
+
[titan] 2026-01-06 21:29:12,555 - root - INFO - [31mstep: 83 [32mloss: 7.6207 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,581 [36mtflops: 144.67 [35mmfu: 46.37%[39m
|
| 472 |
+
[titan] 2026-01-06 21:29:12,555 - root - INFO - [34mlr: 3.2813e-05 gnorm: 4.69 [35m[ 1:11:56<1 day, 19:10:46][39m
|
| 473 |
+
[titan] 2026-01-06 21:29:54,023 - root - INFO - [31mstep: 84 [32mloss: 7.5734 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,580 [36mtflops: 144.59 [35mmfu: 46.34%[39m
|
| 474 |
+
[titan] 2026-01-06 21:29:54,024 - root - INFO - [34mlr: 3.3203e-05 gnorm: 10.75 [35m[ 1:12:37<1 day, 19:03:39][39m
|
| 475 |
+
[titan] 2026-01-06 21:30:35,519 - root - INFO - [31mstep: 85 [32mloss: 7.5241 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.50 [35mmfu: 46.31%[39m
|
| 476 |
+
[titan] 2026-01-06 21:30:35,520 - root - INFO - [34mlr: 3.3594e-05 gnorm: 8.69 [35m[ 1:13:19<1 day, 18:56:42][39m
|
| 477 |
+
[titan] 2026-01-06 21:31:17,030 - root - INFO - [31mstep: 86 [32mloss: 7.5827 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.45 [35mmfu: 46.30%[39m
|
| 478 |
+
[titan] 2026-01-06 21:31:17,030 - root - INFO - [34mlr: 3.3984e-05 gnorm: 7.22 [35m[ 1:14:00<1 day, 18:49:55][39m
|
| 479 |
+
[titan] 2026-01-06 21:31:58,543 - root - INFO - [31mstep: 87 [32mloss: 7.5505 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.44 [35mmfu: 46.29%[39m
|
| 480 |
+
[titan] 2026-01-06 21:31:58,543 - root - INFO - [34mlr: 3.4375e-05 gnorm: 7.91 [35m[ 1:14:42<1 day, 18:43:15][39m
|
| 481 |
+
[titan] 2026-01-06 21:32:40,071 - root - INFO - [31mstep: 88 [32mloss: 7.5143 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.39 [35mmfu: 46.28%[39m
|
| 482 |
+
[titan] 2026-01-06 21:32:40,071 - root - INFO - [34mlr: 3.4766e-05 gnorm: 8.00 [35m[ 1:15:24<1 day, 18:36:45][39m
|
| 483 |
+
[titan] 2026-01-06 21:33:21,599 - root - INFO - [31mstep: 89 [32mloss: 7.5199 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.39 [35mmfu: 46.28%[39m
|
| 484 |
+
[titan] 2026-01-06 21:33:21,599 - root - INFO - [34mlr: 3.5156e-05 gnorm: 8.62 [35m[ 1:16:05<1 day, 18:30:22][39m
|
| 485 |
+
[titan] 2026-01-06 21:34:03,122 - root - INFO - [31mstep: 90 [32mloss: 7.4785 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.40 [35mmfu: 46.28%[39m
|
| 486 |
+
[titan] 2026-01-06 21:34:03,122 - root - INFO - [34mlr: 3.5547e-05 gnorm: 8.12 [35m[ 1:16:47<1 day, 18:24:07][39m
|
| 487 |
+
[titan] 2026-01-06 21:34:44,655 - root - INFO - [31mstep: 91 [32mloss: 7.5003 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 488 |
+
[titan] 2026-01-06 21:34:44,655 - root - INFO - [34mlr: 3.5937e-05 gnorm: 6.97 [35m[ 1:17:28<1 day, 18:18:00][39m
|
| 489 |
+
[titan] 2026-01-06 21:35:26,183 - root - INFO - [31mstep: 92 [32mloss: 7.5113 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.39 [35mmfu: 46.28%[39m
|
| 490 |
+
[titan] 2026-01-06 21:35:26,183 - root - INFO - [34mlr: 3.6328e-05 gnorm: 10.19 [35m[ 1:18:10<1 day, 18:11:59][39m
|
| 491 |
+
[titan] 2026-01-06 21:36:07,712 - root - INFO - [31mstep: 93 [32mloss: 7.4875 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.38 [35mmfu: 46.28%[39m
|
| 492 |
+
[titan] 2026-01-06 21:36:07,712 - root - INFO - [34mlr: 3.6719e-05 gnorm: 4.59 [35m[ 1:18:51<1 day, 18:06:05][39m
|
| 493 |
+
[titan] 2026-01-06 21:36:49,202 - root - INFO - [31mstep: 94 [32mloss: 7.8691 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,580 [36mtflops: 144.52 [35mmfu: 46.32%[39m
|
| 494 |
+
[titan] 2026-01-06 21:36:49,202 - root - INFO - [34mlr: 3.7109e-05 gnorm: 86.50 [35m[ 1:19:33<1 day, 18:00:17][39m
|
| 495 |
+
[titan] 2026-01-06 21:37:30,710 - root - INFO - [31mstep: 95 [32mloss: 7.7993 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.45 [35mmfu: 46.30%[39m
|
| 496 |
+
[titan] 2026-01-06 21:37:30,710 - root - INFO - [34mlr: 3.7500e-05 gnorm: 62.50 [35m[ 1:20:14<1 day, 17:54:36][39m
|
| 497 |
+
[titan] 2026-01-06 21:38:12,247 - root - INFO - [31mstep: 96 [32mloss: 7.6230 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.35 [35mmfu: 46.27%[39m
|
| 498 |
+
[titan] 2026-01-06 21:38:12,247 - root - INFO - [34mlr: 3.7891e-05 gnorm: 17.38 [35m[ 1:20:56<1 day, 17:49:02][39m
|
| 499 |
+
[titan] 2026-01-06 21:38:12,248 - root - INFO - Saving the checkpoint (or staging if async is enabled).
|
| 500 |
+
[titan] 2026-01-06 21:38:32,925 - root - INFO - [GC] GC collection invoked by checkpointer. 0.18 seconds.
|
| 501 |
+
[titan] 2026-01-06 21:38:32,925 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 20.68 seconds.
|
| 502 |
+
[titan] 2026-01-06 21:39:14,269 - root - INFO - [31mstep: 97 [32mloss: 7.5778 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,057 [36mtflops: 96.68 [35mmfu: 30.99%[39m
|
| 503 |
+
[titan] 2026-01-06 21:39:14,269 - root - INFO - [34mlr: 3.8281e-05 gnorm: 17.75 [35m[ 1:21:58<1 day, 17:54:02][39m
|
| 504 |
+
[titan] 2026-01-06 21:39:55,690 - root - INFO - [31mstep: 98 [32mloss: 7.5438 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,582 [36mtflops: 144.76 [35mmfu: 46.40%[39m
|
| 505 |
+
[titan] 2026-01-06 21:39:55,690 - root - INFO - [34mlr: 3.8672e-05 gnorm: 11.75 [35m[ 1:22:39<1 day, 17:48:29][39m
|
| 506 |
+
[titan] 2026-01-06 21:40:37,180 - root - INFO - [31mstep: 99 [32mloss: 7.5091 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,580 [36mtflops: 144.52 [35mmfu: 46.32%[39m
|
| 507 |
+
[titan] 2026-01-06 21:40:37,180 - root - INFO - [34mlr: 3.9063e-05 gnorm: 7.81 [35m[ 1:23:21<1 day, 17:43:05][39m
|
| 508 |
+
[titan] 2026-01-06 21:40:37,207 - root - INFO - [GC] Peforming periodical GC collection. 0.03 seconds.
|
| 509 |
+
[titan] 2026-01-06 21:41:18,706 - root - INFO - [31mstep: 100 [32mloss: 7.4961 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.39 [35mmfu: 46.28%[39m
|
| 510 |
+
[titan] 2026-01-06 21:41:18,706 - root - INFO - [34mlr: 3.9453e-05 gnorm: 7.59 [35m[ 1:24:02<1 day, 17:37:47][39m
|
| 511 |
+
[titan] 2026-01-06 21:42:00,228 - root - INFO - [31mstep: 101 [32mloss: 7.4848 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.41 [35mmfu: 46.28%[39m
|
| 512 |
+
[titan] 2026-01-06 21:42:00,228 - root - INFO - [34mlr: 3.9844e-05 gnorm: 5.97 [35m[ 1:24:44<1 day, 17:32:35][39m
|
| 513 |
+
[titan] 2026-01-06 21:42:41,739 - root - INFO - [31mstep: 102 [32mloss: 7.5118 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.45 [35mmfu: 46.30%[39m
|
| 514 |
+
[titan] 2026-01-06 21:42:41,739 - root - INFO - [34mlr: 4.0234e-05 gnorm: 8.06 [35m[ 1:25:25<1 day, 17:27:27][39m
|
| 515 |
+
[titan] 2026-01-06 21:43:23,265 - root - INFO - [31mstep: 103 [32mloss: 7.4788 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.39 [35mmfu: 46.28%[39m
|
| 516 |
+
[titan] 2026-01-06 21:43:23,265 - root - INFO - [34mlr: 4.0625e-05 gnorm: 10.06 [35m[ 1:26:07<1 day, 17:22:26][39m
|
| 517 |
+
[titan] 2026-01-06 21:44:04,785 - root - INFO - [31mstep: 104 [32mloss: 7.4560 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.41 [35mmfu: 46.29%[39m
|
| 518 |
+
[titan] 2026-01-06 21:44:04,786 - root - INFO - [34mlr: 4.1016e-05 gnorm: 9.50 [35m[ 1:26:48<1 day, 17:17:29][39m
|
| 519 |
+
[titan] 2026-01-06 21:44:46,319 - root - INFO - [31mstep: 105 [32mloss: 7.4534 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 520 |
+
[titan] 2026-01-06 21:44:46,319 - root - INFO - [34mlr: 4.1406e-05 gnorm: 8.44 [35m[ 1:27:30<1 day, 17:12:37][39m
|
| 521 |
+
[titan] 2026-01-06 21:45:27,838 - root - INFO - [31mstep: 106 [32mloss: 7.4770 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.42 [35mmfu: 46.29%[39m
|
| 522 |
+
[titan] 2026-01-06 21:45:27,838 - root - INFO - [34mlr: 4.1797e-05 gnorm: 10.56 [35m[ 1:28:11<1 day, 17:07:50][39m
|
| 523 |
+
[titan] 2026-01-06 21:46:09,374 - root - INFO - [31mstep: 107 [32mloss: 7.4382 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.36 [35mmfu: 46.27%[39m
|
| 524 |
+
[titan] 2026-01-06 21:46:09,374 - root - INFO - [34mlr: 4.2188e-05 gnorm: 13.69 [35m[ 1:28:53<1 day, 17:03:07][39m
|
| 525 |
+
[titan] 2026-01-06 21:46:50,902 - root - INFO - [31mstep: 108 [32mloss: 7.4561 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.39 [35mmfu: 46.28%[39m
|
| 526 |
+
[titan] 2026-01-06 21:46:50,902 - root - INFO - [34mlr: 4.2578e-05 gnorm: 8.69 [35m[ 1:29:34<1 day, 16:58:29][39m
|
| 527 |
+
[titan] 2026-01-06 21:47:32,443 - root - INFO - [31mstep: 109 [32mloss: 7.3967 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.34 [35mmfu: 46.26%[39m
|
| 528 |
+
[titan] 2026-01-06 21:47:32,443 - root - INFO - [34mlr: 4.2969e-05 gnorm: 7.31 [35m[ 1:30:16<1 day, 16:53:56][39m
|
| 529 |
+
[titan] 2026-01-06 21:48:13,976 - root - INFO - [31mstep: 110 [32mloss: 7.4334 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 530 |
+
[titan] 2026-01-06 21:48:13,976 - root - INFO - [34mlr: 4.3359e-05 gnorm: 25.38 [35m[ 1:30:57<1 day, 16:49:26][39m
|
| 531 |
+
[titan] 2026-01-06 21:48:55,511 - root - INFO - [31mstep: 111 [32mloss: 7.4360 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.36 [35mmfu: 46.27%[39m
|
| 532 |
+
[titan] 2026-01-06 21:48:55,511 - root - INFO - [34mlr: 4.3750e-05 gnorm: 10.44 [35m[ 1:31:39<1 day, 16:45:01][39m
|
| 533 |
+
[titan] 2026-01-06 21:49:37,059 - root - INFO - [31mstep: 112 [32mloss: 7.5123 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.32 [35mmfu: 46.26%[39m
|
| 534 |
+
[titan] 2026-01-06 21:49:37,059 - root - INFO - [34mlr: 4.4141e-05 gnorm: 16.88 [35m[ 1:32:20<1 day, 16:40:40][39m
|
| 535 |
+
[titan] 2026-01-06 21:49:37,059 - root - INFO - Saving the checkpoint (or staging if async is enabled).
|
| 536 |
+
[titan] 2026-01-06 21:49:59,564 - root - INFO - [GC] GC collection invoked by checkpointer. 0.13 seconds.
|
| 537 |
+
[titan] 2026-01-06 21:49:59,565 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 22.51 seconds.
|
| 538 |
+
[titan] 2026-01-06 21:50:40,891 - root - INFO - [31mstep: 113 [32mloss: 7.4803 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,027 [36mtflops: 93.93 [35mmfu: 30.11%[39m
|
| 539 |
+
[titan] 2026-01-06 21:50:40,892 - root - INFO - [34mlr: 4.4531e-05 gnorm: 13.06 [35m[ 1:33:24<1 day, 16:46:07][39m
|
| 540 |
+
[titan] 2026-01-06 21:51:22,305 - root - INFO - [31mstep: 114 [32mloss: 7.4859 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,582 [36mtflops: 144.79 [35mmfu: 46.41%[39m
|
| 541 |
+
[titan] 2026-01-06 21:51:22,305 - root - INFO - [34mlr: 4.4922e-05 gnorm: 16.50 [35m[ 1:34:06<1 day, 16:41:45][39m
|
| 542 |
+
[titan] 2026-01-06 21:52:03,747 - root - INFO - [31mstep: 115 [32mloss: 7.4151 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,581 [36mtflops: 144.68 [35mmfu: 46.37%[39m
|
| 543 |
+
[titan] 2026-01-06 21:52:03,748 - root - INFO - [34mlr: 4.5313e-05 gnorm: 13.94 [35m[ 1:34:47<1 day, 16:37:27][39m
|
| 544 |
+
[titan] 2026-01-06 21:52:45,252 - root - INFO - [31mstep: 116 [32mloss: 7.3814 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.47 [35mmfu: 46.30%[39m
|
| 545 |
+
[titan] 2026-01-06 21:52:45,252 - root - INFO - [34mlr: 4.5703e-05 gnorm: 11.69 [35m[ 1:35:29<1 day, 16:33:15][39m
|
| 546 |
+
[titan] 2026-01-06 21:53:26,760 - root - INFO - [31mstep: 117 [32mloss: 7.4033 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.46 [35mmfu: 46.30%[39m
|
| 547 |
+
[titan] 2026-01-06 21:53:26,760 - root - INFO - [34mlr: 4.6094e-05 gnorm: 9.31 [35m[ 1:36:10<1 day, 16:29:07][39m
|
| 548 |
+
[titan] 2026-01-06 21:54:08,279 - root - INFO - [31mstep: 118 [32mloss: 7.4721 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.42 [35mmfu: 46.29%[39m
|
| 549 |
+
[titan] 2026-01-06 21:54:08,279 - root - INFO - [34mlr: 4.6484e-05 gnorm: 20.88 [35m[ 1:36:52<1 day, 16:25:02][39m
|
| 550 |
+
[titan] 2026-01-06 21:54:49,813 - root - INFO - [31mstep: 119 [32mloss: 7.4258 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.36 [35mmfu: 46.27%[39m
|
| 551 |
+
[titan] 2026-01-06 21:54:49,813 - root - INFO - [34mlr: 4.6875e-05 gnorm: 16.62 [35m[ 1:37:33<1 day, 16:21:01][39m
|
| 552 |
+
[titan] 2026-01-06 21:55:31,360 - root - INFO - [31mstep: 120 [32mloss: 7.3951 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.32 [35mmfu: 46.26%[39m
|
| 553 |
+
[titan] 2026-01-06 21:55:31,361 - root - INFO - [34mlr: 4.7266e-05 gnorm: 11.38 [35m[ 1:38:15<1 day, 16:17:04][39m
|
| 554 |
+
[titan] 2026-01-06 21:56:12,904 - root - INFO - [31mstep: 121 [32mloss: 7.3984 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.33 [35mmfu: 46.26%[39m
|
| 555 |
+
[titan] 2026-01-06 21:56:12,904 - root - INFO - [34mlr: 4.7656e-05 gnorm: 10.19 [35m[ 1:38:56<1 day, 16:13:10][39m
|
| 556 |
+
[titan] 2026-01-06 21:56:54,444 - root - INFO - [31mstep: 122 [32mloss: 7.5098 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.34 [35mmfu: 46.26%[39m
|
| 557 |
+
[titan] 2026-01-06 21:56:54,444 - root - INFO - [34mlr: 4.8047e-05 gnorm: 19.38 [35m[ 1:39:38<1 day, 16:09:19][39m
|
| 558 |
+
[titan] 2026-01-06 21:57:35,983 - root - INFO - [31mstep: 123 [32mloss: 7.4071 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.35 [35mmfu: 46.27%[39m
|
| 559 |
+
[titan] 2026-01-06 21:57:35,983 - root - INFO - [34mlr: 4.8438e-05 gnorm: 13.25 [35m[ 1:40:19<1 day, 16:05:31][39m
|
| 560 |
+
[titan] 2026-01-06 21:58:17,525 - root - INFO - [31mstep: 124 [32mloss: 7.4271 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.34 [35mmfu: 46.26%[39m
|
| 561 |
+
[titan] 2026-01-06 21:58:17,525 - root - INFO - [34mlr: 4.8828e-05 gnorm: 11.88 [35m[ 1:41:01<1 day, 16:01:46][39m
|
| 562 |
+
[titan] 2026-01-06 21:58:59,075 - root - INFO - [31mstep: 125 [32mloss: 7.3603 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.31 [35mmfu: 46.25%[39m
|
| 563 |
+
[titan] 2026-01-06 21:58:59,075 - root - INFO - [34mlr: 4.9219e-05 gnorm: 11.50 [35m[ 1:41:43<1 day, 15:58:04][39m
|
| 564 |
+
[titan] 2026-01-06 21:59:40,618 - root - INFO - [31mstep: 126 [32mloss: 7.3625 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.33 [35mmfu: 46.26%[39m
|
| 565 |
+
[titan] 2026-01-06 21:59:40,619 - root - INFO - [34mlr: 4.9609e-05 gnorm: 9.88 [35m[ 1:42:24<1 day, 15:54:25][39m
|
| 566 |
+
[titan] 2026-01-06 22:00:22,155 - root - INFO - [31mstep: 127 [32mloss: 7.3691 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.36 [35mmfu: 46.27%[39m
|
| 567 |
+
[titan] 2026-01-06 22:00:22,156 - root - INFO - [34mlr: 5.0000e-05 gnorm: 11.88 [35m[ 1:43:06<1 day, 15:50:49][39m
|
| 568 |
+
[titan] 2026-01-06 22:01:03,694 - root - INFO - [31mstep: 128 [32mloss: 7.3331 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.35 [35mmfu: 46.27%[39m
|
| 569 |
+
[titan] 2026-01-06 22:01:03,694 - root - INFO - [34mlr: 5.0391e-05 gnorm: 11.56 [35m[ 1:43:47<1 day, 15:47:15][39m
|
| 570 |
+
[titan] 2026-01-06 22:01:03,694 - root - INFO - Saving the checkpoint (or staging if async is enabled).
|
| 571 |
+
[titan] 2026-01-06 22:01:24,083 - root - INFO - [GC] GC collection invoked by checkpointer. 0.21 seconds.
|
| 572 |
+
[titan] 2026-01-06 22:01:24,083 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 20.39 seconds.
|
| 573 |
+
[titan] 2026-01-06 22:02:05,453 - root - INFO - [31mstep: 129 [32mloss: 7.2878 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,061 [36mtflops: 97.09 [35mmfu: 31.12%[39m
|
| 574 |
+
[titan] 2026-01-06 22:02:05,454 - root - INFO - [34mlr: 5.0781e-05 gnorm: 6.16 [35m[ 1:44:49<1 day, 15:51:25][39m
|
| 575 |
+
[titan] 2026-01-06 22:02:46,875 - root - INFO - [31mstep: 130 [32mloss: 7.7017 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,582 [36mtflops: 144.76 [35mmfu: 46.40%[39m
|
| 576 |
+
[titan] 2026-01-06 22:02:46,876 - root - INFO - [34mlr: 5.1172e-05 gnorm: 70.00 [35m[ 1:45:30<1 day, 15:47:51][39m
|
| 577 |
+
[titan] 2026-01-06 22:03:28,339 - root - INFO - [31mstep: 131 [32mloss: 7.5220 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,581 [36mtflops: 144.61 [35mmfu: 46.35%[39m
|
| 578 |
+
[titan] 2026-01-06 22:03:28,339 - root - INFO - [34mlr: 5.1562e-05 gnorm: 44.75 [35m[ 1:46:12<1 day, 15:44:19][39m
|
| 579 |
+
[titan] 2026-01-06 22:04:09,859 - root - INFO - [31mstep: 132 [32mloss: 7.4566 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.41 [35mmfu: 46.29%[39m
|
| 580 |
+
[titan] 2026-01-06 22:04:09,859 - root - INFO - [34mlr: 5.1953e-05 gnorm: 13.50 [35m[ 1:46:53<1 day, 15:40:52][39m
|
| 581 |
+
[titan] 2026-01-06 22:04:51,387 - root - INFO - [31mstep: 133 [32mloss: 7.4026 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.39 [35mmfu: 46.28%[39m
|
| 582 |
+
[titan] 2026-01-06 22:04:51,387 - root - INFO - [34mlr: 5.2344e-05 gnorm: 10.12 [35m[ 1:47:35<1 day, 15:37:27][39m
|
| 583 |
+
[titan] 2026-01-06 22:05:32,919 - root - INFO - [31mstep: 134 [32mloss: 7.4092 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 584 |
+
[titan] 2026-01-06 22:05:32,920 - root - INFO - [34mlr: 5.2734e-05 gnorm: 14.88 [35m[ 1:48:16<1 day, 15:34:05][39m
|
| 585 |
+
[titan] 2026-01-06 22:06:14,471 - root - INFO - [31mstep: 135 [32mloss: 7.3827 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.30 [35mmfu: 46.25%[39m
|
| 586 |
+
[titan] 2026-01-06 22:06:14,471 - root - INFO - [34mlr: 5.3125e-05 gnorm: 18.88 [35m[ 1:48:58<1 day, 15:30:46][39m
|
| 587 |
+
[titan] 2026-01-06 22:06:56,027 - root - INFO - [31mstep: 136 [32mloss: 7.4021 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.29 [35mmfu: 46.25%[39m
|
| 588 |
+
[titan] 2026-01-06 22:06:56,027 - root - INFO - [34mlr: 5.3516e-05 gnorm: 12.81 [35m[ 1:49:39<1 day, 15:27:29][39m
|
| 589 |
+
[titan] 2026-01-06 22:07:37,581 - root - INFO - [31mstep: 137 [32mloss: 7.4064 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.30 [35mmfu: 46.25%[39m
|
| 590 |
+
[titan] 2026-01-06 22:07:37,581 - root - INFO - [34mlr: 5.3906e-05 gnorm: 7.19 [35m[ 1:50:21<1 day, 15:24:14][39m
|
| 591 |
+
[titan] 2026-01-06 22:08:19,129 - root - INFO - [31mstep: 138 [32mloss: 7.4774 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.32 [35mmfu: 46.26%[39m
|
| 592 |
+
[titan] 2026-01-06 22:08:19,129 - root - INFO - [34mlr: 5.4297e-05 gnorm: 22.62 [35m[ 1:51:03<1 day, 15:21:02][39m
|
| 593 |
+
[titan] 2026-01-06 22:09:00,687 - root - INFO - [31mstep: 139 [32mloss: 7.4281 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.28 [35mmfu: 46.24%[39m
|
| 594 |
+
[titan] 2026-01-06 22:09:00,688 - root - INFO - [34mlr: 5.4688e-05 gnorm: 11.00 [35m[ 1:51:44<1 day, 15:17:52][39m
|
| 595 |
+
[titan] 2026-01-06 22:09:42,228 - root - INFO - [31mstep: 140 [32mloss: 7.5633 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.34 [35mmfu: 46.26%[39m
|
| 596 |
+
[titan] 2026-01-06 22:09:42,228 - root - INFO - [34mlr: 5.5078e-05 gnorm: 19.75 [35m[ 1:52:26<1 day, 15:14:43][39m
|
| 597 |
+
[titan] 2026-01-06 22:10:23,790 - root - INFO - [31mstep: 141 [32mloss: 7.5423 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.27 [35mmfu: 46.24%[39m
|
| 598 |
+
[titan] 2026-01-06 22:10:23,790 - root - INFO - [34mlr: 5.5469e-05 gnorm: 17.25 [35m[ 1:53:07<1 day, 15:11:37][39m
|
| 599 |
+
[titan] 2026-01-06 22:11:05,349 - root - INFO - [31mstep: 142 [32mloss: 7.4047 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.28 [35mmfu: 46.24%[39m
|
| 600 |
+
[titan] 2026-01-06 22:11:05,349 - root - INFO - [34mlr: 5.5859e-05 gnorm: 9.94 [35m[ 1:53:49<1 day, 15:08:33][39m
|
| 601 |
+
[titan] 2026-01-06 22:11:46,905 - root - INFO - [31mstep: 143 [32mloss: 7.5261 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.29 [35mmfu: 46.25%[39m
|
| 602 |
+
[titan] 2026-01-06 22:11:46,905 - root - INFO - [34mlr: 5.6250e-05 gnorm: 25.75 [35m[ 1:54:30<1 day, 15:05:31][39m
|
| 603 |
+
[titan] 2026-01-06 22:12:28,460 - root - INFO - [31mstep: 144 [32mloss: 7.4217 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.29 [35mmfu: 46.25%[39m
|
| 604 |
+
[titan] 2026-01-06 22:12:28,461 - root - INFO - [34mlr: 5.6641e-05 gnorm: 18.00 [35m[ 1:55:12<1 day, 15:02:31][39m
|
| 605 |
+
[titan] 2026-01-06 22:12:28,461 - root - INFO - Saving the checkpoint (or staging if async is enabled).
|
| 606 |
+
[titan] 2026-01-06 22:12:50,153 - root - INFO - [GC] GC collection invoked by checkpointer. 0.17 seconds.
|
| 607 |
+
[titan] 2026-01-06 22:12:50,153 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 21.69 seconds.
|
| 608 |
+
[titan] 2026-01-06 22:13:31,510 - root - INFO - [31mstep: 145 [32mloss: 7.3958 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,039 [36mtflops: 95.10 [35mmfu: 30.48%[39m
|
| 609 |
+
[titan] 2026-01-06 22:13:31,510 - root - INFO - [34mlr: 5.7031e-05 gnorm: 11.69 [35m[ 1:56:15<1 day, 15:06:47][39m
|
| 610 |
+
[titan] 2026-01-06 22:14:12,944 - root - INFO - [31mstep: 146 [32mloss: 7.4073 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,582 [36mtflops: 144.71 [35mmfu: 46.38%[39m
|
| 611 |
+
[titan] 2026-01-06 22:14:12,944 - root - INFO - [34mlr: 5.7422e-05 gnorm: 11.25 [35m[ 1:56:56<1 day, 15:03:45][39m
|
| 612 |
+
[titan] 2026-01-06 22:14:54,370 - root - INFO - [31mstep: 147 [32mloss: 7.3301 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,582 [36mtflops: 144.74 [35mmfu: 46.39%[39m
|
| 613 |
+
[titan] 2026-01-06 22:14:54,371 - root - INFO - [34mlr: 5.7813e-05 gnorm: 7.34 [35m[ 1:57:38<1 day, 15:00:45][39m
|
| 614 |
+
[titan] 2026-01-06 22:15:35,825 - root - INFO - [31mstep: 148 [32mloss: 7.3624 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,581 [36mtflops: 144.64 [35mmfu: 46.36%[39m
|
| 615 |
+
[titan] 2026-01-06 22:15:35,825 - root - INFO - [34mlr: 5.8203e-05 gnorm: 17.38 [35m[ 1:58:19<1 day, 14:57:48][39m
|
| 616 |
+
[titan] 2026-01-06 22:16:17,356 - root - INFO - [31mstep: 149 [32mloss: 7.2913 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 617 |
+
[titan] 2026-01-06 22:16:17,357 - root - INFO - [34mlr: 5.8594e-05 gnorm: 3.80 [35m[ 1:59:01<1 day, 14:54:53][39m
|
| 618 |
+
[titan] 2026-01-06 22:16:17,392 - root - INFO - [GC] Peforming periodical GC collection. 0.04 seconds.
|
| 619 |
+
[titan] 2026-01-06 22:16:58,923 - root - INFO - [31mstep: 150 [32mloss: 7.3146 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.25 [35mmfu: 46.23%[39m
|
| 620 |
+
[titan] 2026-01-06 22:16:58,923 - root - INFO - [34mlr: 5.8984e-05 gnorm: 7.06 [35m[ 1:59:42<1 day, 14:52:01][39m
|
logs/none_4cvjdbqa/attempt_0/4/stdout.log
ADDED
|
File without changes
|
logs/none_4cvjdbqa/attempt_0/5/stderr.log
ADDED
|
@@ -0,0 +1,620 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[titan] 2026-01-06 20:23:28,614 - root - INFO - Starting job: default job
|
| 2 |
+
[titan] 2026-01-06 20:23:28,614 - root - INFO - [32m{
|
| 3 |
+
"activation_checkpoint": {
|
| 4 |
+
"mode": "none",
|
| 5 |
+
"selective_ac_option": "2"
|
| 6 |
+
},
|
| 7 |
+
"activation_offload": {
|
| 8 |
+
"mode": "none"
|
| 9 |
+
},
|
| 10 |
+
"checkpoint": {
|
| 11 |
+
"async_mode": "disabled",
|
| 12 |
+
"create_seed_checkpoint": false,
|
| 13 |
+
"enable_checkpoint": true,
|
| 14 |
+
"exclude_from_loading": [],
|
| 15 |
+
"export_dtype": "float32",
|
| 16 |
+
"folder": "checkpoint",
|
| 17 |
+
"initial_load_model_weights_only": true,
|
| 18 |
+
"initial_load_path": null,
|
| 19 |
+
"interval": 16,
|
| 20 |
+
"interval_type": "steps",
|
| 21 |
+
"keep_latest_k": 0,
|
| 22 |
+
"last_save_model_weights_only": false,
|
| 23 |
+
"load_step": -1,
|
| 24 |
+
"model_weights_only": false
|
| 25 |
+
},
|
| 26 |
+
"comm": {
|
| 27 |
+
"init_timeout_seconds": 300,
|
| 28 |
+
"trace_buf_size": 20000,
|
| 29 |
+
"train_timeout_seconds": 100
|
| 30 |
+
},
|
| 31 |
+
"experimental": {
|
| 32 |
+
"context_parallel_degree": 1,
|
| 33 |
+
"context_parallel_rotate_method": "allgather",
|
| 34 |
+
"custom_model_path": "",
|
| 35 |
+
"enable_async_tensor_parallel": false,
|
| 36 |
+
"enable_compiled_autograd": false,
|
| 37 |
+
"pipeline_parallel_degree": 1,
|
| 38 |
+
"pipeline_parallel_microbatches": null,
|
| 39 |
+
"pipeline_parallel_schedule": "1F1B",
|
| 40 |
+
"pipeline_parallel_schedule_csv": "",
|
| 41 |
+
"pipeline_parallel_split_points": []
|
| 42 |
+
},
|
| 43 |
+
"fault_tolerance": {
|
| 44 |
+
"enable": false,
|
| 45 |
+
"group_size": 0,
|
| 46 |
+
"min_replica_size": 1,
|
| 47 |
+
"replica_id": 0
|
| 48 |
+
},
|
| 49 |
+
"float8": {
|
| 50 |
+
"enable_fsdp_float8_all_gather": false,
|
| 51 |
+
"force_recompute_fp8_weight_in_bwd": false,
|
| 52 |
+
"precompute_float8_dynamic_scale_for_fsdp": false,
|
| 53 |
+
"recipe_name": null
|
| 54 |
+
},
|
| 55 |
+
"job": {
|
| 56 |
+
"config_file": "flame/models/fla.toml",
|
| 57 |
+
"description": "default job",
|
| 58 |
+
"dump_folder": "/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B",
|
| 59 |
+
"print_args": true,
|
| 60 |
+
"use_for_integration_test": false
|
| 61 |
+
},
|
| 62 |
+
"lr_scheduler": {
|
| 63 |
+
"decay_ratio": null,
|
| 64 |
+
"decay_type": "cosine",
|
| 65 |
+
"lr_min": 0.1,
|
| 66 |
+
"warmup_steps": 1024
|
| 67 |
+
},
|
| 68 |
+
"memory_estimation": {
|
| 69 |
+
"disable_fake_mode": false,
|
| 70 |
+
"enabled": false
|
| 71 |
+
},
|
| 72 |
+
"metrics": {
|
| 73 |
+
"disable_color_printing": false,
|
| 74 |
+
"enable_tensorboard": false,
|
| 75 |
+
"enable_wandb": true,
|
| 76 |
+
"log_freq": 1,
|
| 77 |
+
"save_for_all_ranks": false,
|
| 78 |
+
"save_tb_folder": "tb"
|
| 79 |
+
},
|
| 80 |
+
"model": {
|
| 81 |
+
"config": "/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/config.json",
|
| 82 |
+
"converters": [],
|
| 83 |
+
"name": "fla",
|
| 84 |
+
"print_after_conversion": false,
|
| 85 |
+
"tokenizer_path": "/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B"
|
| 86 |
+
},
|
| 87 |
+
"optimizer": {
|
| 88 |
+
"beta1": 0.9,
|
| 89 |
+
"beta2": 0.95,
|
| 90 |
+
"early_step_in_backward": false,
|
| 91 |
+
"eps": 1e-15,
|
| 92 |
+
"implementation": "fused",
|
| 93 |
+
"lr": 0.0004,
|
| 94 |
+
"name": "AdamW",
|
| 95 |
+
"weight_decay": 0.1
|
| 96 |
+
},
|
| 97 |
+
"profiling": {
|
| 98 |
+
"enable_memory_snapshot": false,
|
| 99 |
+
"enable_profiling": true,
|
| 100 |
+
"profile_freq": 512,
|
| 101 |
+
"save_memory_snapshot_folder": "memory_snapshot",
|
| 102 |
+
"save_traces_folder": "profile_trace"
|
| 103 |
+
},
|
| 104 |
+
"training": {
|
| 105 |
+
"batch_size": 2,
|
| 106 |
+
"compile": true,
|
| 107 |
+
"context_len": 2048,
|
| 108 |
+
"data_dir": null,
|
| 109 |
+
"data_files": null,
|
| 110 |
+
"data_parallel_replicate_degree": 1,
|
| 111 |
+
"data_parallel_shard_degree": 8,
|
| 112 |
+
"data_probs": null,
|
| 113 |
+
"dataset": "/mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu",
|
| 114 |
+
"dataset_name": "default",
|
| 115 |
+
"dataset_split": "train",
|
| 116 |
+
"deterministic": false,
|
| 117 |
+
"disable_loss_parallel": true,
|
| 118 |
+
"enable_cpu_offload": false,
|
| 119 |
+
"fsdp_reshard_after_forward": "default",
|
| 120 |
+
"gc_freq": 50,
|
| 121 |
+
"gradient_accumulation_steps": 16,
|
| 122 |
+
"max_norm": 1.0,
|
| 123 |
+
"mixed_precision_param": "bfloat16",
|
| 124 |
+
"mixed_precision_reduce": "float32",
|
| 125 |
+
"num_workers": 8,
|
| 126 |
+
"persistent_workers": false,
|
| 127 |
+
"pin_memory": false,
|
| 128 |
+
"prefetch_factor": 2,
|
| 129 |
+
"seed": 42,
|
| 130 |
+
"seq_len": 2048,
|
| 131 |
+
"skip_nan_inf": true,
|
| 132 |
+
"steps": 3072,
|
| 133 |
+
"streaming": true,
|
| 134 |
+
"tensor_parallel_degree": 1,
|
| 135 |
+
"varlen": false
|
| 136 |
+
}
|
| 137 |
+
}[39m
|
| 138 |
+
[titan] 2026-01-06 20:23:28,615 - root - INFO - [GC] Initial GC collection. 0.00 seconds.
|
| 139 |
+
[titan] 2026-01-06 20:23:29,968 - root - WARNING - ENV[TORCH_NCCL_ASYNC_ERROR_HANDLING] = 1 will be overridden to 3 based on job config
|
| 140 |
+
[titan] 2026-01-06 20:23:29,972 - root - INFO - CUDA capacity: NVIDIA A100-SXM4-80GB with 79.25GiB memory
|
| 141 |
+
[titan] 2026-01-06 20:23:29,974 - root - WARNING - Error running lspci: [Errno 2] No such file or directory: 'lspci', fallback to use device_name
|
| 142 |
+
[titan] 2026-01-06 20:23:29,974 - root - INFO - Peak FLOPS used for computing MFU: 3.120e+14
|
| 143 |
+
[titan] 2026-01-06 20:23:29,974 - root - INFO - Building 1-D device mesh with ['dp_shard'], [8]
|
| 144 |
+
[titan] 2026-01-06 20:23:30,052 - root - INFO - Loading tokenizer...
|
| 145 |
+
The tokenizer you are loading from '/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.
|
| 146 |
+
[titan] 2026-01-06 20:23:30,412 - root - INFO - Qwen2TokenizerFast(name_or_path='/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B', vocab_size=151643, model_max_length=10000000000, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'eos_token': '<|im_end|>', 'pad_token': '<|endoftext|>', 'additional_special_tokens': ['<|im_start|>', '<|im_end|>', '<|object_ref_start|>', '<|object_ref_end|>', '<|box_start|>', '<|box_end|>', '<|quad_start|>', '<|quad_end|>', '<|vision_start|>', '<|vision_end|>', '<|vision_pad|>', '<|image_pad|>', '<|video_pad|>']}, clean_up_tokenization_spaces=False, added_tokens_decoder={
|
| 147 |
+
151643: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 148 |
+
151644: AddedToken("<|im_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 149 |
+
151645: AddedToken("<|im_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 150 |
+
151646: AddedToken("<|object_ref_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 151 |
+
151647: AddedToken("<|object_ref_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 152 |
+
151648: AddedToken("<|box_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 153 |
+
151649: AddedToken("<|box_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 154 |
+
151650: AddedToken("<|quad_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 155 |
+
151651: AddedToken("<|quad_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 156 |
+
151652: AddedToken("<|vision_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 157 |
+
151653: AddedToken("<|vision_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 158 |
+
151654: AddedToken("<|vision_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 159 |
+
151655: AddedToken("<|image_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 160 |
+
151656: AddedToken("<|video_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 161 |
+
151657: AddedToken("<tool_call>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 162 |
+
151658: AddedToken("</tool_call>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 163 |
+
151659: AddedToken("<|fim_prefix|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 164 |
+
151660: AddedToken("<|fim_middle|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 165 |
+
151661: AddedToken("<|fim_suffix|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 166 |
+
151662: AddedToken("<|fim_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 167 |
+
151663: AddedToken("<|repo_name|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 168 |
+
151664: AddedToken("<|file_sep|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 169 |
+
151665: AddedToken("<tool_response>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 170 |
+
151666: AddedToken("</tool_response>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 171 |
+
151667: AddedToken("<think>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 172 |
+
151668: AddedToken("</think>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 173 |
+
}
|
| 174 |
+
)
|
| 175 |
+
[titan] 2026-01-06 20:23:30,412 - root - INFO - Loading dataset /mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu:default
|
| 176 |
+
`trust_remote_code` is not supported anymore.
|
| 177 |
+
Please check that the Hugging Face dataset '/mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu' isn't based on a loading script and remove `trust_remote_code`.
|
| 178 |
+
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
|
| 179 |
+
[titan] 2026-01-06 20:23:30,412 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
|
| 180 |
+
Please check that the Hugging Face dataset '/mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu' isn't based on a loading script and remove `trust_remote_code`.
|
| 181 |
+
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
|
| 182 |
+
[titan] 2026-01-06 20:23:30,977 - root - INFO - Shuffling the dataset with seed 42
|
| 183 |
+
[titan] 2026-01-06 20:23:30,978 - root - INFO - IterableDataset({
|
| 184 |
+
features: ['text', 'id', 'dump', 'url', 'file_path', 'language', 'language_score', 'token_count', 'score', 'int_score'],
|
| 185 |
+
num_shards: 360
|
| 186 |
+
})
|
| 187 |
+
[titan] 2026-01-06 20:23:30,978 - root - INFO - Building dataloader...
|
| 188 |
+
[titan] 2026-01-06 20:23:30,980 - root - INFO - Loading model config from /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/config.json
|
| 189 |
+
[titan] 2026-01-06 20:23:30,981 - root - INFO - Building model from the config
|
| 190 |
+
[32mGSAConfig {
|
| 191 |
+
"architectures": [
|
| 192 |
+
"GSAForCausalLM"
|
| 193 |
+
],
|
| 194 |
+
"attn": null,
|
| 195 |
+
"bos_token_id": 151643,
|
| 196 |
+
"clamp_max": null,
|
| 197 |
+
"clamp_min": null,
|
| 198 |
+
"conv_size": 4,
|
| 199 |
+
"dtype": "bfloat16",
|
| 200 |
+
"elementwise_affine": false,
|
| 201 |
+
"eos_token_id": 151645,
|
| 202 |
+
"expand_k": 1,
|
| 203 |
+
"expand_v": 1,
|
| 204 |
+
"feature_map": "swish",
|
| 205 |
+
"fuse_cross_entropy": true,
|
| 206 |
+
"fuse_linear_cross_entropy": false,
|
| 207 |
+
"fuse_norm": true,
|
| 208 |
+
"fuse_swiglu": true,
|
| 209 |
+
"gate_logit_normalizer": 8,
|
| 210 |
+
"hidden_act": "swish",
|
| 211 |
+
"hidden_ratio": 4,
|
| 212 |
+
"hidden_size": 5120,
|
| 213 |
+
"initializer_range": 0.02,
|
| 214 |
+
"intermediate_size": 17408,
|
| 215 |
+
"max_position_embeddings": 40960,
|
| 216 |
+
"model_type": "gsa",
|
| 217 |
+
"norm_eps": 1e-06,
|
| 218 |
+
"num_heads": 40,
|
| 219 |
+
"num_hidden_layers": 40,
|
| 220 |
+
"num_kv_heads": 8,
|
| 221 |
+
"num_slots": 256,
|
| 222 |
+
"rope_theta": 1000000,
|
| 223 |
+
"share_conv_kernel": true,
|
| 224 |
+
"tie_word_embeddings": true,
|
| 225 |
+
"transformers_version": "4.57.3",
|
| 226 |
+
"use_cache": true,
|
| 227 |
+
"use_l2warp": false,
|
| 228 |
+
"use_norm": true,
|
| 229 |
+
"use_output_gate": true,
|
| 230 |
+
"use_rope": false,
|
| 231 |
+
"use_short_conv": false,
|
| 232 |
+
"vocab_size": 151936
|
| 233 |
+
}
|
| 234 |
+
[39m
|
| 235 |
+
[titan] 2026-01-06 20:23:31,129 - root - INFO - [34m
|
| 236 |
+
GSAForCausalLM(
|
| 237 |
+
(model): GSAModel(
|
| 238 |
+
(embeddings): Embedding(151936, 5120)
|
| 239 |
+
(layers): ModuleList(
|
| 240 |
+
(0-39): 40 x GSABlock(
|
| 241 |
+
(attn_norm): RMSNorm(5120, eps=1e-06)
|
| 242 |
+
(attn): GatedSlotAttention(
|
| 243 |
+
(feature_map): SwishFeatureMap()
|
| 244 |
+
(q_proj): Linear(in_features=5120, out_features=5120, bias=False)
|
| 245 |
+
(k_proj): Linear(in_features=5120, out_features=1024, bias=False)
|
| 246 |
+
(v_proj): Linear(in_features=5120, out_features=1024, bias=False)
|
| 247 |
+
(f_proj): Linear(in_features=5120, out_features=2048, bias=False)
|
| 248 |
+
(g_norm): RMSNorm(5120, elementwise_affine=False, eps=1e-06)
|
| 249 |
+
(o_proj): Linear(in_features=5120, out_features=5120, bias=False)
|
| 250 |
+
)
|
| 251 |
+
(mlp_norm): RMSNorm(5120, eps=1e-06)
|
| 252 |
+
(mlp): GatedMLP(
|
| 253 |
+
(gate_proj): Linear(in_features=5120, out_features=17408, bias=False)
|
| 254 |
+
(up_proj): Linear(in_features=5120, out_features=17408, bias=False)
|
| 255 |
+
(down_proj): Linear(in_features=17408, out_features=5120, bias=False)
|
| 256 |
+
(swiglu_linear): SwiGLULinear()
|
| 257 |
+
)
|
| 258 |
+
)
|
| 259 |
+
)
|
| 260 |
+
(norm): RMSNorm(5120, eps=1e-06)
|
| 261 |
+
)
|
| 262 |
+
(lm_head): Linear(in_features=5120, out_features=151936, bias=False)
|
| 263 |
+
)[39m
|
| 264 |
+
|
| 265 |
+
[titan] 2026-01-06 20:23:31,186 - root - INFO - Compiling each block with torch.compile
|
| 266 |
+
[titan] 2026-01-06 20:23:31,186 - root - INFO - Compiling the embedding, norm, and lm_head layers with torch.compile
|
| 267 |
+
[titan] 2026-01-06 20:23:31,187 - root - INFO - Compiling the entire model with torch.compile
|
| 268 |
+
[titan] 2026-01-06 20:23:31,333 - root - INFO - Applied FSDP to the model
|
| 269 |
+
[titan] 2026-01-06 20:23:31,716 - root - INFO - CUDA memory usage for model: 3.56GiB(4.49%)
|
| 270 |
+
[titan] 2026-01-06 20:23:31,763 - root - INFO - Checkpointing active. Checkpoints will be loaded from and saved to /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/checkpoint
|
| 271 |
+
[titan] 2026-01-06 20:23:31,763 - root - INFO - Loading the checkpoint from /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/checkpoint/step-1.
|
| 272 |
+
[titan] 2026-01-06 20:24:20,481 - root - INFO - [GC] GC collection for checkpoint loading. 0.02 seconds.
|
| 273 |
+
[titan] 2026-01-06 20:24:20,481 - root - INFO - Finished loading the checkpoint in 48.72 seconds.
|
| 274 |
+
[titan] 2026-01-06 20:24:20,697 - root - INFO - CUDA capacity: NVIDIA A100-SXM4-80GB with 79.25GiB memory
|
| 275 |
+
[titan] 2026-01-06 20:24:20,699 - root - WARNING - Error running lspci: [Errno 2] No such file or directory: 'lspci', fallback to use device_name
|
| 276 |
+
[titan] 2026-01-06 20:24:23,481 - root - INFO - Mixed precision training is handled by fully_shard
|
| 277 |
+
[titan] 2026-01-06 20:24:23,481 - root - INFO - [31m***** Running training *****[39m
|
| 278 |
+
[titan] 2026-01-06 20:24:23,481 - root - INFO - [32m Training starts at step 2
|
| 279 |
+
[titan] 2026-01-06 20:24:23,481 - root - INFO - [32m Number of tokens per sequence = 2,048
|
| 280 |
+
[titan] 2026-01-06 20:24:23,481 - root - INFO - [32m Gradient Accumulation steps = 16
|
| 281 |
+
[titan] 2026-01-06 20:24:23,481 - root - INFO - [32m Instantaneous batch size (per device) = 2
|
| 282 |
+
[titan] 2026-01-06 20:24:23,482 - root - INFO - [32m Global batch size (w. parallel, distributed & accumulation) = 256 (524,288 tokens)
|
| 283 |
+
[titan] 2026-01-06 20:24:23,482 - root - INFO - [32m Total optimization steps = 3,072 (1,610,612,736 tokens)
|
| 284 |
+
[titan] 2026-01-06 20:24:23,482 - root - INFO - [32m Warmup steps = 1,024 (536,870,912 tokens)
|
| 285 |
+
[titan] 2026-01-06 20:24:23,482 - root - INFO - [32m Number of parameters = 14,409,815,040 [39m
|
| 286 |
+
[titan] 2026-01-06 20:24:23,482 - root - INFO - Profiling active. Traces will be saved at /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/profile_trace
|
| 287 |
+
/mnt/scratch/home/carlos/leman-labs/flame/.venv/lib/python3.13/site-packages/torch/_dynamo/variables/functions.py:1692: UserWarning: Dynamo detected a call to a `functools.lru_cache`-wrapped function. Dynamo ignores the cache wrapper and directly traces the wrapped function. Silent incorrectness is only a *potential* risk, not something we have observed. Enable TORCH_LOGS="+dynamo" for a DEBUG stack trace.
|
| 288 |
+
torch._dynamo.utils.warn_once(msg)
|
| 289 |
+
/mnt/scratch/home/carlos/leman-labs/flame/.venv/lib/python3.13/site-packages/torch/_dynamo/variables/functions.py:1598: UserWarning: Dynamo does not know how to trace the builtin `cuda_utils.get_device_properties.` This function is either a Python builtin (e.g. _warnings.warn) or a third-party C/C++ Python extension (perhaps created with pybind).
|
| 290 |
+
If it is a Python builtin, please file an issue on GitHub so the PyTorch team can add support for it and see the next case for a workaround.
|
| 291 |
+
If it is a third-party C/C++ Python extension, please either wrap it into a PyTorch-understood custom operator (see https://pytorch.org/tutorials/advanced/custom_ops_landing_page.html for more details) or, if it is traceable, use `torch.compiler.allow_in_graph`.
|
| 292 |
+
torch._dynamo.utils.warn_once(explanation + "\n" + "\n".join(hints))
|
| 293 |
+
[titan] 2026-01-06 20:31:17,558 - root - INFO - [31mstep: 2 [32mloss: 14.3989 [33mmemory: 71.94GiB(90.77%) [34mtps: 157 [36mtflops: 14.38 [35mmfu: 4.61%[39m
|
| 294 |
+
[titan] 2026-01-06 20:31:17,558 - root - INFO - [34mlr: 1.1719e-06 gnorm: 127.00 [35m[ 0:14:01<14 days, 22:48:36][39m
|
| 295 |
+
[titan] 2026-01-06 20:31:58,854 - root - INFO - [31mstep: 3 [32mloss: 14.3925 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,587 [36mtflops: 145.20 [35mmfu: 46.54%[39m
|
| 296 |
+
[titan] 2026-01-06 20:31:58,854 - root - INFO - [34mlr: 1.5625e-06 gnorm: 126.00 [35m[ 0:14:42<10 days, 10:51:49][39m
|
| 297 |
+
[titan] 2026-01-06 20:32:40,204 - root - INFO - [31mstep: 4 [32mloss: 14.2932 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,585 [36mtflops: 145.01 [35mmfu: 46.48%[39m
|
| 298 |
+
[titan] 2026-01-06 20:32:40,205 - root - INFO - [34mlr: 1.9531e-06 gnorm: 125.50 [35m[ 0:15:24<8 days, 4:53:46][39m
|
| 299 |
+
[titan] 2026-01-06 20:33:21,589 - root - INFO - [31mstep: 5 [32mloss: 14.2679 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,584 [36mtflops: 144.89 [35mmfu: 46.44%[39m
|
| 300 |
+
[titan] 2026-01-06 20:33:21,589 - root - INFO - [34mlr: 2.3438e-06 gnorm: 123.50 [35m[ 0:16:05<6 days, 20:31:01][39m
|
| 301 |
+
[titan] 2026-01-06 20:34:03,035 - root - INFO - [31mstep: 6 [32mloss: 13.9921 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,581 [36mtflops: 144.67 [35mmfu: 46.37%[39m
|
| 302 |
+
[titan] 2026-01-06 20:34:03,035 - root - INFO - [34mlr: 2.7344e-06 gnorm: 117.50 [35m[ 0:16:46<5 days, 22:56:08][39m
|
| 303 |
+
[titan] 2026-01-06 20:34:44,524 - root - INFO - [31mstep: 7 [32mloss: 13.8102 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,580 [36mtflops: 144.52 [35mmfu: 46.32%[39m
|
| 304 |
+
[titan] 2026-01-06 20:34:44,524 - root - INFO - [34mlr: 3.1250e-06 gnorm: 112.50 [35m[ 0:17:28<5 days, 7:31:21][39m
|
| 305 |
+
[titan] 2026-01-06 20:35:25,989 - root - INFO - [31mstep: 8 [32mloss: 13.5609 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,581 [36mtflops: 144.60 [35mmfu: 46.35%[39m
|
| 306 |
+
[titan] 2026-01-06 20:35:25,990 - root - INFO - [34mlr: 3.5156e-06 gnorm: 106.50 [35m[ 0:18:09<4 days, 19:57:26][39m
|
| 307 |
+
[titan] 2026-01-06 20:36:07,480 - root - INFO - [31mstep: 9 [32mloss: 13.3683 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,580 [36mtflops: 144.52 [35mmfu: 46.32%[39m
|
| 308 |
+
[titan] 2026-01-06 20:36:07,480 - root - INFO - [34mlr: 3.9063e-06 gnorm: 101.00 [35m[ 0:18:51<4 days, 10:57:42][39m
|
| 309 |
+
[titan] 2026-01-06 20:36:48,975 - root - INFO - [31mstep: 10 [32mloss: 13.1018 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.50 [35mmfu: 46.32%[39m
|
| 310 |
+
[titan] 2026-01-06 20:36:48,975 - root - INFO - [34mlr: 4.2969e-06 gnorm: 94.00 [35m[ 0:19:32<4 days, 3:45:48][39m
|
| 311 |
+
[titan] 2026-01-06 20:37:30,471 - root - INFO - [31mstep: 11 [32mloss: 12.5407 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.50 [35mmfu: 46.31%[39m
|
| 312 |
+
[titan] 2026-01-06 20:37:30,471 - root - INFO - [34mlr: 4.6875e-06 gnorm: 82.00 [35m[ 0:20:14<3 days, 21:52:19][39m
|
| 313 |
+
[titan] 2026-01-06 20:38:11,960 - root - INFO - [31mstep: 12 [32mloss: 12.0106 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,580 [36mtflops: 144.52 [35mmfu: 46.32%[39m
|
| 314 |
+
[titan] 2026-01-06 20:38:11,960 - root - INFO - [34mlr: 5.0781e-06 gnorm: 71.50 [35m[ 0:20:55<3 days, 16:57:36][39m
|
| 315 |
+
[titan] 2026-01-06 20:38:53,462 - root - INFO - [31mstep: 13 [32mloss: 11.5957 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.48 [35mmfu: 46.31%[39m
|
| 316 |
+
[titan] 2026-01-06 20:38:53,463 - root - INFO - [34mlr: 5.4687e-06 gnorm: 68.00 [35m[ 0:21:37<3 days, 12:48:10][39m
|
| 317 |
+
[titan] 2026-01-06 20:39:34,955 - root - INFO - [31mstep: 14 [32mloss: 11.2380 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.51 [35mmfu: 46.32%[39m
|
| 318 |
+
[titan] 2026-01-06 20:39:34,955 - root - INFO - [34mlr: 5.8594e-06 gnorm: 63.25 [35m[ 0:22:18<3 days, 9:14:14][39m
|
| 319 |
+
[titan] 2026-01-06 20:40:16,456 - root - INFO - [31mstep: 15 [32mloss: 10.9153 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.48 [35mmfu: 46.31%[39m
|
| 320 |
+
[titan] 2026-01-06 20:40:16,457 - root - INFO - [34mlr: 6.2500e-06 gnorm: 55.50 [35m[ 0:23:00<3 days, 6:08:46][39m
|
| 321 |
+
[titan] 2026-01-06 20:40:57,973 - root - INFO - [31mstep: 16 [32mloss: 10.6864 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.42 [35mmfu: 46.29%[39m
|
| 322 |
+
[titan] 2026-01-06 20:40:57,974 - root - INFO - [34mlr: 6.6406e-06 gnorm: 57.00 [35m[ 0:23:41<3 days, 3:26:26][39m
|
| 323 |
+
[titan] 2026-01-06 20:40:57,974 - root - INFO - Saving the checkpoint (or staging if async is enabled).
|
| 324 |
+
[titan] 2026-01-06 20:41:27,037 - root - INFO - [GC] GC collection invoked by checkpointer. 0.60 seconds.
|
| 325 |
+
[titan] 2026-01-06 20:41:27,037 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 29.06 seconds.
|
| 326 |
+
[titan] 2026-01-06 20:42:08,985 - root - INFO - [31mstep: 17 [32mloss: 10.3828 [33mmemory: 71.94GiB(90.77%) [34mtps: 923 [36mtflops: 84.44 [35mmfu: 27.06%[39m
|
| 327 |
+
[titan] 2026-01-06 20:42:08,986 - root - INFO - [34mlr: 7.0313e-06 gnorm: 42.50 [35m[ 0:24:52<3 days, 2:31:28][39m
|
| 328 |
+
[titan] 2026-01-06 20:42:50,422 - root - INFO - [31mstep: 18 [32mloss: 10.1659 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,582 [36mtflops: 144.71 [35mmfu: 46.38%[39m
|
| 329 |
+
[titan] 2026-01-06 20:42:50,422 - root - INFO - [34mlr: 7.4219e-06 gnorm: 32.50 [35m[ 0:25:34<3 days, 0:18:50][39m
|
| 330 |
+
[titan] 2026-01-06 20:43:31,924 - root - INFO - [31mstep: 19 [32mloss: 9.9749 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.47 [35mmfu: 46.31%[39m
|
| 331 |
+
[titan] 2026-01-06 20:43:31,924 - root - INFO - [34mlr: 7.8125e-06 gnorm: 26.88 [35m[ 0:26:15<2 days, 22:20:17][39m
|
| 332 |
+
[titan] 2026-01-06 20:44:13,451 - root - INFO - [31mstep: 20 [32mloss: 9.8084 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.39 [35mmfu: 46.28%[39m
|
| 333 |
+
[titan] 2026-01-06 20:44:13,451 - root - INFO - [34mlr: 8.2031e-06 gnorm: 25.62 [35m[ 0:26:57<2 days, 20:33:34][39m
|
| 334 |
+
[titan] 2026-01-06 20:44:54,967 - root - INFO - [31mstep: 21 [32mloss: 9.6201 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.43 [35mmfu: 46.29%[39m
|
| 335 |
+
[titan] 2026-01-06 20:44:54,968 - root - INFO - [34mlr: 8.5938e-06 gnorm: 26.88 [35m[ 0:27:38<2 days, 18:56:56][39m
|
| 336 |
+
[titan] 2026-01-06 20:45:36,491 - root - INFO - [31mstep: 22 [32mloss: 9.4905 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.40 [35mmfu: 46.28%[39m
|
| 337 |
+
[titan] 2026-01-06 20:45:36,491 - root - INFO - [34mlr: 8.9844e-06 gnorm: 25.50 [35m[ 0:28:20<2 days, 17:29:02][39m
|
| 338 |
+
[titan] 2026-01-06 20:46:18,035 - root - INFO - [31mstep: 23 [32mloss: 9.2526 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.33 [35mmfu: 46.26%[39m
|
| 339 |
+
[titan] 2026-01-06 20:46:18,035 - root - INFO - [34mlr: 9.3750e-06 gnorm: 19.12 [35m[ 0:29:01<2 days, 16:08:45][39m
|
| 340 |
+
[titan] 2026-01-06 20:46:59,563 - root - INFO - [31mstep: 24 [32mloss: 9.0528 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.39 [35mmfu: 46.28%[39m
|
| 341 |
+
[titan] 2026-01-06 20:46:59,563 - root - INFO - [34mlr: 9.7656e-06 gnorm: 17.00 [35m[ 0:29:43<2 days, 14:55:05][39m
|
| 342 |
+
[titan] 2026-01-06 20:47:41,099 - root - INFO - [31mstep: 25 [32mloss: 8.8601 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.36 [35mmfu: 46.27%[39m
|
| 343 |
+
[titan] 2026-01-06 20:47:41,099 - root - INFO - [34mlr: 1.0156e-05 gnorm: 14.06 [35m[ 0:30:25<2 days, 13:47:16][39m
|
| 344 |
+
[titan] 2026-01-06 20:48:22,630 - root - INFO - [31mstep: 26 [32mloss: 8.7360 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.38 [35mmfu: 46.27%[39m
|
| 345 |
+
[titan] 2026-01-06 20:48:22,630 - root - INFO - [34mlr: 1.0547e-05 gnorm: 15.44 [35m[ 0:31:06<2 days, 12:44:36][39m
|
| 346 |
+
[titan] 2026-01-06 20:49:04,178 - root - INFO - [31mstep: 27 [32mloss: 8.6182 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.31 [35mmfu: 46.25%[39m
|
| 347 |
+
[titan] 2026-01-06 20:49:04,178 - root - INFO - [34mlr: 1.0937e-05 gnorm: 10.25 [35m[ 0:31:48<2 days, 11:46:33][39m
|
| 348 |
+
[titan] 2026-01-06 20:49:45,725 - root - INFO - [31mstep: 28 [32mloss: 8.5142 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.32 [35mmfu: 46.26%[39m
|
| 349 |
+
[titan] 2026-01-06 20:49:45,725 - root - INFO - [34mlr: 1.1328e-05 gnorm: 9.00 [35m[ 0:32:29<2 days, 10:52:36][39m
|
| 350 |
+
[titan] 2026-01-06 20:50:27,274 - root - INFO - [31mstep: 29 [32mloss: 8.4770 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.31 [35mmfu: 46.25%[39m
|
| 351 |
+
[titan] 2026-01-06 20:50:27,274 - root - INFO - [34mlr: 1.1719e-05 gnorm: 9.44 [35m[ 0:33:11<2 days, 10:02:20][39m
|
| 352 |
+
[titan] 2026-01-06 20:51:08,813 - root - INFO - [31mstep: 30 [32mloss: 8.3888 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.35 [35mmfu: 46.27%[39m
|
| 353 |
+
[titan] 2026-01-06 20:51:08,813 - root - INFO - [34mlr: 1.2109e-05 gnorm: 7.06 [35m[ 0:33:52<2 days, 9:15:21][39m
|
| 354 |
+
[titan] 2026-01-06 20:51:50,370 - root - INFO - [31mstep: 31 [32mloss: 8.3098 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.29 [35mmfu: 46.25%[39m
|
| 355 |
+
[titan] 2026-01-06 20:51:50,370 - root - INFO - [34mlr: 1.2500e-05 gnorm: 5.38 [35m[ 0:34:34<2 days, 8:31:23][39m
|
| 356 |
+
[titan] 2026-01-06 20:52:31,910 - root - INFO - [31mstep: 32 [32mloss: 8.2507 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.35 [35mmfu: 46.26%[39m
|
| 357 |
+
[titan] 2026-01-06 20:52:31,910 - root - INFO - [34mlr: 1.2891e-05 gnorm: 6.97 [35m[ 0:35:15<2 days, 7:50:05][39m
|
| 358 |
+
[titan] 2026-01-06 20:52:31,910 - root - INFO - Saving the checkpoint (or staging if async is enabled).
|
| 359 |
+
[titan] 2026-01-06 20:52:52,195 - root - INFO - [GC] GC collection invoked by checkpointer. 0.20 seconds.
|
| 360 |
+
[titan] 2026-01-06 20:52:52,195 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 20.29 seconds.
|
| 361 |
+
[titan] 2026-01-06 20:53:33,590 - root - INFO - [31mstep: 33 [32mloss: 8.1782 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,063 [36mtflops: 97.21 [35mmfu: 31.16%[39m
|
| 362 |
+
[titan] 2026-01-06 20:53:33,591 - root - INFO - [34mlr: 1.3281e-05 gnorm: 4.94 [35m[ 0:36:17<2 days, 7:42:10][39m
|
| 363 |
+
[titan] 2026-01-06 20:54:15,059 - root - INFO - [31mstep: 34 [32mloss: 8.1399 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,580 [36mtflops: 144.59 [35mmfu: 46.34%[39m
|
| 364 |
+
[titan] 2026-01-06 20:54:15,059 - root - INFO - [34mlr: 1.3672e-05 gnorm: 4.62 [35m[ 0:36:58<2 days, 7:04:33][39m
|
| 365 |
+
[titan] 2026-01-06 20:54:56,546 - root - INFO - [31mstep: 35 [32mloss: 8.1046 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,580 [36mtflops: 144.53 [35mmfu: 46.32%[39m
|
| 366 |
+
[titan] 2026-01-06 20:54:56,546 - root - INFO - [34mlr: 1.4063e-05 gnorm: 4.69 [35m[ 0:37:40<2 days, 6:29:05][39m
|
| 367 |
+
[titan] 2026-01-06 20:55:38,070 - root - INFO - [31mstep: 36 [32mloss: 8.0122 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.40 [35mmfu: 46.28%[39m
|
| 368 |
+
[titan] 2026-01-06 20:55:38,070 - root - INFO - [34mlr: 1.4453e-05 gnorm: 2.75 [35m[ 0:38:22<2 days, 5:55:35][39m
|
| 369 |
+
[titan] 2026-01-06 20:56:19,603 - root - INFO - [31mstep: 37 [32mloss: 8.0874 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 370 |
+
[titan] 2026-01-06 20:56:19,603 - root - INFO - [34mlr: 1.4844e-05 gnorm: 4.84 [35m[ 0:39:03<2 days, 5:23:53][39m
|
| 371 |
+
[titan] 2026-01-06 20:57:01,137 - root - INFO - [31mstep: 38 [32mloss: 8.0173 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 372 |
+
[titan] 2026-01-06 20:57:01,137 - root - INFO - [34mlr: 1.5234e-05 gnorm: 3.98 [35m[ 0:39:45<2 days, 4:53:49][39m
|
| 373 |
+
[titan] 2026-01-06 20:57:42,670 - root - INFO - [31mstep: 39 [32mloss: 8.0002 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 374 |
+
[titan] 2026-01-06 20:57:42,671 - root - INFO - [34mlr: 1.5625e-05 gnorm: 3.81 [35m[ 0:40:26<2 days, 4:25:15][39m
|
| 375 |
+
[titan] 2026-01-06 20:58:24,204 - root - INFO - [31mstep: 40 [32mloss: 7.9606 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 376 |
+
[titan] 2026-01-06 20:58:24,204 - root - INFO - [34mlr: 1.6016e-05 gnorm: 2.86 [35m[ 0:41:08<2 days, 3:58:05][39m
|
| 377 |
+
[titan] 2026-01-06 20:59:05,739 - root - INFO - [31mstep: 41 [32mloss: 7.9773 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.36 [35mmfu: 46.27%[39m
|
| 378 |
+
[titan] 2026-01-06 20:59:05,739 - root - INFO - [34mlr: 1.6406e-05 gnorm: 3.56 [35m[ 0:41:49<2 days, 3:32:12][39m
|
| 379 |
+
[titan] 2026-01-06 20:59:47,255 - root - INFO - [31mstep: 42 [32mloss: 7.9890 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.43 [35mmfu: 46.29%[39m
|
| 380 |
+
[titan] 2026-01-06 20:59:47,256 - root - INFO - [34mlr: 1.6797e-05 gnorm: 4.75 [35m[ 0:42:31<2 days, 3:07:30][39m
|
| 381 |
+
[titan] 2026-01-06 21:00:28,788 - root - INFO - [31mstep: 43 [32mloss: 7.9018 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 382 |
+
[titan] 2026-01-06 21:00:28,789 - root - INFO - [34mlr: 1.7188e-05 gnorm: 3.48 [35m[ 0:43:12<2 days, 2:43:56][39m
|
| 383 |
+
[titan] 2026-01-06 21:01:10,328 - root - INFO - [31mstep: 44 [32mloss: 7.8441 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.35 [35mmfu: 46.26%[39m
|
| 384 |
+
[titan] 2026-01-06 21:01:10,328 - root - INFO - [34mlr: 1.7578e-05 gnorm: 3.89 [35m[ 0:43:54<2 days, 2:21:25][39m
|
| 385 |
+
[titan] 2026-01-06 21:01:51,869 - root - INFO - [31mstep: 45 [32mloss: 7.8679 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.34 [35mmfu: 46.26%[39m
|
| 386 |
+
[titan] 2026-01-06 21:01:51,869 - root - INFO - [34mlr: 1.7969e-05 gnorm: 6.41 [35m[ 0:44:35<2 days, 1:59:52][39m
|
| 387 |
+
[titan] 2026-01-06 21:02:33,408 - root - INFO - [31mstep: 46 [32mloss: 7.7830 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.35 [35mmfu: 46.26%[39m
|
| 388 |
+
[titan] 2026-01-06 21:02:33,408 - root - INFO - [34mlr: 1.8359e-05 gnorm: 3.52 [35m[ 0:45:17<2 days, 1:39:13][39m
|
| 389 |
+
[titan] 2026-01-06 21:03:14,961 - root - INFO - [31mstep: 47 [32mloss: 7.8372 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.30 [35mmfu: 46.25%[39m
|
| 390 |
+
[titan] 2026-01-06 21:03:14,961 - root - INFO - [34mlr: 1.8750e-05 gnorm: 2.22 [35m[ 0:45:58<2 days, 1:19:27][39m
|
| 391 |
+
[titan] 2026-01-06 21:03:56,497 - root - INFO - [31mstep: 48 [32mloss: 7.8147 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.36 [35mmfu: 46.27%[39m
|
| 392 |
+
[titan] 2026-01-06 21:03:56,497 - root - INFO - [34mlr: 1.9141e-05 gnorm: 3.70 [35m[ 0:46:40<2 days, 1:00:27][39m
|
| 393 |
+
[titan] 2026-01-06 21:03:56,497 - root - INFO - Saving the checkpoint (or staging if async is enabled).
|
| 394 |
+
[titan] 2026-01-06 21:04:16,632 - root - INFO - [GC] GC collection invoked by checkpointer. 0.24 seconds.
|
| 395 |
+
[titan] 2026-01-06 21:04:16,632 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 20.14 seconds.
|
| 396 |
+
[titan] 2026-01-06 21:04:57,970 - root - INFO - [31mstep: 49 [32mloss: 7.6970 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,066 [36mtflops: 97.54 [35mmfu: 31.26%[39m
|
| 397 |
+
[titan] 2026-01-06 21:04:57,970 - root - INFO - [34mlr: 1.9531e-05 gnorm: 5.28 [35m[ 0:47:41<2 days, 1:02:41][39m
|
| 398 |
+
[titan] 2026-01-06 21:04:57,979 - root - INFO - [GC] Peforming periodical GC collection. 0.01 seconds.
|
| 399 |
+
[titan] 2026-01-06 21:05:39,421 - root - INFO - [31mstep: 50 [32mloss: 7.7536 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,581 [36mtflops: 144.65 [35mmfu: 46.36%[39m
|
| 400 |
+
[titan] 2026-01-06 21:05:39,421 - root - INFO - [34mlr: 1.9922e-05 gnorm: 4.06 [35m[ 0:48:23<2 days, 0:44:38][39m
|
| 401 |
+
[titan] 2026-01-06 21:06:20,891 - root - INFO - [31mstep: 51 [32mloss: 7.7578 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,580 [36mtflops: 144.59 [35mmfu: 46.34%[39m
|
| 402 |
+
[titan] 2026-01-06 21:06:20,891 - root - INFO - [34mlr: 2.0313e-05 gnorm: 5.03 [35m[ 0:49:04<2 days, 0:27:17][39m
|
| 403 |
+
[titan] 2026-01-06 21:07:02,402 - root - INFO - [31mstep: 52 [32mloss: 7.7586 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.45 [35mmfu: 46.30%[39m
|
| 404 |
+
[titan] 2026-01-06 21:07:02,402 - root - INFO - [34mlr: 2.0703e-05 gnorm: 2.52 [35m[ 0:49:46<2 days, 0:10:37][39m
|
| 405 |
+
[titan] 2026-01-06 21:07:43,930 - root - INFO - [31mstep: 53 [32mloss: 7.7823 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.38 [35mmfu: 46.28%[39m
|
| 406 |
+
[titan] 2026-01-06 21:07:43,930 - root - INFO - [34mlr: 2.1094e-05 gnorm: 11.69 [35m[ 0:50:27<1 day, 23:54:33][39m
|
| 407 |
+
[titan] 2026-01-06 21:08:25,460 - root - INFO - [31mstep: 54 [32mloss: 7.7454 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.38 [35mmfu: 46.28%[39m
|
| 408 |
+
[titan] 2026-01-06 21:08:25,460 - root - INFO - [34mlr: 2.1484e-05 gnorm: 10.25 [35m[ 0:51:09<1 day, 23:39:04][39m
|
| 409 |
+
[titan] 2026-01-06 21:09:07,002 - root - INFO - [31mstep: 55 [32mloss: 7.6959 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.34 [35mmfu: 46.26%[39m
|
| 410 |
+
[titan] 2026-01-06 21:09:07,002 - root - INFO - [34mlr: 2.1875e-05 gnorm: 3.77 [35m[ 0:51:50<1 day, 23:24:08][39m
|
| 411 |
+
[titan] 2026-01-06 21:09:48,536 - root - INFO - [31mstep: 56 [32mloss: 7.7100 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 412 |
+
[titan] 2026-01-06 21:09:48,536 - root - INFO - [34mlr: 2.2266e-05 gnorm: 5.50 [35m[ 0:52:32<1 day, 23:09:42][39m
|
| 413 |
+
[titan] 2026-01-06 21:10:30,084 - root - INFO - [31mstep: 57 [32mloss: 7.6427 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.32 [35mmfu: 46.26%[39m
|
| 414 |
+
[titan] 2026-01-06 21:10:30,084 - root - INFO - [34mlr: 2.2656e-05 gnorm: 3.45 [35m[ 0:53:14<1 day, 22:55:46][39m
|
| 415 |
+
[titan] 2026-01-06 21:11:11,627 - root - INFO - [31mstep: 58 [32mloss: 7.7081 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.33 [35mmfu: 46.26%[39m
|
| 416 |
+
[titan] 2026-01-06 21:11:11,628 - root - INFO - [34mlr: 2.3047e-05 gnorm: 7.88 [35m[ 0:53:55<1 day, 22:42:17][39m
|
| 417 |
+
[titan] 2026-01-06 21:11:53,169 - root - INFO - [31mstep: 59 [32mloss: 7.6955 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.34 [35mmfu: 46.26%[39m
|
| 418 |
+
[titan] 2026-01-06 21:11:53,169 - root - INFO - [34mlr: 2.3438e-05 gnorm: 7.16 [35m[ 0:54:37<1 day, 22:29:14][39m
|
| 419 |
+
[titan] 2026-01-06 21:12:34,708 - root - INFO - [31mstep: 60 [32mloss: 7.6458 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.35 [35mmfu: 46.27%[39m
|
| 420 |
+
[titan] 2026-01-06 21:12:34,708 - root - INFO - [34mlr: 2.3828e-05 gnorm: 3.22 [35m[ 0:55:18<1 day, 22:16:35][39m
|
| 421 |
+
[titan] 2026-01-06 21:13:16,244 - root - INFO - [31mstep: 61 [32mloss: 7.6709 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.36 [35mmfu: 46.27%[39m
|
| 422 |
+
[titan] 2026-01-06 21:13:16,244 - root - INFO - [34mlr: 2.4219e-05 gnorm: 7.56 [35m[ 0:56:00<1 day, 22:04:20][39m
|
| 423 |
+
[titan] 2026-01-06 21:13:57,793 - root - INFO - [31mstep: 62 [32mloss: 7.6777 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.31 [35mmfu: 46.25%[39m
|
| 424 |
+
[titan] 2026-01-06 21:13:57,793 - root - INFO - [34mlr: 2.4609e-05 gnorm: 5.00 [35m[ 0:56:41<1 day, 21:52:28][39m
|
| 425 |
+
[titan] 2026-01-06 21:14:39,339 - root - INFO - [31mstep: 63 [32mloss: 7.6421 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.32 [35mmfu: 46.26%[39m
|
| 426 |
+
[titan] 2026-01-06 21:14:39,339 - root - INFO - [34mlr: 2.5000e-05 gnorm: 6.81 [35m[ 0:57:23<1 day, 21:40:56][39m
|
| 427 |
+
[titan] 2026-01-06 21:15:20,872 - root - INFO - [31mstep: 64 [32mloss: 7.6401 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 428 |
+
[titan] 2026-01-06 21:15:20,872 - root - INFO - [34mlr: 2.5391e-05 gnorm: 6.72 [35m[ 0:58:04<1 day, 21:29:45][39m
|
| 429 |
+
[titan] 2026-01-06 21:15:20,872 - root - INFO - Saving the checkpoint (or staging if async is enabled).
|
| 430 |
+
[titan] 2026-01-06 21:15:41,925 - root - INFO - [GC] GC collection invoked by checkpointer. 0.17 seconds.
|
| 431 |
+
[titan] 2026-01-06 21:15:41,925 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 21.05 seconds.
|
| 432 |
+
[titan] 2026-01-06 21:16:23,249 - root - INFO - [31mstep: 65 [32mloss: 7.6475 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,051 [36mtflops: 96.13 [35mmfu: 30.81%[39m
|
| 433 |
+
[titan] 2026-01-06 21:16:23,249 - root - INFO - [34mlr: 2.5781e-05 gnorm: 5.00 [35m[ 0:59:07<1 day, 21:34:57][39m
|
| 434 |
+
[titan] 2026-01-06 21:17:04,689 - root - INFO - [31mstep: 66 [32mloss: 7.7008 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,581 [36mtflops: 144.69 [35mmfu: 46.38%[39m
|
| 435 |
+
[titan] 2026-01-06 21:17:04,689 - root - INFO - [34mlr: 2.6172e-05 gnorm: 9.69 [35m[ 0:59:48<1 day, 21:24:05][39m
|
| 436 |
+
[titan] 2026-01-06 21:17:46,153 - root - INFO - [31mstep: 67 [32mloss: 7.6772 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,581 [36mtflops: 144.61 [35mmfu: 46.35%[39m
|
| 437 |
+
[titan] 2026-01-06 21:17:46,153 - root - INFO - [34mlr: 2.6563e-05 gnorm: 8.06 [35m[ 1:00:30<1 day, 21:13:31][39m
|
| 438 |
+
[titan] 2026-01-06 21:18:27,650 - root - INFO - [31mstep: 68 [32mloss: 7.6251 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.49 [35mmfu: 46.31%[39m
|
| 439 |
+
[titan] 2026-01-06 21:18:27,651 - root - INFO - [34mlr: 2.6953e-05 gnorm: 7.88 [35m[ 1:01:11<1 day, 21:03:17][39m
|
| 440 |
+
[titan] 2026-01-06 21:19:09,166 - root - INFO - [31mstep: 69 [32mloss: 7.6183 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.43 [35mmfu: 46.29%[39m
|
| 441 |
+
[titan] 2026-01-06 21:19:09,166 - root - INFO - [34mlr: 2.7344e-05 gnorm: 4.00 [35m[ 1:01:53<1 day, 20:53:20][39m
|
| 442 |
+
[titan] 2026-01-06 21:19:50,686 - root - INFO - [31mstep: 70 [32mloss: 7.6535 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.41 [35mmfu: 46.29%[39m
|
| 443 |
+
[titan] 2026-01-06 21:19:50,686 - root - INFO - [34mlr: 2.7734e-05 gnorm: 17.75 [35m[ 1:02:34<1 day, 20:43:39][39m
|
| 444 |
+
[titan] 2026-01-06 21:20:32,220 - root - INFO - [31mstep: 71 [32mloss: 7.6713 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 445 |
+
[titan] 2026-01-06 21:20:32,221 - root - INFO - [34mlr: 2.8125e-05 gnorm: 15.69 [35m[ 1:03:16<1 day, 20:34:13][39m
|
| 446 |
+
[titan] 2026-01-06 21:21:13,759 - root - INFO - [31mstep: 72 [32mloss: 7.5969 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.35 [35mmfu: 46.27%[39m
|
| 447 |
+
[titan] 2026-01-06 21:21:13,759 - root - INFO - [34mlr: 2.8516e-05 gnorm: 5.00 [35m[ 1:03:57<1 day, 20:25:03][39m
|
| 448 |
+
[titan] 2026-01-06 21:21:55,296 - root - INFO - [31mstep: 73 [32mloss: 7.6514 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.35 [35mmfu: 46.27%[39m
|
| 449 |
+
[titan] 2026-01-06 21:21:55,296 - root - INFO - [34mlr: 2.8906e-05 gnorm: 7.84 [35m[ 1:04:39<1 day, 20:16:06][39m
|
| 450 |
+
[titan] 2026-01-06 21:22:36,834 - root - INFO - [31mstep: 74 [32mloss: 7.6118 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.35 [35mmfu: 46.27%[39m
|
| 451 |
+
[titan] 2026-01-06 21:22:36,834 - root - INFO - [34mlr: 2.9297e-05 gnorm: 5.53 [35m[ 1:05:20<1 day, 20:07:23][39m
|
| 452 |
+
[titan] 2026-01-06 21:23:18,373 - root - INFO - [31mstep: 75 [32mloss: 7.6545 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.35 [35mmfu: 46.26%[39m
|
| 453 |
+
[titan] 2026-01-06 21:23:18,373 - root - INFO - [34mlr: 2.9687e-05 gnorm: 14.88 [35m[ 1:06:02<1 day, 19:58:53][39m
|
| 454 |
+
[titan] 2026-01-06 21:23:59,908 - root - INFO - [31mstep: 76 [32mloss: 7.6091 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.36 [35mmfu: 46.27%[39m
|
| 455 |
+
[titan] 2026-01-06 21:23:59,909 - root - INFO - [34mlr: 3.0078e-05 gnorm: 15.25 [35m[ 1:06:43<1 day, 19:50:35][39m
|
| 456 |
+
[titan] 2026-01-06 21:24:41,441 - root - INFO - [31mstep: 77 [32mloss: 7.5815 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 457 |
+
[titan] 2026-01-06 21:24:41,442 - root - INFO - [34mlr: 3.0469e-05 gnorm: 4.84 [35m[ 1:07:25<1 day, 19:42:28][39m
|
| 458 |
+
[titan] 2026-01-06 21:25:22,983 - root - INFO - [31mstep: 78 [32mloss: 7.6119 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.34 [35mmfu: 46.26%[39m
|
| 459 |
+
[titan] 2026-01-06 21:25:22,983 - root - INFO - [34mlr: 3.0859e-05 gnorm: 9.06 [35m[ 1:08:06<1 day, 19:34:34][39m
|
| 460 |
+
[titan] 2026-01-06 21:26:04,516 - root - INFO - [31mstep: 79 [32mloss: 7.6418 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 461 |
+
[titan] 2026-01-06 21:26:04,516 - root - INFO - [34mlr: 3.1250e-05 gnorm: 8.25 [35m[ 1:08:48<1 day, 19:26:50][39m
|
| 462 |
+
[titan] 2026-01-06 21:26:46,049 - root - INFO - [31mstep: 80 [32mloss: 7.5575 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 463 |
+
[titan] 2026-01-06 21:26:46,050 - root - INFO - [34mlr: 3.1641e-05 gnorm: 6.97 [35m[ 1:09:29<1 day, 19:19:16][39m
|
| 464 |
+
[titan] 2026-01-06 21:26:46,050 - root - INFO - Saving the checkpoint (or staging if async is enabled).
|
| 465 |
+
[titan] 2026-01-06 21:27:08,316 - root - INFO - [GC] GC collection invoked by checkpointer. 0.18 seconds.
|
| 466 |
+
[titan] 2026-01-06 21:27:08,317 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 22.27 seconds.
|
| 467 |
+
[titan] 2026-01-06 21:27:49,686 - root - INFO - [31mstep: 81 [32mloss: 7.6005 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,030 [36mtflops: 94.22 [35mmfu: 30.20%[39m
|
| 468 |
+
[titan] 2026-01-06 21:27:49,686 - root - INFO - [34mlr: 3.2031e-05 gnorm: 7.19 [35m[ 1:10:33<1 day, 19:25:29][39m
|
| 469 |
+
[titan] 2026-01-06 21:28:31,108 - root - INFO - [31mstep: 82 [32mloss: 7.5774 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,582 [36mtflops: 144.76 [35mmfu: 46.40%[39m
|
| 470 |
+
[titan] 2026-01-06 21:28:31,108 - root - INFO - [34mlr: 3.2422e-05 gnorm: 5.62 [35m[ 1:11:15<1 day, 19:18:02][39m
|
| 471 |
+
[titan] 2026-01-06 21:29:12,555 - root - INFO - [31mstep: 83 [32mloss: 7.6207 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,581 [36mtflops: 144.67 [35mmfu: 46.37%[39m
|
| 472 |
+
[titan] 2026-01-06 21:29:12,555 - root - INFO - [34mlr: 3.2813e-05 gnorm: 4.69 [35m[ 1:11:56<1 day, 19:10:45][39m
|
| 473 |
+
[titan] 2026-01-06 21:29:54,023 - root - INFO - [31mstep: 84 [32mloss: 7.5734 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,580 [36mtflops: 144.59 [35mmfu: 46.34%[39m
|
| 474 |
+
[titan] 2026-01-06 21:29:54,024 - root - INFO - [34mlr: 3.3203e-05 gnorm: 10.75 [35m[ 1:12:37<1 day, 19:03:38][39m
|
| 475 |
+
[titan] 2026-01-06 21:30:35,519 - root - INFO - [31mstep: 85 [32mloss: 7.5241 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.50 [35mmfu: 46.31%[39m
|
| 476 |
+
[titan] 2026-01-06 21:30:35,520 - root - INFO - [34mlr: 3.3594e-05 gnorm: 8.69 [35m[ 1:13:19<1 day, 18:56:41][39m
|
| 477 |
+
[titan] 2026-01-06 21:31:17,030 - root - INFO - [31mstep: 86 [32mloss: 7.5827 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.45 [35mmfu: 46.30%[39m
|
| 478 |
+
[titan] 2026-01-06 21:31:17,030 - root - INFO - [34mlr: 3.3984e-05 gnorm: 7.22 [35m[ 1:14:00<1 day, 18:49:53][39m
|
| 479 |
+
[titan] 2026-01-06 21:31:58,543 - root - INFO - [31mstep: 87 [32mloss: 7.5505 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.44 [35mmfu: 46.29%[39m
|
| 480 |
+
[titan] 2026-01-06 21:31:58,543 - root - INFO - [34mlr: 3.4375e-05 gnorm: 7.91 [35m[ 1:14:42<1 day, 18:43:14][39m
|
| 481 |
+
[titan] 2026-01-06 21:32:40,071 - root - INFO - [31mstep: 88 [32mloss: 7.5143 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.39 [35mmfu: 46.28%[39m
|
| 482 |
+
[titan] 2026-01-06 21:32:40,071 - root - INFO - [34mlr: 3.4766e-05 gnorm: 8.00 [35m[ 1:15:23<1 day, 18:36:44][39m
|
| 483 |
+
[titan] 2026-01-06 21:33:21,599 - root - INFO - [31mstep: 89 [32mloss: 7.5199 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.39 [35mmfu: 46.28%[39m
|
| 484 |
+
[titan] 2026-01-06 21:33:21,599 - root - INFO - [34mlr: 3.5156e-05 gnorm: 8.62 [35m[ 1:16:05<1 day, 18:30:21][39m
|
| 485 |
+
[titan] 2026-01-06 21:34:03,122 - root - INFO - [31mstep: 90 [32mloss: 7.4785 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.40 [35mmfu: 46.28%[39m
|
| 486 |
+
[titan] 2026-01-06 21:34:03,122 - root - INFO - [34mlr: 3.5547e-05 gnorm: 8.12 [35m[ 1:16:47<1 day, 18:24:06][39m
|
| 487 |
+
[titan] 2026-01-06 21:34:44,655 - root - INFO - [31mstep: 91 [32mloss: 7.5003 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 488 |
+
[titan] 2026-01-06 21:34:44,655 - root - INFO - [34mlr: 3.5937e-05 gnorm: 6.97 [35m[ 1:17:28<1 day, 18:17:58][39m
|
| 489 |
+
[titan] 2026-01-06 21:35:26,183 - root - INFO - [31mstep: 92 [32mloss: 7.5113 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.39 [35mmfu: 46.28%[39m
|
| 490 |
+
[titan] 2026-01-06 21:35:26,183 - root - INFO - [34mlr: 3.6328e-05 gnorm: 10.19 [35m[ 1:18:10<1 day, 18:11:58][39m
|
| 491 |
+
[titan] 2026-01-06 21:36:07,712 - root - INFO - [31mstep: 93 [32mloss: 7.4875 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.38 [35mmfu: 46.28%[39m
|
| 492 |
+
[titan] 2026-01-06 21:36:07,712 - root - INFO - [34mlr: 3.6719e-05 gnorm: 4.59 [35m[ 1:18:51<1 day, 18:06:04][39m
|
| 493 |
+
[titan] 2026-01-06 21:36:49,202 - root - INFO - [31mstep: 94 [32mloss: 7.8691 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,580 [36mtflops: 144.52 [35mmfu: 46.32%[39m
|
| 494 |
+
[titan] 2026-01-06 21:36:49,202 - root - INFO - [34mlr: 3.7109e-05 gnorm: 86.50 [35m[ 1:19:33<1 day, 18:00:16][39m
|
| 495 |
+
[titan] 2026-01-06 21:37:30,710 - root - INFO - [31mstep: 95 [32mloss: 7.7993 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.45 [35mmfu: 46.30%[39m
|
| 496 |
+
[titan] 2026-01-06 21:37:30,710 - root - INFO - [34mlr: 3.7500e-05 gnorm: 62.50 [35m[ 1:20:14<1 day, 17:54:35][39m
|
| 497 |
+
[titan] 2026-01-06 21:38:12,247 - root - INFO - [31mstep: 96 [32mloss: 7.6230 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.35 [35mmfu: 46.27%[39m
|
| 498 |
+
[titan] 2026-01-06 21:38:12,248 - root - INFO - [34mlr: 3.7891e-05 gnorm: 17.38 [35m[ 1:20:56<1 day, 17:49:01][39m
|
| 499 |
+
[titan] 2026-01-06 21:38:12,248 - root - INFO - Saving the checkpoint (or staging if async is enabled).
|
| 500 |
+
[titan] 2026-01-06 21:38:32,921 - root - INFO - [GC] GC collection invoked by checkpointer. 0.18 seconds.
|
| 501 |
+
[titan] 2026-01-06 21:38:32,922 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 20.67 seconds.
|
| 502 |
+
[titan] 2026-01-06 21:39:14,269 - root - INFO - [31mstep: 97 [32mloss: 7.5778 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,057 [36mtflops: 96.68 [35mmfu: 30.99%[39m
|
| 503 |
+
[titan] 2026-01-06 21:39:14,269 - root - INFO - [34mlr: 3.8281e-05 gnorm: 17.75 [35m[ 1:21:58<1 day, 17:54:01][39m
|
| 504 |
+
[titan] 2026-01-06 21:39:55,690 - root - INFO - [31mstep: 98 [32mloss: 7.5438 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,582 [36mtflops: 144.76 [35mmfu: 46.40%[39m
|
| 505 |
+
[titan] 2026-01-06 21:39:55,690 - root - INFO - [34mlr: 3.8672e-05 gnorm: 11.75 [35m[ 1:22:39<1 day, 17:48:28][39m
|
| 506 |
+
[titan] 2026-01-06 21:40:37,179 - root - INFO - [31mstep: 99 [32mloss: 7.5091 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,580 [36mtflops: 144.52 [35mmfu: 46.32%[39m
|
| 507 |
+
[titan] 2026-01-06 21:40:37,180 - root - INFO - [34mlr: 3.9063e-05 gnorm: 7.81 [35m[ 1:23:21<1 day, 17:43:04][39m
|
| 508 |
+
[titan] 2026-01-06 21:40:37,200 - root - INFO - [GC] Peforming periodical GC collection. 0.02 seconds.
|
| 509 |
+
[titan] 2026-01-06 21:41:18,706 - root - INFO - [31mstep: 100 [32mloss: 7.4961 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.39 [35mmfu: 46.28%[39m
|
| 510 |
+
[titan] 2026-01-06 21:41:18,706 - root - INFO - [34mlr: 3.9453e-05 gnorm: 7.59 [35m[ 1:24:02<1 day, 17:37:46][39m
|
| 511 |
+
[titan] 2026-01-06 21:42:00,228 - root - INFO - [31mstep: 101 [32mloss: 7.4848 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.41 [35mmfu: 46.28%[39m
|
| 512 |
+
[titan] 2026-01-06 21:42:00,228 - root - INFO - [34mlr: 3.9844e-05 gnorm: 5.97 [35m[ 1:24:44<1 day, 17:32:34][39m
|
| 513 |
+
[titan] 2026-01-06 21:42:41,739 - root - INFO - [31mstep: 102 [32mloss: 7.5118 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.45 [35mmfu: 46.30%[39m
|
| 514 |
+
[titan] 2026-01-06 21:42:41,739 - root - INFO - [34mlr: 4.0234e-05 gnorm: 8.06 [35m[ 1:25:25<1 day, 17:27:26][39m
|
| 515 |
+
[titan] 2026-01-06 21:43:23,264 - root - INFO - [31mstep: 103 [32mloss: 7.4788 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.39 [35mmfu: 46.28%[39m
|
| 516 |
+
[titan] 2026-01-06 21:43:23,265 - root - INFO - [34mlr: 4.0625e-05 gnorm: 10.06 [35m[ 1:26:07<1 day, 17:22:25][39m
|
| 517 |
+
[titan] 2026-01-06 21:44:04,785 - root - INFO - [31mstep: 104 [32mloss: 7.4560 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.41 [35mmfu: 46.29%[39m
|
| 518 |
+
[titan] 2026-01-06 21:44:04,786 - root - INFO - [34mlr: 4.1016e-05 gnorm: 9.50 [35m[ 1:26:48<1 day, 17:17:28][39m
|
| 519 |
+
[titan] 2026-01-06 21:44:46,319 - root - INFO - [31mstep: 105 [32mloss: 7.4534 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 520 |
+
[titan] 2026-01-06 21:44:46,319 - root - INFO - [34mlr: 4.1406e-05 gnorm: 8.44 [35m[ 1:27:30<1 day, 17:12:36][39m
|
| 521 |
+
[titan] 2026-01-06 21:45:27,838 - root - INFO - [31mstep: 106 [32mloss: 7.4770 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.42 [35mmfu: 46.29%[39m
|
| 522 |
+
[titan] 2026-01-06 21:45:27,838 - root - INFO - [34mlr: 4.1797e-05 gnorm: 10.56 [35m[ 1:28:11<1 day, 17:07:49][39m
|
| 523 |
+
[titan] 2026-01-06 21:46:09,374 - root - INFO - [31mstep: 107 [32mloss: 7.4382 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.36 [35mmfu: 46.27%[39m
|
| 524 |
+
[titan] 2026-01-06 21:46:09,374 - root - INFO - [34mlr: 4.2188e-05 gnorm: 13.69 [35m[ 1:28:53<1 day, 17:03:06][39m
|
| 525 |
+
[titan] 2026-01-06 21:46:50,902 - root - INFO - [31mstep: 108 [32mloss: 7.4561 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.39 [35mmfu: 46.28%[39m
|
| 526 |
+
[titan] 2026-01-06 21:46:50,902 - root - INFO - [34mlr: 4.2578e-05 gnorm: 8.69 [35m[ 1:29:34<1 day, 16:58:28][39m
|
| 527 |
+
[titan] 2026-01-06 21:47:32,443 - root - INFO - [31mstep: 109 [32mloss: 7.3967 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.34 [35mmfu: 46.26%[39m
|
| 528 |
+
[titan] 2026-01-06 21:47:32,443 - root - INFO - [34mlr: 4.2969e-05 gnorm: 7.31 [35m[ 1:30:16<1 day, 16:53:55][39m
|
| 529 |
+
[titan] 2026-01-06 21:48:13,976 - root - INFO - [31mstep: 110 [32mloss: 7.4334 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 530 |
+
[titan] 2026-01-06 21:48:13,976 - root - INFO - [34mlr: 4.3359e-05 gnorm: 25.38 [35m[ 1:30:57<1 day, 16:49:25][39m
|
| 531 |
+
[titan] 2026-01-06 21:48:55,511 - root - INFO - [31mstep: 111 [32mloss: 7.4360 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.36 [35mmfu: 46.27%[39m
|
| 532 |
+
[titan] 2026-01-06 21:48:55,511 - root - INFO - [34mlr: 4.3750e-05 gnorm: 10.44 [35m[ 1:31:39<1 day, 16:45:00][39m
|
| 533 |
+
[titan] 2026-01-06 21:49:37,059 - root - INFO - [31mstep: 112 [32mloss: 7.5123 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.32 [35mmfu: 46.25%[39m
|
| 534 |
+
[titan] 2026-01-06 21:49:37,059 - root - INFO - [34mlr: 4.4141e-05 gnorm: 16.88 [35m[ 1:32:20<1 day, 16:40:39][39m
|
| 535 |
+
[titan] 2026-01-06 21:49:37,059 - root - INFO - Saving the checkpoint (or staging if async is enabled).
|
| 536 |
+
[titan] 2026-01-06 21:49:59,578 - root - INFO - [GC] GC collection invoked by checkpointer. 0.14 seconds.
|
| 537 |
+
[titan] 2026-01-06 21:49:59,578 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 22.52 seconds.
|
| 538 |
+
[titan] 2026-01-06 21:50:40,891 - root - INFO - [31mstep: 113 [32mloss: 7.4803 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,027 [36mtflops: 93.93 [35mmfu: 30.11%[39m
|
| 539 |
+
[titan] 2026-01-06 21:50:40,892 - root - INFO - [34mlr: 4.4531e-05 gnorm: 13.06 [35m[ 1:33:24<1 day, 16:46:06][39m
|
| 540 |
+
[titan] 2026-01-06 21:51:22,305 - root - INFO - [31mstep: 114 [32mloss: 7.4859 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,582 [36mtflops: 144.79 [35mmfu: 46.41%[39m
|
| 541 |
+
[titan] 2026-01-06 21:51:22,305 - root - INFO - [34mlr: 4.4922e-05 gnorm: 16.50 [35m[ 1:34:06<1 day, 16:41:44][39m
|
| 542 |
+
[titan] 2026-01-06 21:52:03,747 - root - INFO - [31mstep: 115 [32mloss: 7.4151 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,581 [36mtflops: 144.68 [35mmfu: 46.37%[39m
|
| 543 |
+
[titan] 2026-01-06 21:52:03,748 - root - INFO - [34mlr: 4.5313e-05 gnorm: 13.94 [35m[ 1:34:47<1 day, 16:37:26][39m
|
| 544 |
+
[titan] 2026-01-06 21:52:45,252 - root - INFO - [31mstep: 116 [32mloss: 7.3814 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.47 [35mmfu: 46.30%[39m
|
| 545 |
+
[titan] 2026-01-06 21:52:45,252 - root - INFO - [34mlr: 4.5703e-05 gnorm: 11.69 [35m[ 1:35:29<1 day, 16:33:14][39m
|
| 546 |
+
[titan] 2026-01-06 21:53:26,759 - root - INFO - [31mstep: 117 [32mloss: 7.4033 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.46 [35mmfu: 46.30%[39m
|
| 547 |
+
[titan] 2026-01-06 21:53:26,760 - root - INFO - [34mlr: 4.6094e-05 gnorm: 9.31 [35m[ 1:36:10<1 day, 16:29:06][39m
|
| 548 |
+
[titan] 2026-01-06 21:54:08,279 - root - INFO - [31mstep: 118 [32mloss: 7.4721 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.42 [35mmfu: 46.29%[39m
|
| 549 |
+
[titan] 2026-01-06 21:54:08,279 - root - INFO - [34mlr: 4.6484e-05 gnorm: 20.88 [35m[ 1:36:52<1 day, 16:25:01][39m
|
| 550 |
+
[titan] 2026-01-06 21:54:49,813 - root - INFO - [31mstep: 119 [32mloss: 7.4258 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 551 |
+
[titan] 2026-01-06 21:54:49,813 - root - INFO - [34mlr: 4.6875e-05 gnorm: 16.62 [35m[ 1:37:33<1 day, 16:21:00][39m
|
| 552 |
+
[titan] 2026-01-06 21:55:31,360 - root - INFO - [31mstep: 120 [32mloss: 7.3951 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.32 [35mmfu: 46.26%[39m
|
| 553 |
+
[titan] 2026-01-06 21:55:31,360 - root - INFO - [34mlr: 4.7266e-05 gnorm: 11.38 [35m[ 1:38:15<1 day, 16:17:03][39m
|
| 554 |
+
[titan] 2026-01-06 21:56:12,904 - root - INFO - [31mstep: 121 [32mloss: 7.3984 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.33 [35mmfu: 46.26%[39m
|
| 555 |
+
[titan] 2026-01-06 21:56:12,904 - root - INFO - [34mlr: 4.7656e-05 gnorm: 10.19 [35m[ 1:38:56<1 day, 16:13:09][39m
|
| 556 |
+
[titan] 2026-01-06 21:56:54,444 - root - INFO - [31mstep: 122 [32mloss: 7.5098 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.34 [35mmfu: 46.26%[39m
|
| 557 |
+
[titan] 2026-01-06 21:56:54,444 - root - INFO - [34mlr: 4.8047e-05 gnorm: 19.38 [35m[ 1:39:38<1 day, 16:09:18][39m
|
| 558 |
+
[titan] 2026-01-06 21:57:35,983 - root - INFO - [31mstep: 123 [32mloss: 7.4071 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.35 [35mmfu: 46.27%[39m
|
| 559 |
+
[titan] 2026-01-06 21:57:35,983 - root - INFO - [34mlr: 4.8438e-05 gnorm: 13.25 [35m[ 1:40:19<1 day, 16:05:30][39m
|
| 560 |
+
[titan] 2026-01-06 21:58:17,525 - root - INFO - [31mstep: 124 [32mloss: 7.4271 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.34 [35mmfu: 46.26%[39m
|
| 561 |
+
[titan] 2026-01-06 21:58:17,525 - root - INFO - [34mlr: 4.8828e-05 gnorm: 11.88 [35m[ 1:41:01<1 day, 16:01:45][39m
|
| 562 |
+
[titan] 2026-01-06 21:58:59,075 - root - INFO - [31mstep: 125 [32mloss: 7.3603 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.31 [35mmfu: 46.25%[39m
|
| 563 |
+
[titan] 2026-01-06 21:58:59,075 - root - INFO - [34mlr: 4.9219e-05 gnorm: 11.50 [35m[ 1:41:42<1 day, 15:58:03][39m
|
| 564 |
+
[titan] 2026-01-06 21:59:40,619 - root - INFO - [31mstep: 126 [32mloss: 7.3625 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.33 [35mmfu: 46.26%[39m
|
| 565 |
+
[titan] 2026-01-06 21:59:40,619 - root - INFO - [34mlr: 4.9609e-05 gnorm: 9.88 [35m[ 1:42:24<1 day, 15:54:24][39m
|
| 566 |
+
[titan] 2026-01-06 22:00:22,155 - root - INFO - [31mstep: 127 [32mloss: 7.3691 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.36 [35mmfu: 46.27%[39m
|
| 567 |
+
[titan] 2026-01-06 22:00:22,156 - root - INFO - [34mlr: 5.0000e-05 gnorm: 11.88 [35m[ 1:43:06<1 day, 15:50:48][39m
|
| 568 |
+
[titan] 2026-01-06 22:01:03,694 - root - INFO - [31mstep: 128 [32mloss: 7.3331 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.35 [35mmfu: 46.27%[39m
|
| 569 |
+
[titan] 2026-01-06 22:01:03,694 - root - INFO - [34mlr: 5.0391e-05 gnorm: 11.56 [35m[ 1:43:47<1 day, 15:47:14][39m
|
| 570 |
+
[titan] 2026-01-06 22:01:03,694 - root - INFO - Saving the checkpoint (or staging if async is enabled).
|
| 571 |
+
[titan] 2026-01-06 22:01:24,082 - root - INFO - [GC] GC collection invoked by checkpointer. 0.21 seconds.
|
| 572 |
+
[titan] 2026-01-06 22:01:24,082 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 20.39 seconds.
|
| 573 |
+
[titan] 2026-01-06 22:02:05,454 - root - INFO - [31mstep: 129 [32mloss: 7.2878 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,061 [36mtflops: 97.09 [35mmfu: 31.12%[39m
|
| 574 |
+
[titan] 2026-01-06 22:02:05,454 - root - INFO - [34mlr: 5.0781e-05 gnorm: 6.16 [35m[ 1:44:49<1 day, 15:51:25][39m
|
| 575 |
+
[titan] 2026-01-06 22:02:46,875 - root - INFO - [31mstep: 130 [32mloss: 7.7017 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,582 [36mtflops: 144.76 [35mmfu: 46.40%[39m
|
| 576 |
+
[titan] 2026-01-06 22:02:46,876 - root - INFO - [34mlr: 5.1172e-05 gnorm: 70.00 [35m[ 1:45:30<1 day, 15:47:50][39m
|
| 577 |
+
[titan] 2026-01-06 22:03:28,339 - root - INFO - [31mstep: 131 [32mloss: 7.5220 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,581 [36mtflops: 144.61 [35mmfu: 46.35%[39m
|
| 578 |
+
[titan] 2026-01-06 22:03:28,339 - root - INFO - [34mlr: 5.1562e-05 gnorm: 44.75 [35m[ 1:46:12<1 day, 15:44:19][39m
|
| 579 |
+
[titan] 2026-01-06 22:04:09,859 - root - INFO - [31mstep: 132 [32mloss: 7.4566 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.42 [35mmfu: 46.29%[39m
|
| 580 |
+
[titan] 2026-01-06 22:04:09,859 - root - INFO - [34mlr: 5.1953e-05 gnorm: 13.50 [35m[ 1:46:53<1 day, 15:40:51][39m
|
| 581 |
+
[titan] 2026-01-06 22:04:51,387 - root - INFO - [31mstep: 133 [32mloss: 7.4026 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.39 [35mmfu: 46.28%[39m
|
| 582 |
+
[titan] 2026-01-06 22:04:51,387 - root - INFO - [34mlr: 5.2344e-05 gnorm: 10.12 [35m[ 1:47:35<1 day, 15:37:27][39m
|
| 583 |
+
[titan] 2026-01-06 22:05:32,919 - root - INFO - [31mstep: 134 [32mloss: 7.4092 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 584 |
+
[titan] 2026-01-06 22:05:32,920 - root - INFO - [34mlr: 5.2734e-05 gnorm: 14.88 [35m[ 1:48:16<1 day, 15:34:05][39m
|
| 585 |
+
[titan] 2026-01-06 22:06:14,471 - root - INFO - [31mstep: 135 [32mloss: 7.3827 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.30 [35mmfu: 46.25%[39m
|
| 586 |
+
[titan] 2026-01-06 22:06:14,471 - root - INFO - [34mlr: 5.3125e-05 gnorm: 18.88 [35m[ 1:48:58<1 day, 15:30:45][39m
|
| 587 |
+
[titan] 2026-01-06 22:06:56,027 - root - INFO - [31mstep: 136 [32mloss: 7.4021 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.29 [35mmfu: 46.25%[39m
|
| 588 |
+
[titan] 2026-01-06 22:06:56,027 - root - INFO - [34mlr: 5.3516e-05 gnorm: 12.81 [35m[ 1:49:39<1 day, 15:27:28][39m
|
| 589 |
+
[titan] 2026-01-06 22:07:37,581 - root - INFO - [31mstep: 137 [32mloss: 7.4064 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.30 [35mmfu: 46.25%[39m
|
| 590 |
+
[titan] 2026-01-06 22:07:37,581 - root - INFO - [34mlr: 5.3906e-05 gnorm: 7.19 [35m[ 1:50:21<1 day, 15:24:14][39m
|
| 591 |
+
[titan] 2026-01-06 22:08:19,129 - root - INFO - [31mstep: 138 [32mloss: 7.4774 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.32 [35mmfu: 46.26%[39m
|
| 592 |
+
[titan] 2026-01-06 22:08:19,129 - root - INFO - [34mlr: 5.4297e-05 gnorm: 22.62 [35m[ 1:51:03<1 day, 15:21:01][39m
|
| 593 |
+
[titan] 2026-01-06 22:09:00,688 - root - INFO - [31mstep: 139 [32mloss: 7.4281 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.28 [35mmfu: 46.24%[39m
|
| 594 |
+
[titan] 2026-01-06 22:09:00,688 - root - INFO - [34mlr: 5.4688e-05 gnorm: 11.00 [35m[ 1:51:44<1 day, 15:17:51][39m
|
| 595 |
+
[titan] 2026-01-06 22:09:42,228 - root - INFO - [31mstep: 140 [32mloss: 7.5633 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.34 [35mmfu: 46.26%[39m
|
| 596 |
+
[titan] 2026-01-06 22:09:42,228 - root - INFO - [34mlr: 5.5078e-05 gnorm: 19.75 [35m[ 1:52:26<1 day, 15:14:43][39m
|
| 597 |
+
[titan] 2026-01-06 22:10:23,790 - root - INFO - [31mstep: 141 [32mloss: 7.5423 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.27 [35mmfu: 46.24%[39m
|
| 598 |
+
[titan] 2026-01-06 22:10:23,790 - root - INFO - [34mlr: 5.5469e-05 gnorm: 17.25 [35m[ 1:53:07<1 day, 15:11:37][39m
|
| 599 |
+
[titan] 2026-01-06 22:11:05,349 - root - INFO - [31mstep: 142 [32mloss: 7.4047 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.28 [35mmfu: 46.24%[39m
|
| 600 |
+
[titan] 2026-01-06 22:11:05,349 - root - INFO - [34mlr: 5.5859e-05 gnorm: 9.94 [35m[ 1:53:49<1 day, 15:08:33][39m
|
| 601 |
+
[titan] 2026-01-06 22:11:46,904 - root - INFO - [31mstep: 143 [32mloss: 7.5261 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.29 [35mmfu: 46.25%[39m
|
| 602 |
+
[titan] 2026-01-06 22:11:46,905 - root - INFO - [34mlr: 5.6250e-05 gnorm: 25.75 [35m[ 1:54:30<1 day, 15:05:31][39m
|
| 603 |
+
[titan] 2026-01-06 22:12:28,461 - root - INFO - [31mstep: 144 [32mloss: 7.4217 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.29 [35mmfu: 46.25%[39m
|
| 604 |
+
[titan] 2026-01-06 22:12:28,461 - root - INFO - [34mlr: 5.6641e-05 gnorm: 18.00 [35m[ 1:55:12<1 day, 15:02:31][39m
|
| 605 |
+
[titan] 2026-01-06 22:12:28,461 - root - INFO - Saving the checkpoint (or staging if async is enabled).
|
| 606 |
+
[titan] 2026-01-06 22:12:50,164 - root - INFO - [GC] GC collection invoked by checkpointer. 0.18 seconds.
|
| 607 |
+
[titan] 2026-01-06 22:12:50,164 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 21.70 seconds.
|
| 608 |
+
[titan] 2026-01-06 22:13:31,510 - root - INFO - [31mstep: 145 [32mloss: 7.3958 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,039 [36mtflops: 95.10 [35mmfu: 30.48%[39m
|
| 609 |
+
[titan] 2026-01-06 22:13:31,510 - root - INFO - [34mlr: 5.7031e-05 gnorm: 11.69 [35m[ 1:56:15<1 day, 15:06:46][39m
|
| 610 |
+
[titan] 2026-01-06 22:14:12,944 - root - INFO - [31mstep: 146 [32mloss: 7.4073 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,582 [36mtflops: 144.71 [35mmfu: 46.38%[39m
|
| 611 |
+
[titan] 2026-01-06 22:14:12,944 - root - INFO - [34mlr: 5.7422e-05 gnorm: 11.25 [35m[ 1:56:56<1 day, 15:03:45][39m
|
| 612 |
+
[titan] 2026-01-06 22:14:54,370 - root - INFO - [31mstep: 147 [32mloss: 7.3301 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,582 [36mtflops: 144.74 [35mmfu: 46.39%[39m
|
| 613 |
+
[titan] 2026-01-06 22:14:54,371 - root - INFO - [34mlr: 5.7813e-05 gnorm: 7.34 [35m[ 1:57:38<1 day, 15:00:45][39m
|
| 614 |
+
[titan] 2026-01-06 22:15:35,825 - root - INFO - [31mstep: 148 [32mloss: 7.3624 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,581 [36mtflops: 144.64 [35mmfu: 46.36%[39m
|
| 615 |
+
[titan] 2026-01-06 22:15:35,825 - root - INFO - [34mlr: 5.8203e-05 gnorm: 17.38 [35m[ 1:58:19<1 day, 14:57:47][39m
|
| 616 |
+
[titan] 2026-01-06 22:16:17,356 - root - INFO - [31mstep: 149 [32mloss: 7.2913 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 617 |
+
[titan] 2026-01-06 22:16:17,357 - root - INFO - [34mlr: 5.8594e-05 gnorm: 3.80 [35m[ 1:59:01<1 day, 14:54:53][39m
|
| 618 |
+
[titan] 2026-01-06 22:16:17,386 - root - INFO - [GC] Peforming periodical GC collection. 0.03 seconds.
|
| 619 |
+
[titan] 2026-01-06 22:16:58,923 - root - INFO - [31mstep: 150 [32mloss: 7.3146 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.25 [35mmfu: 46.23%[39m
|
| 620 |
+
[titan] 2026-01-06 22:16:58,923 - root - INFO - [34mlr: 5.8984e-05 gnorm: 7.06 [35m[ 1:59:42<1 day, 14:52:01][39m
|
logs/none_4cvjdbqa/attempt_0/5/stdout.log
ADDED
|
File without changes
|
logs/none_4cvjdbqa/attempt_0/6/stderr.log
ADDED
|
@@ -0,0 +1,620 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[titan] 2026-01-06 20:23:28,613 - root - INFO - Starting job: default job
|
| 2 |
+
[titan] 2026-01-06 20:23:28,613 - root - INFO - [32m{
|
| 3 |
+
"activation_checkpoint": {
|
| 4 |
+
"mode": "none",
|
| 5 |
+
"selective_ac_option": "2"
|
| 6 |
+
},
|
| 7 |
+
"activation_offload": {
|
| 8 |
+
"mode": "none"
|
| 9 |
+
},
|
| 10 |
+
"checkpoint": {
|
| 11 |
+
"async_mode": "disabled",
|
| 12 |
+
"create_seed_checkpoint": false,
|
| 13 |
+
"enable_checkpoint": true,
|
| 14 |
+
"exclude_from_loading": [],
|
| 15 |
+
"export_dtype": "float32",
|
| 16 |
+
"folder": "checkpoint",
|
| 17 |
+
"initial_load_model_weights_only": true,
|
| 18 |
+
"initial_load_path": null,
|
| 19 |
+
"interval": 16,
|
| 20 |
+
"interval_type": "steps",
|
| 21 |
+
"keep_latest_k": 0,
|
| 22 |
+
"last_save_model_weights_only": false,
|
| 23 |
+
"load_step": -1,
|
| 24 |
+
"model_weights_only": false
|
| 25 |
+
},
|
| 26 |
+
"comm": {
|
| 27 |
+
"init_timeout_seconds": 300,
|
| 28 |
+
"trace_buf_size": 20000,
|
| 29 |
+
"train_timeout_seconds": 100
|
| 30 |
+
},
|
| 31 |
+
"experimental": {
|
| 32 |
+
"context_parallel_degree": 1,
|
| 33 |
+
"context_parallel_rotate_method": "allgather",
|
| 34 |
+
"custom_model_path": "",
|
| 35 |
+
"enable_async_tensor_parallel": false,
|
| 36 |
+
"enable_compiled_autograd": false,
|
| 37 |
+
"pipeline_parallel_degree": 1,
|
| 38 |
+
"pipeline_parallel_microbatches": null,
|
| 39 |
+
"pipeline_parallel_schedule": "1F1B",
|
| 40 |
+
"pipeline_parallel_schedule_csv": "",
|
| 41 |
+
"pipeline_parallel_split_points": []
|
| 42 |
+
},
|
| 43 |
+
"fault_tolerance": {
|
| 44 |
+
"enable": false,
|
| 45 |
+
"group_size": 0,
|
| 46 |
+
"min_replica_size": 1,
|
| 47 |
+
"replica_id": 0
|
| 48 |
+
},
|
| 49 |
+
"float8": {
|
| 50 |
+
"enable_fsdp_float8_all_gather": false,
|
| 51 |
+
"force_recompute_fp8_weight_in_bwd": false,
|
| 52 |
+
"precompute_float8_dynamic_scale_for_fsdp": false,
|
| 53 |
+
"recipe_name": null
|
| 54 |
+
},
|
| 55 |
+
"job": {
|
| 56 |
+
"config_file": "flame/models/fla.toml",
|
| 57 |
+
"description": "default job",
|
| 58 |
+
"dump_folder": "/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B",
|
| 59 |
+
"print_args": true,
|
| 60 |
+
"use_for_integration_test": false
|
| 61 |
+
},
|
| 62 |
+
"lr_scheduler": {
|
| 63 |
+
"decay_ratio": null,
|
| 64 |
+
"decay_type": "cosine",
|
| 65 |
+
"lr_min": 0.1,
|
| 66 |
+
"warmup_steps": 1024
|
| 67 |
+
},
|
| 68 |
+
"memory_estimation": {
|
| 69 |
+
"disable_fake_mode": false,
|
| 70 |
+
"enabled": false
|
| 71 |
+
},
|
| 72 |
+
"metrics": {
|
| 73 |
+
"disable_color_printing": false,
|
| 74 |
+
"enable_tensorboard": false,
|
| 75 |
+
"enable_wandb": true,
|
| 76 |
+
"log_freq": 1,
|
| 77 |
+
"save_for_all_ranks": false,
|
| 78 |
+
"save_tb_folder": "tb"
|
| 79 |
+
},
|
| 80 |
+
"model": {
|
| 81 |
+
"config": "/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/config.json",
|
| 82 |
+
"converters": [],
|
| 83 |
+
"name": "fla",
|
| 84 |
+
"print_after_conversion": false,
|
| 85 |
+
"tokenizer_path": "/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B"
|
| 86 |
+
},
|
| 87 |
+
"optimizer": {
|
| 88 |
+
"beta1": 0.9,
|
| 89 |
+
"beta2": 0.95,
|
| 90 |
+
"early_step_in_backward": false,
|
| 91 |
+
"eps": 1e-15,
|
| 92 |
+
"implementation": "fused",
|
| 93 |
+
"lr": 0.0004,
|
| 94 |
+
"name": "AdamW",
|
| 95 |
+
"weight_decay": 0.1
|
| 96 |
+
},
|
| 97 |
+
"profiling": {
|
| 98 |
+
"enable_memory_snapshot": false,
|
| 99 |
+
"enable_profiling": true,
|
| 100 |
+
"profile_freq": 512,
|
| 101 |
+
"save_memory_snapshot_folder": "memory_snapshot",
|
| 102 |
+
"save_traces_folder": "profile_trace"
|
| 103 |
+
},
|
| 104 |
+
"training": {
|
| 105 |
+
"batch_size": 2,
|
| 106 |
+
"compile": true,
|
| 107 |
+
"context_len": 2048,
|
| 108 |
+
"data_dir": null,
|
| 109 |
+
"data_files": null,
|
| 110 |
+
"data_parallel_replicate_degree": 1,
|
| 111 |
+
"data_parallel_shard_degree": 8,
|
| 112 |
+
"data_probs": null,
|
| 113 |
+
"dataset": "/mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu",
|
| 114 |
+
"dataset_name": "default",
|
| 115 |
+
"dataset_split": "train",
|
| 116 |
+
"deterministic": false,
|
| 117 |
+
"disable_loss_parallel": true,
|
| 118 |
+
"enable_cpu_offload": false,
|
| 119 |
+
"fsdp_reshard_after_forward": "default",
|
| 120 |
+
"gc_freq": 50,
|
| 121 |
+
"gradient_accumulation_steps": 16,
|
| 122 |
+
"max_norm": 1.0,
|
| 123 |
+
"mixed_precision_param": "bfloat16",
|
| 124 |
+
"mixed_precision_reduce": "float32",
|
| 125 |
+
"num_workers": 8,
|
| 126 |
+
"persistent_workers": false,
|
| 127 |
+
"pin_memory": false,
|
| 128 |
+
"prefetch_factor": 2,
|
| 129 |
+
"seed": 42,
|
| 130 |
+
"seq_len": 2048,
|
| 131 |
+
"skip_nan_inf": true,
|
| 132 |
+
"steps": 3072,
|
| 133 |
+
"streaming": true,
|
| 134 |
+
"tensor_parallel_degree": 1,
|
| 135 |
+
"varlen": false
|
| 136 |
+
}
|
| 137 |
+
}[39m
|
| 138 |
+
[titan] 2026-01-06 20:23:28,614 - root - INFO - [GC] Initial GC collection. 0.00 seconds.
|
| 139 |
+
[titan] 2026-01-06 20:23:29,958 - root - WARNING - ENV[TORCH_NCCL_ASYNC_ERROR_HANDLING] = 1 will be overridden to 3 based on job config
|
| 140 |
+
[titan] 2026-01-06 20:23:29,965 - root - INFO - CUDA capacity: NVIDIA A100-SXM4-80GB with 79.25GiB memory
|
| 141 |
+
[titan] 2026-01-06 20:23:29,967 - root - WARNING - Error running lspci: [Errno 2] No such file or directory: 'lspci', fallback to use device_name
|
| 142 |
+
[titan] 2026-01-06 20:23:29,967 - root - INFO - Peak FLOPS used for computing MFU: 3.120e+14
|
| 143 |
+
[titan] 2026-01-06 20:23:29,967 - root - INFO - Building 1-D device mesh with ['dp_shard'], [8]
|
| 144 |
+
[titan] 2026-01-06 20:23:30,051 - root - INFO - Loading tokenizer...
|
| 145 |
+
The tokenizer you are loading from '/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.
|
| 146 |
+
[titan] 2026-01-06 20:23:30,452 - root - INFO - Qwen2TokenizerFast(name_or_path='/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B', vocab_size=151643, model_max_length=10000000000, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'eos_token': '<|im_end|>', 'pad_token': '<|endoftext|>', 'additional_special_tokens': ['<|im_start|>', '<|im_end|>', '<|object_ref_start|>', '<|object_ref_end|>', '<|box_start|>', '<|box_end|>', '<|quad_start|>', '<|quad_end|>', '<|vision_start|>', '<|vision_end|>', '<|vision_pad|>', '<|image_pad|>', '<|video_pad|>']}, clean_up_tokenization_spaces=False, added_tokens_decoder={
|
| 147 |
+
151643: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 148 |
+
151644: AddedToken("<|im_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 149 |
+
151645: AddedToken("<|im_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 150 |
+
151646: AddedToken("<|object_ref_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 151 |
+
151647: AddedToken("<|object_ref_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 152 |
+
151648: AddedToken("<|box_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 153 |
+
151649: AddedToken("<|box_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 154 |
+
151650: AddedToken("<|quad_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 155 |
+
151651: AddedToken("<|quad_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 156 |
+
151652: AddedToken("<|vision_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 157 |
+
151653: AddedToken("<|vision_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 158 |
+
151654: AddedToken("<|vision_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 159 |
+
151655: AddedToken("<|image_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 160 |
+
151656: AddedToken("<|video_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 161 |
+
151657: AddedToken("<tool_call>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 162 |
+
151658: AddedToken("</tool_call>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 163 |
+
151659: AddedToken("<|fim_prefix|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 164 |
+
151660: AddedToken("<|fim_middle|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 165 |
+
151661: AddedToken("<|fim_suffix|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 166 |
+
151662: AddedToken("<|fim_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 167 |
+
151663: AddedToken("<|repo_name|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 168 |
+
151664: AddedToken("<|file_sep|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 169 |
+
151665: AddedToken("<tool_response>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 170 |
+
151666: AddedToken("</tool_response>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 171 |
+
151667: AddedToken("<think>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 172 |
+
151668: AddedToken("</think>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 173 |
+
}
|
| 174 |
+
)
|
| 175 |
+
[titan] 2026-01-06 20:23:30,452 - root - INFO - Loading dataset /mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu:default
|
| 176 |
+
`trust_remote_code` is not supported anymore.
|
| 177 |
+
Please check that the Hugging Face dataset '/mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu' isn't based on a loading script and remove `trust_remote_code`.
|
| 178 |
+
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
|
| 179 |
+
[titan] 2026-01-06 20:23:30,452 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
|
| 180 |
+
Please check that the Hugging Face dataset '/mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu' isn't based on a loading script and remove `trust_remote_code`.
|
| 181 |
+
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
|
| 182 |
+
[titan] 2026-01-06 20:23:30,979 - root - INFO - Shuffling the dataset with seed 42
|
| 183 |
+
[titan] 2026-01-06 20:23:30,981 - root - INFO - IterableDataset({
|
| 184 |
+
features: ['text', 'id', 'dump', 'url', 'file_path', 'language', 'language_score', 'token_count', 'score', 'int_score'],
|
| 185 |
+
num_shards: 360
|
| 186 |
+
})
|
| 187 |
+
[titan] 2026-01-06 20:23:30,981 - root - INFO - Building dataloader...
|
| 188 |
+
[titan] 2026-01-06 20:23:30,983 - root - INFO - Loading model config from /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/config.json
|
| 189 |
+
[titan] 2026-01-06 20:23:30,984 - root - INFO - Building model from the config
|
| 190 |
+
[32mGSAConfig {
|
| 191 |
+
"architectures": [
|
| 192 |
+
"GSAForCausalLM"
|
| 193 |
+
],
|
| 194 |
+
"attn": null,
|
| 195 |
+
"bos_token_id": 151643,
|
| 196 |
+
"clamp_max": null,
|
| 197 |
+
"clamp_min": null,
|
| 198 |
+
"conv_size": 4,
|
| 199 |
+
"dtype": "bfloat16",
|
| 200 |
+
"elementwise_affine": false,
|
| 201 |
+
"eos_token_id": 151645,
|
| 202 |
+
"expand_k": 1,
|
| 203 |
+
"expand_v": 1,
|
| 204 |
+
"feature_map": "swish",
|
| 205 |
+
"fuse_cross_entropy": true,
|
| 206 |
+
"fuse_linear_cross_entropy": false,
|
| 207 |
+
"fuse_norm": true,
|
| 208 |
+
"fuse_swiglu": true,
|
| 209 |
+
"gate_logit_normalizer": 8,
|
| 210 |
+
"hidden_act": "swish",
|
| 211 |
+
"hidden_ratio": 4,
|
| 212 |
+
"hidden_size": 5120,
|
| 213 |
+
"initializer_range": 0.02,
|
| 214 |
+
"intermediate_size": 17408,
|
| 215 |
+
"max_position_embeddings": 40960,
|
| 216 |
+
"model_type": "gsa",
|
| 217 |
+
"norm_eps": 1e-06,
|
| 218 |
+
"num_heads": 40,
|
| 219 |
+
"num_hidden_layers": 40,
|
| 220 |
+
"num_kv_heads": 8,
|
| 221 |
+
"num_slots": 256,
|
| 222 |
+
"rope_theta": 1000000,
|
| 223 |
+
"share_conv_kernel": true,
|
| 224 |
+
"tie_word_embeddings": true,
|
| 225 |
+
"transformers_version": "4.57.3",
|
| 226 |
+
"use_cache": true,
|
| 227 |
+
"use_l2warp": false,
|
| 228 |
+
"use_norm": true,
|
| 229 |
+
"use_output_gate": true,
|
| 230 |
+
"use_rope": false,
|
| 231 |
+
"use_short_conv": false,
|
| 232 |
+
"vocab_size": 151936
|
| 233 |
+
}
|
| 234 |
+
[39m
|
| 235 |
+
[titan] 2026-01-06 20:23:31,130 - root - INFO - [34m
|
| 236 |
+
GSAForCausalLM(
|
| 237 |
+
(model): GSAModel(
|
| 238 |
+
(embeddings): Embedding(151936, 5120)
|
| 239 |
+
(layers): ModuleList(
|
| 240 |
+
(0-39): 40 x GSABlock(
|
| 241 |
+
(attn_norm): RMSNorm(5120, eps=1e-06)
|
| 242 |
+
(attn): GatedSlotAttention(
|
| 243 |
+
(feature_map): SwishFeatureMap()
|
| 244 |
+
(q_proj): Linear(in_features=5120, out_features=5120, bias=False)
|
| 245 |
+
(k_proj): Linear(in_features=5120, out_features=1024, bias=False)
|
| 246 |
+
(v_proj): Linear(in_features=5120, out_features=1024, bias=False)
|
| 247 |
+
(f_proj): Linear(in_features=5120, out_features=2048, bias=False)
|
| 248 |
+
(g_norm): RMSNorm(5120, elementwise_affine=False, eps=1e-06)
|
| 249 |
+
(o_proj): Linear(in_features=5120, out_features=5120, bias=False)
|
| 250 |
+
)
|
| 251 |
+
(mlp_norm): RMSNorm(5120, eps=1e-06)
|
| 252 |
+
(mlp): GatedMLP(
|
| 253 |
+
(gate_proj): Linear(in_features=5120, out_features=17408, bias=False)
|
| 254 |
+
(up_proj): Linear(in_features=5120, out_features=17408, bias=False)
|
| 255 |
+
(down_proj): Linear(in_features=17408, out_features=5120, bias=False)
|
| 256 |
+
(swiglu_linear): SwiGLULinear()
|
| 257 |
+
)
|
| 258 |
+
)
|
| 259 |
+
)
|
| 260 |
+
(norm): RMSNorm(5120, eps=1e-06)
|
| 261 |
+
)
|
| 262 |
+
(lm_head): Linear(in_features=5120, out_features=151936, bias=False)
|
| 263 |
+
)[39m
|
| 264 |
+
|
| 265 |
+
[titan] 2026-01-06 20:23:31,187 - root - INFO - Compiling each block with torch.compile
|
| 266 |
+
[titan] 2026-01-06 20:23:31,187 - root - INFO - Compiling the embedding, norm, and lm_head layers with torch.compile
|
| 267 |
+
[titan] 2026-01-06 20:23:31,188 - root - INFO - Compiling the entire model with torch.compile
|
| 268 |
+
[titan] 2026-01-06 20:23:31,352 - root - INFO - Applied FSDP to the model
|
| 269 |
+
[titan] 2026-01-06 20:23:31,724 - root - INFO - CUDA memory usage for model: 3.56GiB(4.49%)
|
| 270 |
+
[titan] 2026-01-06 20:23:31,773 - root - INFO - Checkpointing active. Checkpoints will be loaded from and saved to /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/checkpoint
|
| 271 |
+
[titan] 2026-01-06 20:23:31,773 - root - INFO - Loading the checkpoint from /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/checkpoint/step-1.
|
| 272 |
+
[titan] 2026-01-06 20:24:20,506 - root - INFO - [GC] GC collection for checkpoint loading. 0.04 seconds.
|
| 273 |
+
[titan] 2026-01-06 20:24:20,506 - root - INFO - Finished loading the checkpoint in 48.73 seconds.
|
| 274 |
+
[titan] 2026-01-06 20:24:20,703 - root - INFO - CUDA capacity: NVIDIA A100-SXM4-80GB with 79.25GiB memory
|
| 275 |
+
[titan] 2026-01-06 20:24:20,706 - root - WARNING - Error running lspci: [Errno 2] No such file or directory: 'lspci', fallback to use device_name
|
| 276 |
+
[titan] 2026-01-06 20:24:23,545 - root - INFO - Mixed precision training is handled by fully_shard
|
| 277 |
+
[titan] 2026-01-06 20:24:23,545 - root - INFO - [31m***** Running training *****[39m
|
| 278 |
+
[titan] 2026-01-06 20:24:23,545 - root - INFO - [32m Training starts at step 2
|
| 279 |
+
[titan] 2026-01-06 20:24:23,545 - root - INFO - [32m Number of tokens per sequence = 2,048
|
| 280 |
+
[titan] 2026-01-06 20:24:23,545 - root - INFO - [32m Gradient Accumulation steps = 16
|
| 281 |
+
[titan] 2026-01-06 20:24:23,545 - root - INFO - [32m Instantaneous batch size (per device) = 2
|
| 282 |
+
[titan] 2026-01-06 20:24:23,546 - root - INFO - [32m Global batch size (w. parallel, distributed & accumulation) = 256 (524,288 tokens)
|
| 283 |
+
[titan] 2026-01-06 20:24:23,546 - root - INFO - [32m Total optimization steps = 3,072 (1,610,612,736 tokens)
|
| 284 |
+
[titan] 2026-01-06 20:24:23,546 - root - INFO - [32m Warmup steps = 1,024 (536,870,912 tokens)
|
| 285 |
+
[titan] 2026-01-06 20:24:23,546 - root - INFO - [32m Number of parameters = 14,409,815,040 [39m
|
| 286 |
+
[titan] 2026-01-06 20:24:23,546 - root - INFO - Profiling active. Traces will be saved at /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/profile_trace
|
| 287 |
+
/mnt/scratch/home/carlos/leman-labs/flame/.venv/lib/python3.13/site-packages/torch/_dynamo/variables/functions.py:1692: UserWarning: Dynamo detected a call to a `functools.lru_cache`-wrapped function. Dynamo ignores the cache wrapper and directly traces the wrapped function. Silent incorrectness is only a *potential* risk, not something we have observed. Enable TORCH_LOGS="+dynamo" for a DEBUG stack trace.
|
| 288 |
+
torch._dynamo.utils.warn_once(msg)
|
| 289 |
+
/mnt/scratch/home/carlos/leman-labs/flame/.venv/lib/python3.13/site-packages/torch/_dynamo/variables/functions.py:1598: UserWarning: Dynamo does not know how to trace the builtin `cuda_utils.get_device_properties.` This function is either a Python builtin (e.g. _warnings.warn) or a third-party C/C++ Python extension (perhaps created with pybind).
|
| 290 |
+
If it is a Python builtin, please file an issue on GitHub so the PyTorch team can add support for it and see the next case for a workaround.
|
| 291 |
+
If it is a third-party C/C++ Python extension, please either wrap it into a PyTorch-understood custom operator (see https://pytorch.org/tutorials/advanced/custom_ops_landing_page.html for more details) or, if it is traceable, use `torch.compiler.allow_in_graph`.
|
| 292 |
+
torch._dynamo.utils.warn_once(explanation + "\n" + "\n".join(hints))
|
| 293 |
+
[titan] 2026-01-06 20:31:17,558 - root - INFO - [31mstep: 2 [32mloss: 14.3989 [33mmemory: 71.94GiB(90.77%) [34mtps: 157 [36mtflops: 14.38 [35mmfu: 4.61%[39m
|
| 294 |
+
[titan] 2026-01-06 20:31:17,558 - root - INFO - [34mlr: 1.1719e-06 gnorm: 127.00 [35m[ 0:14:01<14 days, 22:48:26][39m
|
| 295 |
+
[titan] 2026-01-06 20:31:58,854 - root - INFO - [31mstep: 3 [32mloss: 14.3925 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,587 [36mtflops: 145.20 [35mmfu: 46.54%[39m
|
| 296 |
+
[titan] 2026-01-06 20:31:58,854 - root - INFO - [34mlr: 1.5625e-06 gnorm: 126.00 [35m[ 0:14:42<10 days, 10:51:42][39m
|
| 297 |
+
[titan] 2026-01-06 20:32:40,204 - root - INFO - [31mstep: 4 [32mloss: 14.2932 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,585 [36mtflops: 145.01 [35mmfu: 46.48%[39m
|
| 298 |
+
[titan] 2026-01-06 20:32:40,205 - root - INFO - [34mlr: 1.9531e-06 gnorm: 125.50 [35m[ 0:15:24<8 days, 4:53:41][39m
|
| 299 |
+
[titan] 2026-01-06 20:33:21,589 - root - INFO - [31mstep: 5 [32mloss: 14.2679 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,584 [36mtflops: 144.89 [35mmfu: 46.44%[39m
|
| 300 |
+
[titan] 2026-01-06 20:33:21,590 - root - INFO - [34mlr: 2.3438e-06 gnorm: 123.50 [35m[ 0:16:05<6 days, 20:30:57][39m
|
| 301 |
+
[titan] 2026-01-06 20:34:03,035 - root - INFO - [31mstep: 6 [32mloss: 13.9921 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,581 [36mtflops: 144.67 [35mmfu: 46.37%[39m
|
| 302 |
+
[titan] 2026-01-06 20:34:03,035 - root - INFO - [34mlr: 2.7344e-06 gnorm: 117.50 [35m[ 0:16:46<5 days, 22:56:05][39m
|
| 303 |
+
[titan] 2026-01-06 20:34:44,524 - root - INFO - [31mstep: 7 [32mloss: 13.8102 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,580 [36mtflops: 144.52 [35mmfu: 46.32%[39m
|
| 304 |
+
[titan] 2026-01-06 20:34:44,524 - root - INFO - [34mlr: 3.1250e-06 gnorm: 112.50 [35m[ 0:17:28<5 days, 7:31:18][39m
|
| 305 |
+
[titan] 2026-01-06 20:35:25,989 - root - INFO - [31mstep: 8 [32mloss: 13.5609 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,581 [36mtflops: 144.60 [35mmfu: 46.35%[39m
|
| 306 |
+
[titan] 2026-01-06 20:35:25,990 - root - INFO - [34mlr: 3.5156e-06 gnorm: 106.50 [35m[ 0:18:09<4 days, 19:57:23][39m
|
| 307 |
+
[titan] 2026-01-06 20:36:07,480 - root - INFO - [31mstep: 9 [32mloss: 13.3683 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,580 [36mtflops: 144.52 [35mmfu: 46.32%[39m
|
| 308 |
+
[titan] 2026-01-06 20:36:07,481 - root - INFO - [34mlr: 3.9063e-06 gnorm: 101.00 [35m[ 0:18:51<4 days, 10:57:40][39m
|
| 309 |
+
[titan] 2026-01-06 20:36:48,975 - root - INFO - [31mstep: 10 [32mloss: 13.1018 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.50 [35mmfu: 46.32%[39m
|
| 310 |
+
[titan] 2026-01-06 20:36:48,975 - root - INFO - [34mlr: 4.2969e-06 gnorm: 94.00 [35m[ 0:19:32<4 days, 3:45:46][39m
|
| 311 |
+
[titan] 2026-01-06 20:37:30,471 - root - INFO - [31mstep: 11 [32mloss: 12.5407 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.50 [35mmfu: 46.31%[39m
|
| 312 |
+
[titan] 2026-01-06 20:37:30,471 - root - INFO - [34mlr: 4.6875e-06 gnorm: 82.00 [35m[ 0:20:14<3 days, 21:52:17][39m
|
| 313 |
+
[titan] 2026-01-06 20:38:11,960 - root - INFO - [31mstep: 12 [32mloss: 12.0106 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,580 [36mtflops: 144.52 [35mmfu: 46.32%[39m
|
| 314 |
+
[titan] 2026-01-06 20:38:11,960 - root - INFO - [34mlr: 5.0781e-06 gnorm: 71.50 [35m[ 0:20:55<3 days, 16:57:34][39m
|
| 315 |
+
[titan] 2026-01-06 20:38:53,462 - root - INFO - [31mstep: 13 [32mloss: 11.5957 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.48 [35mmfu: 46.31%[39m
|
| 316 |
+
[titan] 2026-01-06 20:38:53,463 - root - INFO - [34mlr: 5.4687e-06 gnorm: 68.00 [35m[ 0:21:37<3 days, 12:48:08][39m
|
| 317 |
+
[titan] 2026-01-06 20:39:34,955 - root - INFO - [31mstep: 14 [32mloss: 11.2380 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.51 [35mmfu: 46.32%[39m
|
| 318 |
+
[titan] 2026-01-06 20:39:34,955 - root - INFO - [34mlr: 5.8594e-06 gnorm: 63.25 [35m[ 0:22:18<3 days, 9:14:12][39m
|
| 319 |
+
[titan] 2026-01-06 20:40:16,456 - root - INFO - [31mstep: 15 [32mloss: 10.9153 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.48 [35mmfu: 46.31%[39m
|
| 320 |
+
[titan] 2026-01-06 20:40:16,457 - root - INFO - [34mlr: 6.2500e-06 gnorm: 55.50 [35m[ 0:23:00<3 days, 6:08:44][39m
|
| 321 |
+
[titan] 2026-01-06 20:40:57,973 - root - INFO - [31mstep: 16 [32mloss: 10.6864 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.42 [35mmfu: 46.29%[39m
|
| 322 |
+
[titan] 2026-01-06 20:40:57,974 - root - INFO - [34mlr: 6.6406e-06 gnorm: 57.00 [35m[ 0:23:41<3 days, 3:26:25][39m
|
| 323 |
+
[titan] 2026-01-06 20:40:57,974 - root - INFO - Saving the checkpoint (or staging if async is enabled).
|
| 324 |
+
[titan] 2026-01-06 20:41:27,686 - root - INFO - [GC] GC collection invoked by checkpointer. 1.24 seconds.
|
| 325 |
+
[titan] 2026-01-06 20:41:27,686 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 29.71 seconds.
|
| 326 |
+
[titan] 2026-01-06 20:42:08,986 - root - INFO - [31mstep: 17 [32mloss: 10.3828 [33mmemory: 71.94GiB(90.77%) [34mtps: 923 [36mtflops: 84.44 [35mmfu: 27.06%[39m
|
| 327 |
+
[titan] 2026-01-06 20:42:08,986 - root - INFO - [34mlr: 7.0313e-06 gnorm: 42.50 [35m[ 0:24:52<3 days, 2:31:27][39m
|
| 328 |
+
[titan] 2026-01-06 20:42:50,422 - root - INFO - [31mstep: 18 [32mloss: 10.1659 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,582 [36mtflops: 144.71 [35mmfu: 46.38%[39m
|
| 329 |
+
[titan] 2026-01-06 20:42:50,422 - root - INFO - [34mlr: 7.4219e-06 gnorm: 32.50 [35m[ 0:25:34<3 days, 0:18:49][39m
|
| 330 |
+
[titan] 2026-01-06 20:43:31,924 - root - INFO - [31mstep: 19 [32mloss: 9.9749 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.48 [35mmfu: 46.31%[39m
|
| 331 |
+
[titan] 2026-01-06 20:43:31,925 - root - INFO - [34mlr: 7.8125e-06 gnorm: 26.88 [35m[ 0:26:15<2 days, 22:20:16][39m
|
| 332 |
+
[titan] 2026-01-06 20:44:13,451 - root - INFO - [31mstep: 20 [32mloss: 9.8084 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.39 [35mmfu: 46.28%[39m
|
| 333 |
+
[titan] 2026-01-06 20:44:13,452 - root - INFO - [34mlr: 8.2031e-06 gnorm: 25.62 [35m[ 0:26:57<2 days, 20:33:33][39m
|
| 334 |
+
[titan] 2026-01-06 20:44:54,968 - root - INFO - [31mstep: 21 [32mloss: 9.6201 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.43 [35mmfu: 46.29%[39m
|
| 335 |
+
[titan] 2026-01-06 20:44:54,968 - root - INFO - [34mlr: 8.5938e-06 gnorm: 26.88 [35m[ 0:27:38<2 days, 18:56:55][39m
|
| 336 |
+
[titan] 2026-01-06 20:45:36,491 - root - INFO - [31mstep: 22 [32mloss: 9.4905 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.40 [35mmfu: 46.28%[39m
|
| 337 |
+
[titan] 2026-01-06 20:45:36,491 - root - INFO - [34mlr: 8.9844e-06 gnorm: 25.50 [35m[ 0:28:20<2 days, 17:29:00][39m
|
| 338 |
+
[titan] 2026-01-06 20:46:18,035 - root - INFO - [31mstep: 23 [32mloss: 9.2526 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.33 [35mmfu: 46.26%[39m
|
| 339 |
+
[titan] 2026-01-06 20:46:18,035 - root - INFO - [34mlr: 9.3750e-06 gnorm: 19.12 [35m[ 0:29:01<2 days, 16:08:44][39m
|
| 340 |
+
[titan] 2026-01-06 20:46:59,563 - root - INFO - [31mstep: 24 [32mloss: 9.0528 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.39 [35mmfu: 46.28%[39m
|
| 341 |
+
[titan] 2026-01-06 20:46:59,563 - root - INFO - [34mlr: 9.7656e-06 gnorm: 17.00 [35m[ 0:29:43<2 days, 14:55:04][39m
|
| 342 |
+
[titan] 2026-01-06 20:47:41,099 - root - INFO - [31mstep: 25 [32mloss: 8.8601 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.36 [35mmfu: 46.27%[39m
|
| 343 |
+
[titan] 2026-01-06 20:47:41,099 - root - INFO - [34mlr: 1.0156e-05 gnorm: 14.06 [35m[ 0:30:25<2 days, 13:47:15][39m
|
| 344 |
+
[titan] 2026-01-06 20:48:22,630 - root - INFO - [31mstep: 26 [32mloss: 8.7360 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.38 [35mmfu: 46.27%[39m
|
| 345 |
+
[titan] 2026-01-06 20:48:22,630 - root - INFO - [34mlr: 1.0547e-05 gnorm: 15.44 [35m[ 0:31:06<2 days, 12:44:35][39m
|
| 346 |
+
[titan] 2026-01-06 20:49:04,178 - root - INFO - [31mstep: 27 [32mloss: 8.6182 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.31 [35mmfu: 46.25%[39m
|
| 347 |
+
[titan] 2026-01-06 20:49:04,179 - root - INFO - [34mlr: 1.0937e-05 gnorm: 10.25 [35m[ 0:31:48<2 days, 11:46:32][39m
|
| 348 |
+
[titan] 2026-01-06 20:49:45,725 - root - INFO - [31mstep: 28 [32mloss: 8.5142 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.32 [35mmfu: 46.26%[39m
|
| 349 |
+
[titan] 2026-01-06 20:49:45,725 - root - INFO - [34mlr: 1.1328e-05 gnorm: 9.00 [35m[ 0:32:29<2 days, 10:52:35][39m
|
| 350 |
+
[titan] 2026-01-06 20:50:27,274 - root - INFO - [31mstep: 29 [32mloss: 8.4770 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.31 [35mmfu: 46.25%[39m
|
| 351 |
+
[titan] 2026-01-06 20:50:27,275 - root - INFO - [34mlr: 1.1719e-05 gnorm: 9.44 [35m[ 0:33:11<2 days, 10:02:19][39m
|
| 352 |
+
[titan] 2026-01-06 20:51:08,813 - root - INFO - [31mstep: 30 [32mloss: 8.3888 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.35 [35mmfu: 46.27%[39m
|
| 353 |
+
[titan] 2026-01-06 20:51:08,813 - root - INFO - [34mlr: 1.2109e-05 gnorm: 7.06 [35m[ 0:33:52<2 days, 9:15:20][39m
|
| 354 |
+
[titan] 2026-01-06 20:51:50,370 - root - INFO - [31mstep: 31 [32mloss: 8.3098 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.29 [35mmfu: 46.25%[39m
|
| 355 |
+
[titan] 2026-01-06 20:51:50,370 - root - INFO - [34mlr: 1.2500e-05 gnorm: 5.38 [35m[ 0:34:34<2 days, 8:31:22][39m
|
| 356 |
+
[titan] 2026-01-06 20:52:31,910 - root - INFO - [31mstep: 32 [32mloss: 8.2507 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.35 [35mmfu: 46.26%[39m
|
| 357 |
+
[titan] 2026-01-06 20:52:31,910 - root - INFO - [34mlr: 1.2891e-05 gnorm: 6.97 [35m[ 0:35:15<2 days, 7:50:04][39m
|
| 358 |
+
[titan] 2026-01-06 20:52:31,910 - root - INFO - Saving the checkpoint (or staging if async is enabled).
|
| 359 |
+
[titan] 2026-01-06 20:52:52,184 - root - INFO - [GC] GC collection invoked by checkpointer. 0.19 seconds.
|
| 360 |
+
[titan] 2026-01-06 20:52:52,185 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 20.27 seconds.
|
| 361 |
+
[titan] 2026-01-06 20:53:33,591 - root - INFO - [31mstep: 33 [32mloss: 8.1782 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,063 [36mtflops: 97.21 [35mmfu: 31.16%[39m
|
| 362 |
+
[titan] 2026-01-06 20:53:33,591 - root - INFO - [34mlr: 1.3281e-05 gnorm: 4.94 [35m[ 0:36:17<2 days, 7:42:09][39m
|
| 363 |
+
[titan] 2026-01-06 20:54:15,059 - root - INFO - [31mstep: 34 [32mloss: 8.1399 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,580 [36mtflops: 144.59 [35mmfu: 46.34%[39m
|
| 364 |
+
[titan] 2026-01-06 20:54:15,059 - root - INFO - [34mlr: 1.3672e-05 gnorm: 4.62 [35m[ 0:36:58<2 days, 7:04:33][39m
|
| 365 |
+
[titan] 2026-01-06 20:54:56,546 - root - INFO - [31mstep: 35 [32mloss: 8.1046 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,580 [36mtflops: 144.53 [35mmfu: 46.32%[39m
|
| 366 |
+
[titan] 2026-01-06 20:54:56,546 - root - INFO - [34mlr: 1.4063e-05 gnorm: 4.69 [35m[ 0:37:40<2 days, 6:29:04][39m
|
| 367 |
+
[titan] 2026-01-06 20:55:38,070 - root - INFO - [31mstep: 36 [32mloss: 8.0122 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.40 [35mmfu: 46.28%[39m
|
| 368 |
+
[titan] 2026-01-06 20:55:38,070 - root - INFO - [34mlr: 1.4453e-05 gnorm: 2.75 [35m[ 0:38:21<2 days, 5:55:35][39m
|
| 369 |
+
[titan] 2026-01-06 20:56:19,603 - root - INFO - [31mstep: 37 [32mloss: 8.0874 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 370 |
+
[titan] 2026-01-06 20:56:19,603 - root - INFO - [34mlr: 1.4844e-05 gnorm: 4.84 [35m[ 0:39:03<2 days, 5:23:52][39m
|
| 371 |
+
[titan] 2026-01-06 20:57:01,137 - root - INFO - [31mstep: 38 [32mloss: 8.0173 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 372 |
+
[titan] 2026-01-06 20:57:01,137 - root - INFO - [34mlr: 1.5234e-05 gnorm: 3.98 [35m[ 0:39:45<2 days, 4:53:48][39m
|
| 373 |
+
[titan] 2026-01-06 20:57:42,670 - root - INFO - [31mstep: 39 [32mloss: 8.0002 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 374 |
+
[titan] 2026-01-06 20:57:42,671 - root - INFO - [34mlr: 1.5625e-05 gnorm: 3.81 [35m[ 0:40:26<2 days, 4:25:14][39m
|
| 375 |
+
[titan] 2026-01-06 20:58:24,204 - root - INFO - [31mstep: 40 [32mloss: 7.9606 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 376 |
+
[titan] 2026-01-06 20:58:24,204 - root - INFO - [34mlr: 1.6016e-05 gnorm: 2.86 [35m[ 0:41:08<2 days, 3:58:04][39m
|
| 377 |
+
[titan] 2026-01-06 20:59:05,739 - root - INFO - [31mstep: 41 [32mloss: 7.9773 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.36 [35mmfu: 46.27%[39m
|
| 378 |
+
[titan] 2026-01-06 20:59:05,739 - root - INFO - [34mlr: 1.6406e-05 gnorm: 3.56 [35m[ 0:41:49<2 days, 3:32:11][39m
|
| 379 |
+
[titan] 2026-01-06 20:59:47,256 - root - INFO - [31mstep: 42 [32mloss: 7.9890 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.43 [35mmfu: 46.29%[39m
|
| 380 |
+
[titan] 2026-01-06 20:59:47,256 - root - INFO - [34mlr: 1.6797e-05 gnorm: 4.75 [35m[ 0:42:31<2 days, 3:07:29][39m
|
| 381 |
+
[titan] 2026-01-06 21:00:28,788 - root - INFO - [31mstep: 43 [32mloss: 7.9018 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 382 |
+
[titan] 2026-01-06 21:00:28,789 - root - INFO - [34mlr: 1.7188e-05 gnorm: 3.48 [35m[ 0:43:12<2 days, 2:43:55][39m
|
| 383 |
+
[titan] 2026-01-06 21:01:10,328 - root - INFO - [31mstep: 44 [32mloss: 7.8441 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.35 [35mmfu: 46.26%[39m
|
| 384 |
+
[titan] 2026-01-06 21:01:10,328 - root - INFO - [34mlr: 1.7578e-05 gnorm: 3.89 [35m[ 0:43:54<2 days, 2:21:24][39m
|
| 385 |
+
[titan] 2026-01-06 21:01:51,868 - root - INFO - [31mstep: 45 [32mloss: 7.8679 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.34 [35mmfu: 46.26%[39m
|
| 386 |
+
[titan] 2026-01-06 21:01:51,869 - root - INFO - [34mlr: 1.7969e-05 gnorm: 6.41 [35m[ 0:44:35<2 days, 1:59:51][39m
|
| 387 |
+
[titan] 2026-01-06 21:02:33,408 - root - INFO - [31mstep: 46 [32mloss: 7.7830 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.35 [35mmfu: 46.26%[39m
|
| 388 |
+
[titan] 2026-01-06 21:02:33,408 - root - INFO - [34mlr: 1.8359e-05 gnorm: 3.52 [35m[ 0:45:17<2 days, 1:39:13][39m
|
| 389 |
+
[titan] 2026-01-06 21:03:14,961 - root - INFO - [31mstep: 47 [32mloss: 7.8372 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.30 [35mmfu: 46.25%[39m
|
| 390 |
+
[titan] 2026-01-06 21:03:14,961 - root - INFO - [34mlr: 1.8750e-05 gnorm: 2.22 [35m[ 0:45:58<2 days, 1:19:26][39m
|
| 391 |
+
[titan] 2026-01-06 21:03:56,497 - root - INFO - [31mstep: 48 [32mloss: 7.8147 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.36 [35mmfu: 46.27%[39m
|
| 392 |
+
[titan] 2026-01-06 21:03:56,497 - root - INFO - [34mlr: 1.9141e-05 gnorm: 3.70 [35m[ 0:46:40<2 days, 1:00:26][39m
|
| 393 |
+
[titan] 2026-01-06 21:03:56,497 - root - INFO - Saving the checkpoint (or staging if async is enabled).
|
| 394 |
+
[titan] 2026-01-06 21:04:16,560 - root - INFO - [GC] GC collection invoked by checkpointer. 0.18 seconds.
|
| 395 |
+
[titan] 2026-01-06 21:04:16,560 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 20.06 seconds.
|
| 396 |
+
[titan] 2026-01-06 21:04:57,970 - root - INFO - [31mstep: 49 [32mloss: 7.6970 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,066 [36mtflops: 97.54 [35mmfu: 31.26%[39m
|
| 397 |
+
[titan] 2026-01-06 21:04:57,970 - root - INFO - [34mlr: 1.9531e-05 gnorm: 5.28 [35m[ 0:47:41<2 days, 1:02:41][39m
|
| 398 |
+
[titan] 2026-01-06 21:04:57,979 - root - INFO - [GC] Peforming periodical GC collection. 0.01 seconds.
|
| 399 |
+
[titan] 2026-01-06 21:05:39,421 - root - INFO - [31mstep: 50 [32mloss: 7.7536 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,581 [36mtflops: 144.65 [35mmfu: 46.36%[39m
|
| 400 |
+
[titan] 2026-01-06 21:05:39,422 - root - INFO - [34mlr: 1.9922e-05 gnorm: 4.06 [35m[ 0:48:23<2 days, 0:44:38][39m
|
| 401 |
+
[titan] 2026-01-06 21:06:20,891 - root - INFO - [31mstep: 51 [32mloss: 7.7578 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,580 [36mtflops: 144.59 [35mmfu: 46.34%[39m
|
| 402 |
+
[titan] 2026-01-06 21:06:20,891 - root - INFO - [34mlr: 2.0313e-05 gnorm: 5.03 [35m[ 0:49:04<2 days, 0:27:16][39m
|
| 403 |
+
[titan] 2026-01-06 21:07:02,402 - root - INFO - [31mstep: 52 [32mloss: 7.7586 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.45 [35mmfu: 46.30%[39m
|
| 404 |
+
[titan] 2026-01-06 21:07:02,402 - root - INFO - [34mlr: 2.0703e-05 gnorm: 2.52 [35m[ 0:49:46<2 days, 0:10:36][39m
|
| 405 |
+
[titan] 2026-01-06 21:07:43,930 - root - INFO - [31mstep: 53 [32mloss: 7.7823 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.38 [35mmfu: 46.28%[39m
|
| 406 |
+
[titan] 2026-01-06 21:07:43,930 - root - INFO - [34mlr: 2.1094e-05 gnorm: 11.69 [35m[ 0:50:27<1 day, 23:54:33][39m
|
| 407 |
+
[titan] 2026-01-06 21:08:25,460 - root - INFO - [31mstep: 54 [32mloss: 7.7454 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.38 [35mmfu: 46.28%[39m
|
| 408 |
+
[titan] 2026-01-06 21:08:25,460 - root - INFO - [34mlr: 2.1484e-05 gnorm: 10.25 [35m[ 0:51:09<1 day, 23:39:04][39m
|
| 409 |
+
[titan] 2026-01-06 21:09:07,002 - root - INFO - [31mstep: 55 [32mloss: 7.6959 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.34 [35mmfu: 46.26%[39m
|
| 410 |
+
[titan] 2026-01-06 21:09:07,002 - root - INFO - [34mlr: 2.1875e-05 gnorm: 3.77 [35m[ 0:51:50<1 day, 23:24:08][39m
|
| 411 |
+
[titan] 2026-01-06 21:09:48,535 - root - INFO - [31mstep: 56 [32mloss: 7.7100 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 412 |
+
[titan] 2026-01-06 21:09:48,536 - root - INFO - [34mlr: 2.2266e-05 gnorm: 5.50 [35m[ 0:52:32<1 day, 23:09:42][39m
|
| 413 |
+
[titan] 2026-01-06 21:10:30,084 - root - INFO - [31mstep: 57 [32mloss: 7.6427 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.32 [35mmfu: 46.26%[39m
|
| 414 |
+
[titan] 2026-01-06 21:10:30,084 - root - INFO - [34mlr: 2.2656e-05 gnorm: 3.45 [35m[ 0:53:14<1 day, 22:55:45][39m
|
| 415 |
+
[titan] 2026-01-06 21:11:11,628 - root - INFO - [31mstep: 58 [32mloss: 7.7081 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.33 [35mmfu: 46.26%[39m
|
| 416 |
+
[titan] 2026-01-06 21:11:11,628 - root - INFO - [34mlr: 2.3047e-05 gnorm: 7.88 [35m[ 0:53:55<1 day, 22:42:16][39m
|
| 417 |
+
[titan] 2026-01-06 21:11:53,169 - root - INFO - [31mstep: 59 [32mloss: 7.6955 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.34 [35mmfu: 46.26%[39m
|
| 418 |
+
[titan] 2026-01-06 21:11:53,170 - root - INFO - [34mlr: 2.3438e-05 gnorm: 7.16 [35m[ 0:54:37<1 day, 22:29:13][39m
|
| 419 |
+
[titan] 2026-01-06 21:12:34,708 - root - INFO - [31mstep: 60 [32mloss: 7.6458 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.35 [35mmfu: 46.27%[39m
|
| 420 |
+
[titan] 2026-01-06 21:12:34,708 - root - INFO - [34mlr: 2.3828e-05 gnorm: 3.22 [35m[ 0:55:18<1 day, 22:16:35][39m
|
| 421 |
+
[titan] 2026-01-06 21:13:16,244 - root - INFO - [31mstep: 61 [32mloss: 7.6709 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.36 [35mmfu: 46.27%[39m
|
| 422 |
+
[titan] 2026-01-06 21:13:16,244 - root - INFO - [34mlr: 2.4219e-05 gnorm: 7.56 [35m[ 0:56:00<1 day, 22:04:19][39m
|
| 423 |
+
[titan] 2026-01-06 21:13:57,793 - root - INFO - [31mstep: 62 [32mloss: 7.6777 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.31 [35mmfu: 46.25%[39m
|
| 424 |
+
[titan] 2026-01-06 21:13:57,793 - root - INFO - [34mlr: 2.4609e-05 gnorm: 5.00 [35m[ 0:56:41<1 day, 21:52:27][39m
|
| 425 |
+
[titan] 2026-01-06 21:14:39,339 - root - INFO - [31mstep: 63 [32mloss: 7.6421 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.32 [35mmfu: 46.26%[39m
|
| 426 |
+
[titan] 2026-01-06 21:14:39,340 - root - INFO - [34mlr: 2.5000e-05 gnorm: 6.81 [35m[ 0:57:23<1 day, 21:40:56][39m
|
| 427 |
+
[titan] 2026-01-06 21:15:20,872 - root - INFO - [31mstep: 64 [32mloss: 7.6401 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 428 |
+
[titan] 2026-01-06 21:15:20,872 - root - INFO - [34mlr: 2.5391e-05 gnorm: 6.72 [35m[ 0:58:04<1 day, 21:29:45][39m
|
| 429 |
+
[titan] 2026-01-06 21:15:20,872 - root - INFO - Saving the checkpoint (or staging if async is enabled).
|
| 430 |
+
[titan] 2026-01-06 21:15:41,932 - root - INFO - [GC] GC collection invoked by checkpointer. 0.17 seconds.
|
| 431 |
+
[titan] 2026-01-06 21:15:41,932 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 21.06 seconds.
|
| 432 |
+
[titan] 2026-01-06 21:16:23,249 - root - INFO - [31mstep: 65 [32mloss: 7.6475 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,051 [36mtflops: 96.13 [35mmfu: 30.81%[39m
|
| 433 |
+
[titan] 2026-01-06 21:16:23,249 - root - INFO - [34mlr: 2.5781e-05 gnorm: 5.00 [35m[ 0:59:07<1 day, 21:34:57][39m
|
| 434 |
+
[titan] 2026-01-06 21:17:04,689 - root - INFO - [31mstep: 66 [32mloss: 7.7008 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,581 [36mtflops: 144.69 [35mmfu: 46.38%[39m
|
| 435 |
+
[titan] 2026-01-06 21:17:04,689 - root - INFO - [34mlr: 2.6172e-05 gnorm: 9.69 [35m[ 0:59:48<1 day, 21:24:04][39m
|
| 436 |
+
[titan] 2026-01-06 21:17:46,152 - root - INFO - [31mstep: 67 [32mloss: 7.6772 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,581 [36mtflops: 144.61 [35mmfu: 46.35%[39m
|
| 437 |
+
[titan] 2026-01-06 21:17:46,153 - root - INFO - [34mlr: 2.6563e-05 gnorm: 8.06 [35m[ 1:00:30<1 day, 21:13:31][39m
|
| 438 |
+
[titan] 2026-01-06 21:18:27,650 - root - INFO - [31mstep: 68 [32mloss: 7.6251 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.49 [35mmfu: 46.31%[39m
|
| 439 |
+
[titan] 2026-01-06 21:18:27,651 - root - INFO - [34mlr: 2.6953e-05 gnorm: 7.88 [35m[ 1:01:11<1 day, 21:03:16][39m
|
| 440 |
+
[titan] 2026-01-06 21:19:09,166 - root - INFO - [31mstep: 69 [32mloss: 7.6183 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.43 [35mmfu: 46.29%[39m
|
| 441 |
+
[titan] 2026-01-06 21:19:09,166 - root - INFO - [34mlr: 2.7344e-05 gnorm: 4.00 [35m[ 1:01:53<1 day, 20:53:19][39m
|
| 442 |
+
[titan] 2026-01-06 21:19:50,686 - root - INFO - [31mstep: 70 [32mloss: 7.6535 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.41 [35mmfu: 46.29%[39m
|
| 443 |
+
[titan] 2026-01-06 21:19:50,687 - root - INFO - [34mlr: 2.7734e-05 gnorm: 17.75 [35m[ 1:02:34<1 day, 20:43:38][39m
|
| 444 |
+
[titan] 2026-01-06 21:20:32,220 - root - INFO - [31mstep: 71 [32mloss: 7.6713 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 445 |
+
[titan] 2026-01-06 21:20:32,221 - root - INFO - [34mlr: 2.8125e-05 gnorm: 15.69 [35m[ 1:03:16<1 day, 20:34:13][39m
|
| 446 |
+
[titan] 2026-01-06 21:21:13,759 - root - INFO - [31mstep: 72 [32mloss: 7.5969 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.35 [35mmfu: 46.27%[39m
|
| 447 |
+
[titan] 2026-01-06 21:21:13,759 - root - INFO - [34mlr: 2.8516e-05 gnorm: 5.00 [35m[ 1:03:57<1 day, 20:25:02][39m
|
| 448 |
+
[titan] 2026-01-06 21:21:55,296 - root - INFO - [31mstep: 73 [32mloss: 7.6514 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.35 [35mmfu: 46.27%[39m
|
| 449 |
+
[titan] 2026-01-06 21:21:55,296 - root - INFO - [34mlr: 2.8906e-05 gnorm: 7.84 [35m[ 1:04:39<1 day, 20:16:06][39m
|
| 450 |
+
[titan] 2026-01-06 21:22:36,834 - root - INFO - [31mstep: 74 [32mloss: 7.6118 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.35 [35mmfu: 46.27%[39m
|
| 451 |
+
[titan] 2026-01-06 21:22:36,834 - root - INFO - [34mlr: 2.9297e-05 gnorm: 5.53 [35m[ 1:05:20<1 day, 20:07:23][39m
|
| 452 |
+
[titan] 2026-01-06 21:23:18,373 - root - INFO - [31mstep: 75 [32mloss: 7.6545 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.35 [35mmfu: 46.26%[39m
|
| 453 |
+
[titan] 2026-01-06 21:23:18,374 - root - INFO - [34mlr: 2.9687e-05 gnorm: 14.88 [35m[ 1:06:02<1 day, 19:58:52][39m
|
| 454 |
+
[titan] 2026-01-06 21:23:59,909 - root - INFO - [31mstep: 76 [32mloss: 7.6091 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.36 [35mmfu: 46.27%[39m
|
| 455 |
+
[titan] 2026-01-06 21:23:59,909 - root - INFO - [34mlr: 3.0078e-05 gnorm: 15.25 [35m[ 1:06:43<1 day, 19:50:34][39m
|
| 456 |
+
[titan] 2026-01-06 21:24:41,441 - root - INFO - [31mstep: 77 [32mloss: 7.5815 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 457 |
+
[titan] 2026-01-06 21:24:41,442 - root - INFO - [34mlr: 3.0469e-05 gnorm: 4.84 [35m[ 1:07:25<1 day, 19:42:28][39m
|
| 458 |
+
[titan] 2026-01-06 21:25:22,983 - root - INFO - [31mstep: 78 [32mloss: 7.6119 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.34 [35mmfu: 46.26%[39m
|
| 459 |
+
[titan] 2026-01-06 21:25:22,983 - root - INFO - [34mlr: 3.0859e-05 gnorm: 9.06 [35m[ 1:08:06<1 day, 19:34:33][39m
|
| 460 |
+
[titan] 2026-01-06 21:26:04,516 - root - INFO - [31mstep: 79 [32mloss: 7.6418 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 461 |
+
[titan] 2026-01-06 21:26:04,516 - root - INFO - [34mlr: 3.1250e-05 gnorm: 8.25 [35m[ 1:08:48<1 day, 19:26:49][39m
|
| 462 |
+
[titan] 2026-01-06 21:26:46,049 - root - INFO - [31mstep: 80 [32mloss: 7.5575 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 463 |
+
[titan] 2026-01-06 21:26:46,049 - root - INFO - [34mlr: 3.1641e-05 gnorm: 6.97 [35m[ 1:09:29<1 day, 19:19:16][39m
|
| 464 |
+
[titan] 2026-01-06 21:26:46,050 - root - INFO - Saving the checkpoint (or staging if async is enabled).
|
| 465 |
+
[titan] 2026-01-06 21:27:08,317 - root - INFO - [GC] GC collection invoked by checkpointer. 0.19 seconds.
|
| 466 |
+
[titan] 2026-01-06 21:27:08,318 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 22.27 seconds.
|
| 467 |
+
[titan] 2026-01-06 21:27:49,686 - root - INFO - [31mstep: 81 [32mloss: 7.6005 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,030 [36mtflops: 94.22 [35mmfu: 30.20%[39m
|
| 468 |
+
[titan] 2026-01-06 21:27:49,686 - root - INFO - [34mlr: 3.2031e-05 gnorm: 7.19 [35m[ 1:10:33<1 day, 19:25:29][39m
|
| 469 |
+
[titan] 2026-01-06 21:28:31,108 - root - INFO - [31mstep: 82 [32mloss: 7.5774 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,582 [36mtflops: 144.76 [35mmfu: 46.40%[39m
|
| 470 |
+
[titan] 2026-01-06 21:28:31,108 - root - INFO - [34mlr: 3.2422e-05 gnorm: 5.62 [35m[ 1:11:15<1 day, 19:18:01][39m
|
| 471 |
+
[titan] 2026-01-06 21:29:12,555 - root - INFO - [31mstep: 83 [32mloss: 7.6207 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,581 [36mtflops: 144.67 [35mmfu: 46.37%[39m
|
| 472 |
+
[titan] 2026-01-06 21:29:12,555 - root - INFO - [34mlr: 3.2813e-05 gnorm: 4.69 [35m[ 1:11:56<1 day, 19:10:44][39m
|
| 473 |
+
[titan] 2026-01-06 21:29:54,024 - root - INFO - [31mstep: 84 [32mloss: 7.5734 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,580 [36mtflops: 144.59 [35mmfu: 46.34%[39m
|
| 474 |
+
[titan] 2026-01-06 21:29:54,024 - root - INFO - [34mlr: 3.3203e-05 gnorm: 10.75 [35m[ 1:12:37<1 day, 19:03:37][39m
|
| 475 |
+
[titan] 2026-01-06 21:30:35,519 - root - INFO - [31mstep: 85 [32mloss: 7.5241 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.50 [35mmfu: 46.31%[39m
|
| 476 |
+
[titan] 2026-01-06 21:30:35,520 - root - INFO - [34mlr: 3.3594e-05 gnorm: 8.69 [35m[ 1:13:19<1 day, 18:56:41][39m
|
| 477 |
+
[titan] 2026-01-06 21:31:17,030 - root - INFO - [31mstep: 86 [32mloss: 7.5827 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.45 [35mmfu: 46.30%[39m
|
| 478 |
+
[titan] 2026-01-06 21:31:17,030 - root - INFO - [34mlr: 3.3984e-05 gnorm: 7.22 [35m[ 1:14:00<1 day, 18:49:53][39m
|
| 479 |
+
[titan] 2026-01-06 21:31:58,543 - root - INFO - [31mstep: 87 [32mloss: 7.5505 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.44 [35mmfu: 46.29%[39m
|
| 480 |
+
[titan] 2026-01-06 21:31:58,543 - root - INFO - [34mlr: 3.4375e-05 gnorm: 7.91 [35m[ 1:14:42<1 day, 18:43:14][39m
|
| 481 |
+
[titan] 2026-01-06 21:32:40,071 - root - INFO - [31mstep: 88 [32mloss: 7.5143 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.39 [35mmfu: 46.28%[39m
|
| 482 |
+
[titan] 2026-01-06 21:32:40,071 - root - INFO - [34mlr: 3.4766e-05 gnorm: 8.00 [35m[ 1:15:23<1 day, 18:36:43][39m
|
| 483 |
+
[titan] 2026-01-06 21:33:21,598 - root - INFO - [31mstep: 89 [32mloss: 7.5199 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.39 [35mmfu: 46.28%[39m
|
| 484 |
+
[titan] 2026-01-06 21:33:21,599 - root - INFO - [34mlr: 3.5156e-05 gnorm: 8.62 [35m[ 1:16:05<1 day, 18:30:21][39m
|
| 485 |
+
[titan] 2026-01-06 21:34:03,122 - root - INFO - [31mstep: 90 [32mloss: 7.4785 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.40 [35mmfu: 46.28%[39m
|
| 486 |
+
[titan] 2026-01-06 21:34:03,122 - root - INFO - [34mlr: 3.5547e-05 gnorm: 8.12 [35m[ 1:16:47<1 day, 18:24:06][39m
|
| 487 |
+
[titan] 2026-01-06 21:34:44,655 - root - INFO - [31mstep: 91 [32mloss: 7.5003 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 488 |
+
[titan] 2026-01-06 21:34:44,655 - root - INFO - [34mlr: 3.5937e-05 gnorm: 6.97 [35m[ 1:17:28<1 day, 18:17:58][39m
|
| 489 |
+
[titan] 2026-01-06 21:35:26,183 - root - INFO - [31mstep: 92 [32mloss: 7.5113 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.39 [35mmfu: 46.28%[39m
|
| 490 |
+
[titan] 2026-01-06 21:35:26,183 - root - INFO - [34mlr: 3.6328e-05 gnorm: 10.19 [35m[ 1:18:10<1 day, 18:11:58][39m
|
| 491 |
+
[titan] 2026-01-06 21:36:07,712 - root - INFO - [31mstep: 93 [32mloss: 7.4875 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.38 [35mmfu: 46.28%[39m
|
| 492 |
+
[titan] 2026-01-06 21:36:07,713 - root - INFO - [34mlr: 3.6719e-05 gnorm: 4.59 [35m[ 1:18:51<1 day, 18:06:04][39m
|
| 493 |
+
[titan] 2026-01-06 21:36:49,202 - root - INFO - [31mstep: 94 [32mloss: 7.8691 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,580 [36mtflops: 144.52 [35mmfu: 46.32%[39m
|
| 494 |
+
[titan] 2026-01-06 21:36:49,202 - root - INFO - [34mlr: 3.7109e-05 gnorm: 86.50 [35m[ 1:19:33<1 day, 18:00:16][39m
|
| 495 |
+
[titan] 2026-01-06 21:37:30,710 - root - INFO - [31mstep: 95 [32mloss: 7.7993 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.45 [35mmfu: 46.30%[39m
|
| 496 |
+
[titan] 2026-01-06 21:37:30,710 - root - INFO - [34mlr: 3.7500e-05 gnorm: 62.50 [35m[ 1:20:14<1 day, 17:54:34][39m
|
| 497 |
+
[titan] 2026-01-06 21:38:12,248 - root - INFO - [31mstep: 96 [32mloss: 7.6230 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.35 [35mmfu: 46.27%[39m
|
| 498 |
+
[titan] 2026-01-06 21:38:12,248 - root - INFO - [34mlr: 3.7891e-05 gnorm: 17.38 [35m[ 1:20:56<1 day, 17:49:00][39m
|
| 499 |
+
[titan] 2026-01-06 21:38:12,248 - root - INFO - Saving the checkpoint (or staging if async is enabled).
|
| 500 |
+
[titan] 2026-01-06 21:38:32,953 - root - INFO - [GC] GC collection invoked by checkpointer. 0.21 seconds.
|
| 501 |
+
[titan] 2026-01-06 21:38:32,954 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 20.71 seconds.
|
| 502 |
+
[titan] 2026-01-06 21:39:14,269 - root - INFO - [31mstep: 97 [32mloss: 7.5778 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,057 [36mtflops: 96.68 [35mmfu: 30.99%[39m
|
| 503 |
+
[titan] 2026-01-06 21:39:14,269 - root - INFO - [34mlr: 3.8281e-05 gnorm: 17.75 [35m[ 1:21:58<1 day, 17:54:00][39m
|
| 504 |
+
[titan] 2026-01-06 21:39:55,690 - root - INFO - [31mstep: 98 [32mloss: 7.5438 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,582 [36mtflops: 144.76 [35mmfu: 46.40%[39m
|
| 505 |
+
[titan] 2026-01-06 21:39:55,690 - root - INFO - [34mlr: 3.8672e-05 gnorm: 11.75 [35m[ 1:22:39<1 day, 17:48:28][39m
|
| 506 |
+
[titan] 2026-01-06 21:40:37,180 - root - INFO - [31mstep: 99 [32mloss: 7.5091 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,580 [36mtflops: 144.52 [35mmfu: 46.32%[39m
|
| 507 |
+
[titan] 2026-01-06 21:40:37,180 - root - INFO - [34mlr: 3.9063e-05 gnorm: 7.81 [35m[ 1:23:21<1 day, 17:43:03][39m
|
| 508 |
+
[titan] 2026-01-06 21:40:37,206 - root - INFO - [GC] Peforming periodical GC collection. 0.03 seconds.
|
| 509 |
+
[titan] 2026-01-06 21:41:18,706 - root - INFO - [31mstep: 100 [32mloss: 7.4961 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.39 [35mmfu: 46.28%[39m
|
| 510 |
+
[titan] 2026-01-06 21:41:18,706 - root - INFO - [34mlr: 3.9453e-05 gnorm: 7.59 [35m[ 1:24:02<1 day, 17:37:46][39m
|
| 511 |
+
[titan] 2026-01-06 21:42:00,228 - root - INFO - [31mstep: 101 [32mloss: 7.4848 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.41 [35mmfu: 46.28%[39m
|
| 512 |
+
[titan] 2026-01-06 21:42:00,228 - root - INFO - [34mlr: 3.9844e-05 gnorm: 5.97 [35m[ 1:24:44<1 day, 17:32:33][39m
|
| 513 |
+
[titan] 2026-01-06 21:42:41,739 - root - INFO - [31mstep: 102 [32mloss: 7.5118 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.45 [35mmfu: 46.30%[39m
|
| 514 |
+
[titan] 2026-01-06 21:42:41,739 - root - INFO - [34mlr: 4.0234e-05 gnorm: 8.06 [35m[ 1:25:25<1 day, 17:27:26][39m
|
| 515 |
+
[titan] 2026-01-06 21:43:23,265 - root - INFO - [31mstep: 103 [32mloss: 7.4788 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.40 [35mmfu: 46.28%[39m
|
| 516 |
+
[titan] 2026-01-06 21:43:23,265 - root - INFO - [34mlr: 4.0625e-05 gnorm: 10.06 [35m[ 1:26:07<1 day, 17:22:24][39m
|
| 517 |
+
[titan] 2026-01-06 21:44:04,785 - root - INFO - [31mstep: 104 [32mloss: 7.4560 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.41 [35mmfu: 46.29%[39m
|
| 518 |
+
[titan] 2026-01-06 21:44:04,786 - root - INFO - [34mlr: 4.1016e-05 gnorm: 9.50 [35m[ 1:26:48<1 day, 17:17:27][39m
|
| 519 |
+
[titan] 2026-01-06 21:44:46,319 - root - INFO - [31mstep: 105 [32mloss: 7.4534 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 520 |
+
[titan] 2026-01-06 21:44:46,319 - root - INFO - [34mlr: 4.1406e-05 gnorm: 8.44 [35m[ 1:27:30<1 day, 17:12:36][39m
|
| 521 |
+
[titan] 2026-01-06 21:45:27,838 - root - INFO - [31mstep: 106 [32mloss: 7.4770 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.42 [35mmfu: 46.29%[39m
|
| 522 |
+
[titan] 2026-01-06 21:45:27,838 - root - INFO - [34mlr: 4.1797e-05 gnorm: 10.56 [35m[ 1:28:11<1 day, 17:07:48][39m
|
| 523 |
+
[titan] 2026-01-06 21:46:09,374 - root - INFO - [31mstep: 107 [32mloss: 7.4382 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.36 [35mmfu: 46.27%[39m
|
| 524 |
+
[titan] 2026-01-06 21:46:09,374 - root - INFO - [34mlr: 4.2188e-05 gnorm: 13.69 [35m[ 1:28:53<1 day, 17:03:06][39m
|
| 525 |
+
[titan] 2026-01-06 21:46:50,902 - root - INFO - [31mstep: 108 [32mloss: 7.4561 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.39 [35mmfu: 46.28%[39m
|
| 526 |
+
[titan] 2026-01-06 21:46:50,902 - root - INFO - [34mlr: 4.2578e-05 gnorm: 8.69 [35m[ 1:29:34<1 day, 16:58:28][39m
|
| 527 |
+
[titan] 2026-01-06 21:47:32,443 - root - INFO - [31mstep: 109 [32mloss: 7.3967 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.34 [35mmfu: 46.26%[39m
|
| 528 |
+
[titan] 2026-01-06 21:47:32,443 - root - INFO - [34mlr: 4.2969e-05 gnorm: 7.31 [35m[ 1:30:16<1 day, 16:53:54][39m
|
| 529 |
+
[titan] 2026-01-06 21:48:13,976 - root - INFO - [31mstep: 110 [32mloss: 7.4334 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 530 |
+
[titan] 2026-01-06 21:48:13,976 - root - INFO - [34mlr: 4.3359e-05 gnorm: 25.38 [35m[ 1:30:57<1 day, 16:49:25][39m
|
| 531 |
+
[titan] 2026-01-06 21:48:55,511 - root - INFO - [31mstep: 111 [32mloss: 7.4360 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.36 [35mmfu: 46.27%[39m
|
| 532 |
+
[titan] 2026-01-06 21:48:55,511 - root - INFO - [34mlr: 4.3750e-05 gnorm: 10.44 [35m[ 1:31:39<1 day, 16:45:00][39m
|
| 533 |
+
[titan] 2026-01-06 21:49:37,059 - root - INFO - [31mstep: 112 [32mloss: 7.5123 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.31 [35mmfu: 46.25%[39m
|
| 534 |
+
[titan] 2026-01-06 21:49:37,060 - root - INFO - [34mlr: 4.4141e-05 gnorm: 16.88 [35m[ 1:32:20<1 day, 16:40:39][39m
|
| 535 |
+
[titan] 2026-01-06 21:49:37,060 - root - INFO - Saving the checkpoint (or staging if async is enabled).
|
| 536 |
+
[titan] 2026-01-06 21:49:59,579 - root - INFO - [GC] GC collection invoked by checkpointer. 0.15 seconds.
|
| 537 |
+
[titan] 2026-01-06 21:49:59,579 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 22.52 seconds.
|
| 538 |
+
[titan] 2026-01-06 21:50:40,891 - root - INFO - [31mstep: 113 [32mloss: 7.4803 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,027 [36mtflops: 93.94 [35mmfu: 30.11%[39m
|
| 539 |
+
[titan] 2026-01-06 21:50:40,892 - root - INFO - [34mlr: 4.4531e-05 gnorm: 13.06 [35m[ 1:33:24<1 day, 16:46:05][39m
|
| 540 |
+
[titan] 2026-01-06 21:51:22,305 - root - INFO - [31mstep: 114 [32mloss: 7.4859 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,582 [36mtflops: 144.79 [35mmfu: 46.41%[39m
|
| 541 |
+
[titan] 2026-01-06 21:51:22,305 - root - INFO - [34mlr: 4.4922e-05 gnorm: 16.50 [35m[ 1:34:06<1 day, 16:41:43][39m
|
| 542 |
+
[titan] 2026-01-06 21:52:03,747 - root - INFO - [31mstep: 115 [32mloss: 7.4151 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,581 [36mtflops: 144.69 [35mmfu: 46.37%[39m
|
| 543 |
+
[titan] 2026-01-06 21:52:03,748 - root - INFO - [34mlr: 4.5313e-05 gnorm: 13.94 [35m[ 1:34:47<1 day, 16:37:26][39m
|
| 544 |
+
[titan] 2026-01-06 21:52:45,252 - root - INFO - [31mstep: 116 [32mloss: 7.3814 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.47 [35mmfu: 46.30%[39m
|
| 545 |
+
[titan] 2026-01-06 21:52:45,252 - root - INFO - [34mlr: 4.5703e-05 gnorm: 11.69 [35m[ 1:35:29<1 day, 16:33:14][39m
|
| 546 |
+
[titan] 2026-01-06 21:53:26,760 - root - INFO - [31mstep: 117 [32mloss: 7.4033 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.46 [35mmfu: 46.30%[39m
|
| 547 |
+
[titan] 2026-01-06 21:53:26,760 - root - INFO - [34mlr: 4.6094e-05 gnorm: 9.31 [35m[ 1:36:10<1 day, 16:29:05][39m
|
| 548 |
+
[titan] 2026-01-06 21:54:08,279 - root - INFO - [31mstep: 118 [32mloss: 7.4721 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.42 [35mmfu: 46.29%[39m
|
| 549 |
+
[titan] 2026-01-06 21:54:08,279 - root - INFO - [34mlr: 4.6484e-05 gnorm: 20.88 [35m[ 1:36:52<1 day, 16:25:01][39m
|
| 550 |
+
[titan] 2026-01-06 21:54:49,813 - root - INFO - [31mstep: 119 [32mloss: 7.4258 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 551 |
+
[titan] 2026-01-06 21:54:49,813 - root - INFO - [34mlr: 4.6875e-05 gnorm: 16.62 [35m[ 1:37:33<1 day, 16:21:00][39m
|
| 552 |
+
[titan] 2026-01-06 21:55:31,360 - root - INFO - [31mstep: 120 [32mloss: 7.3951 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.32 [35mmfu: 46.26%[39m
|
| 553 |
+
[titan] 2026-01-06 21:55:31,360 - root - INFO - [34mlr: 4.7266e-05 gnorm: 11.38 [35m[ 1:38:15<1 day, 16:17:03][39m
|
| 554 |
+
[titan] 2026-01-06 21:56:12,904 - root - INFO - [31mstep: 121 [32mloss: 7.3984 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.33 [35mmfu: 46.26%[39m
|
| 555 |
+
[titan] 2026-01-06 21:56:12,904 - root - INFO - [34mlr: 4.7656e-05 gnorm: 10.19 [35m[ 1:38:56<1 day, 16:13:09][39m
|
| 556 |
+
[titan] 2026-01-06 21:56:54,444 - root - INFO - [31mstep: 122 [32mloss: 7.5098 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.34 [35mmfu: 46.26%[39m
|
| 557 |
+
[titan] 2026-01-06 21:56:54,444 - root - INFO - [34mlr: 4.8047e-05 gnorm: 19.38 [35m[ 1:39:38<1 day, 16:09:18][39m
|
| 558 |
+
[titan] 2026-01-06 21:57:35,983 - root - INFO - [31mstep: 123 [32mloss: 7.4071 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.35 [35mmfu: 46.27%[39m
|
| 559 |
+
[titan] 2026-01-06 21:57:35,983 - root - INFO - [34mlr: 4.8438e-05 gnorm: 13.25 [35m[ 1:40:19<1 day, 16:05:30][39m
|
| 560 |
+
[titan] 2026-01-06 21:58:17,525 - root - INFO - [31mstep: 124 [32mloss: 7.4271 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.34 [35mmfu: 46.26%[39m
|
| 561 |
+
[titan] 2026-01-06 21:58:17,525 - root - INFO - [34mlr: 4.8828e-05 gnorm: 11.88 [35m[ 1:41:01<1 day, 16:01:45][39m
|
| 562 |
+
[titan] 2026-01-06 21:58:59,075 - root - INFO - [31mstep: 125 [32mloss: 7.3603 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.31 [35mmfu: 46.25%[39m
|
| 563 |
+
[titan] 2026-01-06 21:58:59,075 - root - INFO - [34mlr: 4.9219e-05 gnorm: 11.50 [35m[ 1:41:42<1 day, 15:58:03][39m
|
| 564 |
+
[titan] 2026-01-06 21:59:40,619 - root - INFO - [31mstep: 126 [32mloss: 7.3625 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.33 [35mmfu: 46.26%[39m
|
| 565 |
+
[titan] 2026-01-06 21:59:40,619 - root - INFO - [34mlr: 4.9609e-05 gnorm: 9.88 [35m[ 1:42:24<1 day, 15:54:24][39m
|
| 566 |
+
[titan] 2026-01-06 22:00:22,155 - root - INFO - [31mstep: 127 [32mloss: 7.3691 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.36 [35mmfu: 46.27%[39m
|
| 567 |
+
[titan] 2026-01-06 22:00:22,156 - root - INFO - [34mlr: 5.0000e-05 gnorm: 11.88 [35m[ 1:43:06<1 day, 15:50:47][39m
|
| 568 |
+
[titan] 2026-01-06 22:01:03,694 - root - INFO - [31mstep: 128 [32mloss: 7.3331 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.35 [35mmfu: 46.27%[39m
|
| 569 |
+
[titan] 2026-01-06 22:01:03,694 - root - INFO - [34mlr: 5.0391e-05 gnorm: 11.56 [35m[ 1:43:47<1 day, 15:47:14][39m
|
| 570 |
+
[titan] 2026-01-06 22:01:03,694 - root - INFO - Saving the checkpoint (or staging if async is enabled).
|
| 571 |
+
[titan] 2026-01-06 22:01:24,077 - root - INFO - [GC] GC collection invoked by checkpointer. 0.21 seconds.
|
| 572 |
+
[titan] 2026-01-06 22:01:24,078 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 20.38 seconds.
|
| 573 |
+
[titan] 2026-01-06 22:02:05,453 - root - INFO - [31mstep: 129 [32mloss: 7.2878 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,061 [36mtflops: 97.09 [35mmfu: 31.12%[39m
|
| 574 |
+
[titan] 2026-01-06 22:02:05,454 - root - INFO - [34mlr: 5.0781e-05 gnorm: 6.16 [35m[ 1:44:49<1 day, 15:51:24][39m
|
| 575 |
+
[titan] 2026-01-06 22:02:46,875 - root - INFO - [31mstep: 130 [32mloss: 7.7017 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,582 [36mtflops: 144.76 [35mmfu: 46.40%[39m
|
| 576 |
+
[titan] 2026-01-06 22:02:46,876 - root - INFO - [34mlr: 5.1172e-05 gnorm: 70.00 [35m[ 1:45:30<1 day, 15:47:49][39m
|
| 577 |
+
[titan] 2026-01-06 22:03:28,339 - root - INFO - [31mstep: 131 [32mloss: 7.5220 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,581 [36mtflops: 144.61 [35mmfu: 46.35%[39m
|
| 578 |
+
[titan] 2026-01-06 22:03:28,339 - root - INFO - [34mlr: 5.1562e-05 gnorm: 44.75 [35m[ 1:46:12<1 day, 15:44:18][39m
|
| 579 |
+
[titan] 2026-01-06 22:04:09,858 - root - INFO - [31mstep: 132 [32mloss: 7.4566 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.42 [35mmfu: 46.29%[39m
|
| 580 |
+
[titan] 2026-01-06 22:04:09,859 - root - INFO - [34mlr: 5.1953e-05 gnorm: 13.50 [35m[ 1:46:53<1 day, 15:40:51][39m
|
| 581 |
+
[titan] 2026-01-06 22:04:51,387 - root - INFO - [31mstep: 133 [32mloss: 7.4026 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.38 [35mmfu: 46.28%[39m
|
| 582 |
+
[titan] 2026-01-06 22:04:51,387 - root - INFO - [34mlr: 5.2344e-05 gnorm: 10.12 [35m[ 1:47:35<1 day, 15:37:26][39m
|
| 583 |
+
[titan] 2026-01-06 22:05:32,919 - root - INFO - [31mstep: 134 [32mloss: 7.4092 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 584 |
+
[titan] 2026-01-06 22:05:32,920 - root - INFO - [34mlr: 5.2734e-05 gnorm: 14.88 [35m[ 1:48:16<1 day, 15:34:04][39m
|
| 585 |
+
[titan] 2026-01-06 22:06:14,471 - root - INFO - [31mstep: 135 [32mloss: 7.3827 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.30 [35mmfu: 46.25%[39m
|
| 586 |
+
[titan] 2026-01-06 22:06:14,471 - root - INFO - [34mlr: 5.3125e-05 gnorm: 18.88 [35m[ 1:48:58<1 day, 15:30:45][39m
|
| 587 |
+
[titan] 2026-01-06 22:06:56,027 - root - INFO - [31mstep: 136 [32mloss: 7.4021 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.29 [35mmfu: 46.25%[39m
|
| 588 |
+
[titan] 2026-01-06 22:06:56,027 - root - INFO - [34mlr: 5.3516e-05 gnorm: 12.81 [35m[ 1:49:39<1 day, 15:27:28][39m
|
| 589 |
+
[titan] 2026-01-06 22:07:37,581 - root - INFO - [31mstep: 137 [32mloss: 7.4064 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.30 [35mmfu: 46.25%[39m
|
| 590 |
+
[titan] 2026-01-06 22:07:37,581 - root - INFO - [34mlr: 5.3906e-05 gnorm: 7.19 [35m[ 1:50:21<1 day, 15:24:13][39m
|
| 591 |
+
[titan] 2026-01-06 22:08:19,129 - root - INFO - [31mstep: 138 [32mloss: 7.4774 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.32 [35mmfu: 46.26%[39m
|
| 592 |
+
[titan] 2026-01-06 22:08:19,129 - root - INFO - [34mlr: 5.4297e-05 gnorm: 22.62 [35m[ 1:51:03<1 day, 15:21:01][39m
|
| 593 |
+
[titan] 2026-01-06 22:09:00,687 - root - INFO - [31mstep: 139 [32mloss: 7.4281 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.28 [35mmfu: 46.24%[39m
|
| 594 |
+
[titan] 2026-01-06 22:09:00,688 - root - INFO - [34mlr: 5.4688e-05 gnorm: 11.00 [35m[ 1:51:44<1 day, 15:17:51][39m
|
| 595 |
+
[titan] 2026-01-06 22:09:42,228 - root - INFO - [31mstep: 140 [32mloss: 7.5633 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.34 [35mmfu: 46.26%[39m
|
| 596 |
+
[titan] 2026-01-06 22:09:42,228 - root - INFO - [34mlr: 5.5078e-05 gnorm: 19.75 [35m[ 1:52:26<1 day, 15:14:42][39m
|
| 597 |
+
[titan] 2026-01-06 22:10:23,790 - root - INFO - [31mstep: 141 [32mloss: 7.5423 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.27 [35mmfu: 46.24%[39m
|
| 598 |
+
[titan] 2026-01-06 22:10:23,790 - root - INFO - [34mlr: 5.5469e-05 gnorm: 17.25 [35m[ 1:53:07<1 day, 15:11:36][39m
|
| 599 |
+
[titan] 2026-01-06 22:11:05,349 - root - INFO - [31mstep: 142 [32mloss: 7.4047 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.28 [35mmfu: 46.24%[39m
|
| 600 |
+
[titan] 2026-01-06 22:11:05,349 - root - INFO - [34mlr: 5.5859e-05 gnorm: 9.94 [35m[ 1:53:49<1 day, 15:08:32][39m
|
| 601 |
+
[titan] 2026-01-06 22:11:46,904 - root - INFO - [31mstep: 143 [32mloss: 7.5261 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.29 [35mmfu: 46.25%[39m
|
| 602 |
+
[titan] 2026-01-06 22:11:46,905 - root - INFO - [34mlr: 5.6250e-05 gnorm: 25.75 [35m[ 1:54:30<1 day, 15:05:30][39m
|
| 603 |
+
[titan] 2026-01-06 22:12:28,460 - root - INFO - [31mstep: 144 [32mloss: 7.4217 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.29 [35mmfu: 46.25%[39m
|
| 604 |
+
[titan] 2026-01-06 22:12:28,461 - root - INFO - [34mlr: 5.6641e-05 gnorm: 18.00 [35m[ 1:55:12<1 day, 15:02:30][39m
|
| 605 |
+
[titan] 2026-01-06 22:12:28,461 - root - INFO - Saving the checkpoint (or staging if async is enabled).
|
| 606 |
+
[titan] 2026-01-06 22:12:50,165 - root - INFO - [GC] GC collection invoked by checkpointer. 0.18 seconds.
|
| 607 |
+
[titan] 2026-01-06 22:12:50,166 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 21.70 seconds.
|
| 608 |
+
[titan] 2026-01-06 22:13:31,510 - root - INFO - [31mstep: 145 [32mloss: 7.3958 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,039 [36mtflops: 95.10 [35mmfu: 30.48%[39m
|
| 609 |
+
[titan] 2026-01-06 22:13:31,510 - root - INFO - [34mlr: 5.7031e-05 gnorm: 11.69 [35m[ 1:56:15<1 day, 15:06:46][39m
|
| 610 |
+
[titan] 2026-01-06 22:14:12,944 - root - INFO - [31mstep: 146 [32mloss: 7.4073 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,582 [36mtflops: 144.71 [35mmfu: 46.38%[39m
|
| 611 |
+
[titan] 2026-01-06 22:14:12,944 - root - INFO - [34mlr: 5.7422e-05 gnorm: 11.25 [35m[ 1:56:56<1 day, 15:03:44][39m
|
| 612 |
+
[titan] 2026-01-06 22:14:54,370 - root - INFO - [31mstep: 147 [32mloss: 7.3301 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,582 [36mtflops: 144.74 [35mmfu: 46.39%[39m
|
| 613 |
+
[titan] 2026-01-06 22:14:54,371 - root - INFO - [34mlr: 5.7813e-05 gnorm: 7.34 [35m[ 1:57:38<1 day, 15:00:44][39m
|
| 614 |
+
[titan] 2026-01-06 22:15:35,825 - root - INFO - [31mstep: 148 [32mloss: 7.3624 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,581 [36mtflops: 144.64 [35mmfu: 46.36%[39m
|
| 615 |
+
[titan] 2026-01-06 22:15:35,825 - root - INFO - [34mlr: 5.8203e-05 gnorm: 17.38 [35m[ 1:58:19<1 day, 14:57:47][39m
|
| 616 |
+
[titan] 2026-01-06 22:16:17,356 - root - INFO - [31mstep: 149 [32mloss: 7.2913 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 617 |
+
[titan] 2026-01-06 22:16:17,357 - root - INFO - [34mlr: 5.8594e-05 gnorm: 3.80 [35m[ 1:59:01<1 day, 14:54:52][39m
|
| 618 |
+
[titan] 2026-01-06 22:16:17,387 - root - INFO - [GC] Peforming periodical GC collection. 0.03 seconds.
|
| 619 |
+
[titan] 2026-01-06 22:16:58,923 - root - INFO - [31mstep: 150 [32mloss: 7.3146 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.25 [35mmfu: 46.23%[39m
|
| 620 |
+
[titan] 2026-01-06 22:16:58,923 - root - INFO - [34mlr: 5.8984e-05 gnorm: 7.06 [35m[ 1:59:42<1 day, 14:52:00][39m
|
logs/none_4cvjdbqa/attempt_0/6/stdout.log
ADDED
|
File without changes
|
logs/none_4cvjdbqa/attempt_0/7/stderr.log
ADDED
|
@@ -0,0 +1,620 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[titan] 2026-01-06 20:23:28,613 - root - INFO - Starting job: default job
|
| 2 |
+
[titan] 2026-01-06 20:23:28,613 - root - INFO - [32m{
|
| 3 |
+
"activation_checkpoint": {
|
| 4 |
+
"mode": "none",
|
| 5 |
+
"selective_ac_option": "2"
|
| 6 |
+
},
|
| 7 |
+
"activation_offload": {
|
| 8 |
+
"mode": "none"
|
| 9 |
+
},
|
| 10 |
+
"checkpoint": {
|
| 11 |
+
"async_mode": "disabled",
|
| 12 |
+
"create_seed_checkpoint": false,
|
| 13 |
+
"enable_checkpoint": true,
|
| 14 |
+
"exclude_from_loading": [],
|
| 15 |
+
"export_dtype": "float32",
|
| 16 |
+
"folder": "checkpoint",
|
| 17 |
+
"initial_load_model_weights_only": true,
|
| 18 |
+
"initial_load_path": null,
|
| 19 |
+
"interval": 16,
|
| 20 |
+
"interval_type": "steps",
|
| 21 |
+
"keep_latest_k": 0,
|
| 22 |
+
"last_save_model_weights_only": false,
|
| 23 |
+
"load_step": -1,
|
| 24 |
+
"model_weights_only": false
|
| 25 |
+
},
|
| 26 |
+
"comm": {
|
| 27 |
+
"init_timeout_seconds": 300,
|
| 28 |
+
"trace_buf_size": 20000,
|
| 29 |
+
"train_timeout_seconds": 100
|
| 30 |
+
},
|
| 31 |
+
"experimental": {
|
| 32 |
+
"context_parallel_degree": 1,
|
| 33 |
+
"context_parallel_rotate_method": "allgather",
|
| 34 |
+
"custom_model_path": "",
|
| 35 |
+
"enable_async_tensor_parallel": false,
|
| 36 |
+
"enable_compiled_autograd": false,
|
| 37 |
+
"pipeline_parallel_degree": 1,
|
| 38 |
+
"pipeline_parallel_microbatches": null,
|
| 39 |
+
"pipeline_parallel_schedule": "1F1B",
|
| 40 |
+
"pipeline_parallel_schedule_csv": "",
|
| 41 |
+
"pipeline_parallel_split_points": []
|
| 42 |
+
},
|
| 43 |
+
"fault_tolerance": {
|
| 44 |
+
"enable": false,
|
| 45 |
+
"group_size": 0,
|
| 46 |
+
"min_replica_size": 1,
|
| 47 |
+
"replica_id": 0
|
| 48 |
+
},
|
| 49 |
+
"float8": {
|
| 50 |
+
"enable_fsdp_float8_all_gather": false,
|
| 51 |
+
"force_recompute_fp8_weight_in_bwd": false,
|
| 52 |
+
"precompute_float8_dynamic_scale_for_fsdp": false,
|
| 53 |
+
"recipe_name": null
|
| 54 |
+
},
|
| 55 |
+
"job": {
|
| 56 |
+
"config_file": "flame/models/fla.toml",
|
| 57 |
+
"description": "default job",
|
| 58 |
+
"dump_folder": "/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B",
|
| 59 |
+
"print_args": true,
|
| 60 |
+
"use_for_integration_test": false
|
| 61 |
+
},
|
| 62 |
+
"lr_scheduler": {
|
| 63 |
+
"decay_ratio": null,
|
| 64 |
+
"decay_type": "cosine",
|
| 65 |
+
"lr_min": 0.1,
|
| 66 |
+
"warmup_steps": 1024
|
| 67 |
+
},
|
| 68 |
+
"memory_estimation": {
|
| 69 |
+
"disable_fake_mode": false,
|
| 70 |
+
"enabled": false
|
| 71 |
+
},
|
| 72 |
+
"metrics": {
|
| 73 |
+
"disable_color_printing": false,
|
| 74 |
+
"enable_tensorboard": false,
|
| 75 |
+
"enable_wandb": true,
|
| 76 |
+
"log_freq": 1,
|
| 77 |
+
"save_for_all_ranks": false,
|
| 78 |
+
"save_tb_folder": "tb"
|
| 79 |
+
},
|
| 80 |
+
"model": {
|
| 81 |
+
"config": "/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/config.json",
|
| 82 |
+
"converters": [],
|
| 83 |
+
"name": "fla",
|
| 84 |
+
"print_after_conversion": false,
|
| 85 |
+
"tokenizer_path": "/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B"
|
| 86 |
+
},
|
| 87 |
+
"optimizer": {
|
| 88 |
+
"beta1": 0.9,
|
| 89 |
+
"beta2": 0.95,
|
| 90 |
+
"early_step_in_backward": false,
|
| 91 |
+
"eps": 1e-15,
|
| 92 |
+
"implementation": "fused",
|
| 93 |
+
"lr": 0.0004,
|
| 94 |
+
"name": "AdamW",
|
| 95 |
+
"weight_decay": 0.1
|
| 96 |
+
},
|
| 97 |
+
"profiling": {
|
| 98 |
+
"enable_memory_snapshot": false,
|
| 99 |
+
"enable_profiling": true,
|
| 100 |
+
"profile_freq": 512,
|
| 101 |
+
"save_memory_snapshot_folder": "memory_snapshot",
|
| 102 |
+
"save_traces_folder": "profile_trace"
|
| 103 |
+
},
|
| 104 |
+
"training": {
|
| 105 |
+
"batch_size": 2,
|
| 106 |
+
"compile": true,
|
| 107 |
+
"context_len": 2048,
|
| 108 |
+
"data_dir": null,
|
| 109 |
+
"data_files": null,
|
| 110 |
+
"data_parallel_replicate_degree": 1,
|
| 111 |
+
"data_parallel_shard_degree": 8,
|
| 112 |
+
"data_probs": null,
|
| 113 |
+
"dataset": "/mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu",
|
| 114 |
+
"dataset_name": "default",
|
| 115 |
+
"dataset_split": "train",
|
| 116 |
+
"deterministic": false,
|
| 117 |
+
"disable_loss_parallel": true,
|
| 118 |
+
"enable_cpu_offload": false,
|
| 119 |
+
"fsdp_reshard_after_forward": "default",
|
| 120 |
+
"gc_freq": 50,
|
| 121 |
+
"gradient_accumulation_steps": 16,
|
| 122 |
+
"max_norm": 1.0,
|
| 123 |
+
"mixed_precision_param": "bfloat16",
|
| 124 |
+
"mixed_precision_reduce": "float32",
|
| 125 |
+
"num_workers": 8,
|
| 126 |
+
"persistent_workers": false,
|
| 127 |
+
"pin_memory": false,
|
| 128 |
+
"prefetch_factor": 2,
|
| 129 |
+
"seed": 42,
|
| 130 |
+
"seq_len": 2048,
|
| 131 |
+
"skip_nan_inf": true,
|
| 132 |
+
"steps": 3072,
|
| 133 |
+
"streaming": true,
|
| 134 |
+
"tensor_parallel_degree": 1,
|
| 135 |
+
"varlen": false
|
| 136 |
+
}
|
| 137 |
+
}[39m
|
| 138 |
+
[titan] 2026-01-06 20:23:28,614 - root - INFO - [GC] Initial GC collection. 0.00 seconds.
|
| 139 |
+
[titan] 2026-01-06 20:23:29,962 - root - WARNING - ENV[TORCH_NCCL_ASYNC_ERROR_HANDLING] = 1 will be overridden to 3 based on job config
|
| 140 |
+
[titan] 2026-01-06 20:23:29,965 - root - INFO - CUDA capacity: NVIDIA A100-SXM4-80GB with 79.25GiB memory
|
| 141 |
+
[titan] 2026-01-06 20:23:29,967 - root - WARNING - Error running lspci: [Errno 2] No such file or directory: 'lspci', fallback to use device_name
|
| 142 |
+
[titan] 2026-01-06 20:23:29,967 - root - INFO - Peak FLOPS used for computing MFU: 3.120e+14
|
| 143 |
+
[titan] 2026-01-06 20:23:29,967 - root - INFO - Building 1-D device mesh with ['dp_shard'], [8]
|
| 144 |
+
[titan] 2026-01-06 20:23:30,051 - root - INFO - Loading tokenizer...
|
| 145 |
+
The tokenizer you are loading from '/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.
|
| 146 |
+
[titan] 2026-01-06 20:23:30,424 - root - INFO - Qwen2TokenizerFast(name_or_path='/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B', vocab_size=151643, model_max_length=10000000000, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'eos_token': '<|im_end|>', 'pad_token': '<|endoftext|>', 'additional_special_tokens': ['<|im_start|>', '<|im_end|>', '<|object_ref_start|>', '<|object_ref_end|>', '<|box_start|>', '<|box_end|>', '<|quad_start|>', '<|quad_end|>', '<|vision_start|>', '<|vision_end|>', '<|vision_pad|>', '<|image_pad|>', '<|video_pad|>']}, clean_up_tokenization_spaces=False, added_tokens_decoder={
|
| 147 |
+
151643: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 148 |
+
151644: AddedToken("<|im_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 149 |
+
151645: AddedToken("<|im_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 150 |
+
151646: AddedToken("<|object_ref_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 151 |
+
151647: AddedToken("<|object_ref_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 152 |
+
151648: AddedToken("<|box_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 153 |
+
151649: AddedToken("<|box_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 154 |
+
151650: AddedToken("<|quad_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 155 |
+
151651: AddedToken("<|quad_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 156 |
+
151652: AddedToken("<|vision_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 157 |
+
151653: AddedToken("<|vision_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 158 |
+
151654: AddedToken("<|vision_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 159 |
+
151655: AddedToken("<|image_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 160 |
+
151656: AddedToken("<|video_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 161 |
+
151657: AddedToken("<tool_call>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 162 |
+
151658: AddedToken("</tool_call>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 163 |
+
151659: AddedToken("<|fim_prefix|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 164 |
+
151660: AddedToken("<|fim_middle|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 165 |
+
151661: AddedToken("<|fim_suffix|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 166 |
+
151662: AddedToken("<|fim_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 167 |
+
151663: AddedToken("<|repo_name|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 168 |
+
151664: AddedToken("<|file_sep|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 169 |
+
151665: AddedToken("<tool_response>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 170 |
+
151666: AddedToken("</tool_response>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 171 |
+
151667: AddedToken("<think>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 172 |
+
151668: AddedToken("</think>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 173 |
+
}
|
| 174 |
+
)
|
| 175 |
+
[titan] 2026-01-06 20:23:30,424 - root - INFO - Loading dataset /mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu:default
|
| 176 |
+
`trust_remote_code` is not supported anymore.
|
| 177 |
+
Please check that the Hugging Face dataset '/mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu' isn't based on a loading script and remove `trust_remote_code`.
|
| 178 |
+
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
|
| 179 |
+
[titan] 2026-01-06 20:23:30,424 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
|
| 180 |
+
Please check that the Hugging Face dataset '/mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu' isn't based on a loading script and remove `trust_remote_code`.
|
| 181 |
+
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
|
| 182 |
+
[titan] 2026-01-06 20:23:30,977 - root - INFO - Shuffling the dataset with seed 42
|
| 183 |
+
[titan] 2026-01-06 20:23:30,978 - root - INFO - IterableDataset({
|
| 184 |
+
features: ['text', 'id', 'dump', 'url', 'file_path', 'language', 'language_score', 'token_count', 'score', 'int_score'],
|
| 185 |
+
num_shards: 360
|
| 186 |
+
})
|
| 187 |
+
[titan] 2026-01-06 20:23:30,978 - root - INFO - Building dataloader...
|
| 188 |
+
[titan] 2026-01-06 20:23:30,980 - root - INFO - Loading model config from /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/config.json
|
| 189 |
+
[titan] 2026-01-06 20:23:30,981 - root - INFO - Building model from the config
|
| 190 |
+
[32mGSAConfig {
|
| 191 |
+
"architectures": [
|
| 192 |
+
"GSAForCausalLM"
|
| 193 |
+
],
|
| 194 |
+
"attn": null,
|
| 195 |
+
"bos_token_id": 151643,
|
| 196 |
+
"clamp_max": null,
|
| 197 |
+
"clamp_min": null,
|
| 198 |
+
"conv_size": 4,
|
| 199 |
+
"dtype": "bfloat16",
|
| 200 |
+
"elementwise_affine": false,
|
| 201 |
+
"eos_token_id": 151645,
|
| 202 |
+
"expand_k": 1,
|
| 203 |
+
"expand_v": 1,
|
| 204 |
+
"feature_map": "swish",
|
| 205 |
+
"fuse_cross_entropy": true,
|
| 206 |
+
"fuse_linear_cross_entropy": false,
|
| 207 |
+
"fuse_norm": true,
|
| 208 |
+
"fuse_swiglu": true,
|
| 209 |
+
"gate_logit_normalizer": 8,
|
| 210 |
+
"hidden_act": "swish",
|
| 211 |
+
"hidden_ratio": 4,
|
| 212 |
+
"hidden_size": 5120,
|
| 213 |
+
"initializer_range": 0.02,
|
| 214 |
+
"intermediate_size": 17408,
|
| 215 |
+
"max_position_embeddings": 40960,
|
| 216 |
+
"model_type": "gsa",
|
| 217 |
+
"norm_eps": 1e-06,
|
| 218 |
+
"num_heads": 40,
|
| 219 |
+
"num_hidden_layers": 40,
|
| 220 |
+
"num_kv_heads": 8,
|
| 221 |
+
"num_slots": 256,
|
| 222 |
+
"rope_theta": 1000000,
|
| 223 |
+
"share_conv_kernel": true,
|
| 224 |
+
"tie_word_embeddings": true,
|
| 225 |
+
"transformers_version": "4.57.3",
|
| 226 |
+
"use_cache": true,
|
| 227 |
+
"use_l2warp": false,
|
| 228 |
+
"use_norm": true,
|
| 229 |
+
"use_output_gate": true,
|
| 230 |
+
"use_rope": false,
|
| 231 |
+
"use_short_conv": false,
|
| 232 |
+
"vocab_size": 151936
|
| 233 |
+
}
|
| 234 |
+
[39m
|
| 235 |
+
[titan] 2026-01-06 20:23:31,128 - root - INFO - [34m
|
| 236 |
+
GSAForCausalLM(
|
| 237 |
+
(model): GSAModel(
|
| 238 |
+
(embeddings): Embedding(151936, 5120)
|
| 239 |
+
(layers): ModuleList(
|
| 240 |
+
(0-39): 40 x GSABlock(
|
| 241 |
+
(attn_norm): RMSNorm(5120, eps=1e-06)
|
| 242 |
+
(attn): GatedSlotAttention(
|
| 243 |
+
(feature_map): SwishFeatureMap()
|
| 244 |
+
(q_proj): Linear(in_features=5120, out_features=5120, bias=False)
|
| 245 |
+
(k_proj): Linear(in_features=5120, out_features=1024, bias=False)
|
| 246 |
+
(v_proj): Linear(in_features=5120, out_features=1024, bias=False)
|
| 247 |
+
(f_proj): Linear(in_features=5120, out_features=2048, bias=False)
|
| 248 |
+
(g_norm): RMSNorm(5120, elementwise_affine=False, eps=1e-06)
|
| 249 |
+
(o_proj): Linear(in_features=5120, out_features=5120, bias=False)
|
| 250 |
+
)
|
| 251 |
+
(mlp_norm): RMSNorm(5120, eps=1e-06)
|
| 252 |
+
(mlp): GatedMLP(
|
| 253 |
+
(gate_proj): Linear(in_features=5120, out_features=17408, bias=False)
|
| 254 |
+
(up_proj): Linear(in_features=5120, out_features=17408, bias=False)
|
| 255 |
+
(down_proj): Linear(in_features=17408, out_features=5120, bias=False)
|
| 256 |
+
(swiglu_linear): SwiGLULinear()
|
| 257 |
+
)
|
| 258 |
+
)
|
| 259 |
+
)
|
| 260 |
+
(norm): RMSNorm(5120, eps=1e-06)
|
| 261 |
+
)
|
| 262 |
+
(lm_head): Linear(in_features=5120, out_features=151936, bias=False)
|
| 263 |
+
)[39m
|
| 264 |
+
|
| 265 |
+
[titan] 2026-01-06 20:23:31,189 - root - INFO - Compiling each block with torch.compile
|
| 266 |
+
[titan] 2026-01-06 20:23:31,189 - root - INFO - Compiling the embedding, norm, and lm_head layers with torch.compile
|
| 267 |
+
[titan] 2026-01-06 20:23:31,190 - root - INFO - Compiling the entire model with torch.compile
|
| 268 |
+
[titan] 2026-01-06 20:23:31,333 - root - INFO - Applied FSDP to the model
|
| 269 |
+
[titan] 2026-01-06 20:23:31,724 - root - INFO - CUDA memory usage for model: 3.56GiB(4.49%)
|
| 270 |
+
[titan] 2026-01-06 20:23:31,773 - root - INFO - Checkpointing active. Checkpoints will be loaded from and saved to /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/checkpoint
|
| 271 |
+
[titan] 2026-01-06 20:23:31,774 - root - INFO - Loading the checkpoint from /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/checkpoint/step-1.
|
| 272 |
+
[titan] 2026-01-06 20:24:20,476 - root - INFO - [GC] GC collection for checkpoint loading. 0.03 seconds.
|
| 273 |
+
[titan] 2026-01-06 20:24:20,476 - root - INFO - Finished loading the checkpoint in 48.70 seconds.
|
| 274 |
+
[titan] 2026-01-06 20:24:20,701 - root - INFO - CUDA capacity: NVIDIA A100-SXM4-80GB with 79.25GiB memory
|
| 275 |
+
[titan] 2026-01-06 20:24:20,704 - root - WARNING - Error running lspci: [Errno 2] No such file or directory: 'lspci', fallback to use device_name
|
| 276 |
+
[titan] 2026-01-06 20:24:23,285 - root - INFO - Mixed precision training is handled by fully_shard
|
| 277 |
+
[titan] 2026-01-06 20:24:23,286 - root - INFO - [31m***** Running training *****[39m
|
| 278 |
+
[titan] 2026-01-06 20:24:23,286 - root - INFO - [32m Training starts at step 2
|
| 279 |
+
[titan] 2026-01-06 20:24:23,286 - root - INFO - [32m Number of tokens per sequence = 2,048
|
| 280 |
+
[titan] 2026-01-06 20:24:23,286 - root - INFO - [32m Gradient Accumulation steps = 16
|
| 281 |
+
[titan] 2026-01-06 20:24:23,286 - root - INFO - [32m Instantaneous batch size (per device) = 2
|
| 282 |
+
[titan] 2026-01-06 20:24:23,286 - root - INFO - [32m Global batch size (w. parallel, distributed & accumulation) = 256 (524,288 tokens)
|
| 283 |
+
[titan] 2026-01-06 20:24:23,286 - root - INFO - [32m Total optimization steps = 3,072 (1,610,612,736 tokens)
|
| 284 |
+
[titan] 2026-01-06 20:24:23,286 - root - INFO - [32m Warmup steps = 1,024 (536,870,912 tokens)
|
| 285 |
+
[titan] 2026-01-06 20:24:23,286 - root - INFO - [32m Number of parameters = 14,409,815,040 [39m
|
| 286 |
+
[titan] 2026-01-06 20:24:23,286 - root - INFO - Profiling active. Traces will be saved at /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/profile_trace
|
| 287 |
+
/mnt/scratch/home/carlos/leman-labs/flame/.venv/lib/python3.13/site-packages/torch/_dynamo/variables/functions.py:1692: UserWarning: Dynamo detected a call to a `functools.lru_cache`-wrapped function. Dynamo ignores the cache wrapper and directly traces the wrapped function. Silent incorrectness is only a *potential* risk, not something we have observed. Enable TORCH_LOGS="+dynamo" for a DEBUG stack trace.
|
| 288 |
+
torch._dynamo.utils.warn_once(msg)
|
| 289 |
+
/mnt/scratch/home/carlos/leman-labs/flame/.venv/lib/python3.13/site-packages/torch/_dynamo/variables/functions.py:1598: UserWarning: Dynamo does not know how to trace the builtin `cuda_utils.get_device_properties.` This function is either a Python builtin (e.g. _warnings.warn) or a third-party C/C++ Python extension (perhaps created with pybind).
|
| 290 |
+
If it is a Python builtin, please file an issue on GitHub so the PyTorch team can add support for it and see the next case for a workaround.
|
| 291 |
+
If it is a third-party C/C++ Python extension, please either wrap it into a PyTorch-understood custom operator (see https://pytorch.org/tutorials/advanced/custom_ops_landing_page.html for more details) or, if it is traceable, use `torch.compiler.allow_in_graph`.
|
| 292 |
+
torch._dynamo.utils.warn_once(explanation + "\n" + "\n".join(hints))
|
| 293 |
+
[titan] 2026-01-06 20:31:17,558 - root - INFO - [31mstep: 2 [32mloss: 14.3989 [33mmemory: 71.94GiB(90.77%) [34mtps: 157 [36mtflops: 14.38 [35mmfu: 4.61%[39m
|
| 294 |
+
[titan] 2026-01-06 20:31:17,558 - root - INFO - [34mlr: 1.1719e-06 gnorm: 127.00 [35m[ 0:14:01<14 days, 22:48:29][39m
|
| 295 |
+
[titan] 2026-01-06 20:31:58,854 - root - INFO - [31mstep: 3 [32mloss: 14.3925 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,587 [36mtflops: 145.20 [35mmfu: 46.54%[39m
|
| 296 |
+
[titan] 2026-01-06 20:31:58,854 - root - INFO - [34mlr: 1.5625e-06 gnorm: 126.00 [35m[ 0:14:42<10 days, 10:51:44][39m
|
| 297 |
+
[titan] 2026-01-06 20:32:40,204 - root - INFO - [31mstep: 4 [32mloss: 14.2932 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,585 [36mtflops: 145.01 [35mmfu: 46.48%[39m
|
| 298 |
+
[titan] 2026-01-06 20:32:40,205 - root - INFO - [34mlr: 1.9531e-06 gnorm: 125.50 [35m[ 0:15:24<8 days, 4:53:43][39m
|
| 299 |
+
[titan] 2026-01-06 20:33:21,589 - root - INFO - [31mstep: 5 [32mloss: 14.2679 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,584 [36mtflops: 144.89 [35mmfu: 46.44%[39m
|
| 300 |
+
[titan] 2026-01-06 20:33:21,590 - root - INFO - [34mlr: 2.3438e-06 gnorm: 123.50 [35m[ 0:16:05<6 days, 20:30:59][39m
|
| 301 |
+
[titan] 2026-01-06 20:34:03,035 - root - INFO - [31mstep: 6 [32mloss: 13.9921 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,581 [36mtflops: 144.67 [35mmfu: 46.37%[39m
|
| 302 |
+
[titan] 2026-01-06 20:34:03,035 - root - INFO - [34mlr: 2.7344e-06 gnorm: 117.50 [35m[ 0:16:46<5 days, 22:56:06][39m
|
| 303 |
+
[titan] 2026-01-06 20:34:44,524 - root - INFO - [31mstep: 7 [32mloss: 13.8102 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,580 [36mtflops: 144.52 [35mmfu: 46.32%[39m
|
| 304 |
+
[titan] 2026-01-06 20:34:44,524 - root - INFO - [34mlr: 3.1250e-06 gnorm: 112.50 [35m[ 0:17:28<5 days, 7:31:19][39m
|
| 305 |
+
[titan] 2026-01-06 20:35:25,989 - root - INFO - [31mstep: 8 [32mloss: 13.5609 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,581 [36mtflops: 144.60 [35mmfu: 46.35%[39m
|
| 306 |
+
[titan] 2026-01-06 20:35:25,990 - root - INFO - [34mlr: 3.5156e-06 gnorm: 106.50 [35m[ 0:18:09<4 days, 19:57:24][39m
|
| 307 |
+
[titan] 2026-01-06 20:36:07,480 - root - INFO - [31mstep: 9 [32mloss: 13.3683 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,580 [36mtflops: 144.52 [35mmfu: 46.32%[39m
|
| 308 |
+
[titan] 2026-01-06 20:36:07,480 - root - INFO - [34mlr: 3.9063e-06 gnorm: 101.00 [35m[ 0:18:51<4 days, 10:57:41][39m
|
| 309 |
+
[titan] 2026-01-06 20:36:48,975 - root - INFO - [31mstep: 10 [32mloss: 13.1018 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.50 [35mmfu: 46.32%[39m
|
| 310 |
+
[titan] 2026-01-06 20:36:48,975 - root - INFO - [34mlr: 4.2969e-06 gnorm: 94.00 [35m[ 0:19:32<4 days, 3:45:47][39m
|
| 311 |
+
[titan] 2026-01-06 20:37:30,471 - root - INFO - [31mstep: 11 [32mloss: 12.5407 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.50 [35mmfu: 46.31%[39m
|
| 312 |
+
[titan] 2026-01-06 20:37:30,471 - root - INFO - [34mlr: 4.6875e-06 gnorm: 82.00 [35m[ 0:20:14<3 days, 21:52:18][39m
|
| 313 |
+
[titan] 2026-01-06 20:38:11,960 - root - INFO - [31mstep: 12 [32mloss: 12.0106 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,580 [36mtflops: 144.52 [35mmfu: 46.32%[39m
|
| 314 |
+
[titan] 2026-01-06 20:38:11,961 - root - INFO - [34mlr: 5.0781e-06 gnorm: 71.50 [35m[ 0:20:55<3 days, 16:57:35][39m
|
| 315 |
+
[titan] 2026-01-06 20:38:53,463 - root - INFO - [31mstep: 13 [32mloss: 11.5957 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.48 [35mmfu: 46.31%[39m
|
| 316 |
+
[titan] 2026-01-06 20:38:53,463 - root - INFO - [34mlr: 5.4687e-06 gnorm: 68.00 [35m[ 0:21:37<3 days, 12:48:09][39m
|
| 317 |
+
[titan] 2026-01-06 20:39:34,955 - root - INFO - [31mstep: 14 [32mloss: 11.2380 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.51 [35mmfu: 46.32%[39m
|
| 318 |
+
[titan] 2026-01-06 20:39:34,955 - root - INFO - [34mlr: 5.8594e-06 gnorm: 63.25 [35m[ 0:22:18<3 days, 9:14:13][39m
|
| 319 |
+
[titan] 2026-01-06 20:40:16,456 - root - INFO - [31mstep: 15 [32mloss: 10.9153 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.48 [35mmfu: 46.31%[39m
|
| 320 |
+
[titan] 2026-01-06 20:40:16,457 - root - INFO - [34mlr: 6.2500e-06 gnorm: 55.50 [35m[ 0:23:00<3 days, 6:08:45][39m
|
| 321 |
+
[titan] 2026-01-06 20:40:57,974 - root - INFO - [31mstep: 16 [32mloss: 10.6864 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.42 [35mmfu: 46.29%[39m
|
| 322 |
+
[titan] 2026-01-06 20:40:57,974 - root - INFO - [34mlr: 6.6406e-06 gnorm: 57.00 [35m[ 0:23:41<3 days, 3:26:25][39m
|
| 323 |
+
[titan] 2026-01-06 20:40:57,974 - root - INFO - Saving the checkpoint (or staging if async is enabled).
|
| 324 |
+
[titan] 2026-01-06 20:41:27,201 - root - INFO - [GC] GC collection invoked by checkpointer. 0.76 seconds.
|
| 325 |
+
[titan] 2026-01-06 20:41:27,201 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 29.23 seconds.
|
| 326 |
+
[titan] 2026-01-06 20:42:08,985 - root - INFO - [31mstep: 17 [32mloss: 10.3828 [33mmemory: 71.94GiB(90.77%) [34mtps: 923 [36mtflops: 84.44 [35mmfu: 27.06%[39m
|
| 327 |
+
[titan] 2026-01-06 20:42:08,986 - root - INFO - [34mlr: 7.0313e-06 gnorm: 42.50 [35m[ 0:24:52<3 days, 2:31:27][39m
|
| 328 |
+
[titan] 2026-01-06 20:42:50,422 - root - INFO - [31mstep: 18 [32mloss: 10.1659 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,582 [36mtflops: 144.71 [35mmfu: 46.38%[39m
|
| 329 |
+
[titan] 2026-01-06 20:42:50,422 - root - INFO - [34mlr: 7.4219e-06 gnorm: 32.50 [35m[ 0:25:34<3 days, 0:18:50][39m
|
| 330 |
+
[titan] 2026-01-06 20:43:31,924 - root - INFO - [31mstep: 19 [32mloss: 9.9749 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.47 [35mmfu: 46.31%[39m
|
| 331 |
+
[titan] 2026-01-06 20:43:31,924 - root - INFO - [34mlr: 7.8125e-06 gnorm: 26.88 [35m[ 0:26:15<2 days, 22:20:16][39m
|
| 332 |
+
[titan] 2026-01-06 20:44:13,451 - root - INFO - [31mstep: 20 [32mloss: 9.8084 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.39 [35mmfu: 46.28%[39m
|
| 333 |
+
[titan] 2026-01-06 20:44:13,451 - root - INFO - [34mlr: 8.2031e-06 gnorm: 25.62 [35m[ 0:26:57<2 days, 20:33:33][39m
|
| 334 |
+
[titan] 2026-01-06 20:44:54,968 - root - INFO - [31mstep: 21 [32mloss: 9.6201 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.43 [35mmfu: 46.29%[39m
|
| 335 |
+
[titan] 2026-01-06 20:44:54,968 - root - INFO - [34mlr: 8.5938e-06 gnorm: 26.88 [35m[ 0:27:38<2 days, 18:56:55][39m
|
| 336 |
+
[titan] 2026-01-06 20:45:36,491 - root - INFO - [31mstep: 22 [32mloss: 9.4905 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.40 [35mmfu: 46.28%[39m
|
| 337 |
+
[titan] 2026-01-06 20:45:36,491 - root - INFO - [34mlr: 8.9844e-06 gnorm: 25.50 [35m[ 0:28:20<2 days, 17:29:01][39m
|
| 338 |
+
[titan] 2026-01-06 20:46:18,035 - root - INFO - [31mstep: 23 [32mloss: 9.2526 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.33 [35mmfu: 46.26%[39m
|
| 339 |
+
[titan] 2026-01-06 20:46:18,035 - root - INFO - [34mlr: 9.3750e-06 gnorm: 19.12 [35m[ 0:29:01<2 days, 16:08:45][39m
|
| 340 |
+
[titan] 2026-01-06 20:46:59,563 - root - INFO - [31mstep: 24 [32mloss: 9.0528 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.39 [35mmfu: 46.28%[39m
|
| 341 |
+
[titan] 2026-01-06 20:46:59,563 - root - INFO - [34mlr: 9.7656e-06 gnorm: 17.00 [35m[ 0:29:43<2 days, 14:55:04][39m
|
| 342 |
+
[titan] 2026-01-06 20:47:41,099 - root - INFO - [31mstep: 25 [32mloss: 8.8601 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.36 [35mmfu: 46.27%[39m
|
| 343 |
+
[titan] 2026-01-06 20:47:41,099 - root - INFO - [34mlr: 1.0156e-05 gnorm: 14.06 [35m[ 0:30:25<2 days, 13:47:15][39m
|
| 344 |
+
[titan] 2026-01-06 20:48:22,630 - root - INFO - [31mstep: 26 [32mloss: 8.7360 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.38 [35mmfu: 46.27%[39m
|
| 345 |
+
[titan] 2026-01-06 20:48:22,630 - root - INFO - [34mlr: 1.0547e-05 gnorm: 15.44 [35m[ 0:31:06<2 days, 12:44:35][39m
|
| 346 |
+
[titan] 2026-01-06 20:49:04,178 - root - INFO - [31mstep: 27 [32mloss: 8.6182 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.31 [35mmfu: 46.25%[39m
|
| 347 |
+
[titan] 2026-01-06 20:49:04,179 - root - INFO - [34mlr: 1.0937e-05 gnorm: 10.25 [35m[ 0:31:48<2 days, 11:46:32][39m
|
| 348 |
+
[titan] 2026-01-06 20:49:45,725 - root - INFO - [31mstep: 28 [32mloss: 8.5142 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.32 [35mmfu: 46.26%[39m
|
| 349 |
+
[titan] 2026-01-06 20:49:45,725 - root - INFO - [34mlr: 1.1328e-05 gnorm: 9.00 [35m[ 0:32:29<2 days, 10:52:36][39m
|
| 350 |
+
[titan] 2026-01-06 20:50:27,274 - root - INFO - [31mstep: 29 [32mloss: 8.4770 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.31 [35mmfu: 46.25%[39m
|
| 351 |
+
[titan] 2026-01-06 20:50:27,274 - root - INFO - [34mlr: 1.1719e-05 gnorm: 9.44 [35m[ 0:33:11<2 days, 10:02:19][39m
|
| 352 |
+
[titan] 2026-01-06 20:51:08,813 - root - INFO - [31mstep: 30 [32mloss: 8.3888 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.35 [35mmfu: 46.27%[39m
|
| 353 |
+
[titan] 2026-01-06 20:51:08,813 - root - INFO - [34mlr: 1.2109e-05 gnorm: 7.06 [35m[ 0:33:52<2 days, 9:15:20][39m
|
| 354 |
+
[titan] 2026-01-06 20:51:50,370 - root - INFO - [31mstep: 31 [32mloss: 8.3098 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.29 [35mmfu: 46.25%[39m
|
| 355 |
+
[titan] 2026-01-06 20:51:50,370 - root - INFO - [34mlr: 1.2500e-05 gnorm: 5.38 [35m[ 0:34:34<2 days, 8:31:22][39m
|
| 356 |
+
[titan] 2026-01-06 20:52:31,909 - root - INFO - [31mstep: 32 [32mloss: 8.2507 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.35 [35mmfu: 46.26%[39m
|
| 357 |
+
[titan] 2026-01-06 20:52:31,910 - root - INFO - [34mlr: 1.2891e-05 gnorm: 6.97 [35m[ 0:35:15<2 days, 7:50:05][39m
|
| 358 |
+
[titan] 2026-01-06 20:52:31,910 - root - INFO - Saving the checkpoint (or staging if async is enabled).
|
| 359 |
+
[titan] 2026-01-06 20:52:52,213 - root - INFO - [GC] GC collection invoked by checkpointer. 0.21 seconds.
|
| 360 |
+
[titan] 2026-01-06 20:52:52,213 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 20.30 seconds.
|
| 361 |
+
[titan] 2026-01-06 20:53:33,591 - root - INFO - [31mstep: 33 [32mloss: 8.1782 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,063 [36mtflops: 97.21 [35mmfu: 31.16%[39m
|
| 362 |
+
[titan] 2026-01-06 20:53:33,591 - root - INFO - [34mlr: 1.3281e-05 gnorm: 4.94 [35m[ 0:36:17<2 days, 7:42:10][39m
|
| 363 |
+
[titan] 2026-01-06 20:54:15,059 - root - INFO - [31mstep: 34 [32mloss: 8.1399 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,580 [36mtflops: 144.59 [35mmfu: 46.34%[39m
|
| 364 |
+
[titan] 2026-01-06 20:54:15,059 - root - INFO - [34mlr: 1.3672e-05 gnorm: 4.62 [35m[ 0:36:58<2 days, 7:04:33][39m
|
| 365 |
+
[titan] 2026-01-06 20:54:56,546 - root - INFO - [31mstep: 35 [32mloss: 8.1046 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,580 [36mtflops: 144.53 [35mmfu: 46.32%[39m
|
| 366 |
+
[titan] 2026-01-06 20:54:56,546 - root - INFO - [34mlr: 1.4063e-05 gnorm: 4.69 [35m[ 0:37:40<2 days, 6:29:05][39m
|
| 367 |
+
[titan] 2026-01-06 20:55:38,070 - root - INFO - [31mstep: 36 [32mloss: 8.0122 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.40 [35mmfu: 46.28%[39m
|
| 368 |
+
[titan] 2026-01-06 20:55:38,070 - root - INFO - [34mlr: 1.4453e-05 gnorm: 2.75 [35m[ 0:38:22<2 days, 5:55:35][39m
|
| 369 |
+
[titan] 2026-01-06 20:56:19,603 - root - INFO - [31mstep: 37 [32mloss: 8.0874 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 370 |
+
[titan] 2026-01-06 20:56:19,603 - root - INFO - [34mlr: 1.4844e-05 gnorm: 4.84 [35m[ 0:39:03<2 days, 5:23:53][39m
|
| 371 |
+
[titan] 2026-01-06 20:57:01,137 - root - INFO - [31mstep: 38 [32mloss: 8.0173 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 372 |
+
[titan] 2026-01-06 20:57:01,137 - root - INFO - [34mlr: 1.5234e-05 gnorm: 3.98 [35m[ 0:39:45<2 days, 4:53:48][39m
|
| 373 |
+
[titan] 2026-01-06 20:57:42,670 - root - INFO - [31mstep: 39 [32mloss: 8.0002 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 374 |
+
[titan] 2026-01-06 20:57:42,671 - root - INFO - [34mlr: 1.5625e-05 gnorm: 3.81 [35m[ 0:40:26<2 days, 4:25:14][39m
|
| 375 |
+
[titan] 2026-01-06 20:58:24,204 - root - INFO - [31mstep: 40 [32mloss: 7.9606 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 376 |
+
[titan] 2026-01-06 20:58:24,204 - root - INFO - [34mlr: 1.6016e-05 gnorm: 2.86 [35m[ 0:41:08<2 days, 3:58:04][39m
|
| 377 |
+
[titan] 2026-01-06 20:59:05,739 - root - INFO - [31mstep: 41 [32mloss: 7.9773 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.36 [35mmfu: 46.27%[39m
|
| 378 |
+
[titan] 2026-01-06 20:59:05,739 - root - INFO - [34mlr: 1.6406e-05 gnorm: 3.56 [35m[ 0:41:49<2 days, 3:32:11][39m
|
| 379 |
+
[titan] 2026-01-06 20:59:47,255 - root - INFO - [31mstep: 42 [32mloss: 7.9890 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.43 [35mmfu: 46.29%[39m
|
| 380 |
+
[titan] 2026-01-06 20:59:47,256 - root - INFO - [34mlr: 1.6797e-05 gnorm: 4.75 [35m[ 0:42:31<2 days, 3:07:29][39m
|
| 381 |
+
[titan] 2026-01-06 21:00:28,788 - root - INFO - [31mstep: 43 [32mloss: 7.9018 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 382 |
+
[titan] 2026-01-06 21:00:28,789 - root - INFO - [34mlr: 1.7188e-05 gnorm: 3.48 [35m[ 0:43:12<2 days, 2:43:55][39m
|
| 383 |
+
[titan] 2026-01-06 21:01:10,328 - root - INFO - [31mstep: 44 [32mloss: 7.8441 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.35 [35mmfu: 46.26%[39m
|
| 384 |
+
[titan] 2026-01-06 21:01:10,328 - root - INFO - [34mlr: 1.7578e-05 gnorm: 3.89 [35m[ 0:43:54<2 days, 2:21:24][39m
|
| 385 |
+
[titan] 2026-01-06 21:01:51,869 - root - INFO - [31mstep: 45 [32mloss: 7.8679 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.34 [35mmfu: 46.26%[39m
|
| 386 |
+
[titan] 2026-01-06 21:01:51,869 - root - INFO - [34mlr: 1.7969e-05 gnorm: 6.41 [35m[ 0:44:35<2 days, 1:59:51][39m
|
| 387 |
+
[titan] 2026-01-06 21:02:33,408 - root - INFO - [31mstep: 46 [32mloss: 7.7830 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.35 [35mmfu: 46.26%[39m
|
| 388 |
+
[titan] 2026-01-06 21:02:33,408 - root - INFO - [34mlr: 1.8359e-05 gnorm: 3.52 [35m[ 0:45:17<2 days, 1:39:13][39m
|
| 389 |
+
[titan] 2026-01-06 21:03:14,961 - root - INFO - [31mstep: 47 [32mloss: 7.8372 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.30 [35mmfu: 46.25%[39m
|
| 390 |
+
[titan] 2026-01-06 21:03:14,961 - root - INFO - [34mlr: 1.8750e-05 gnorm: 2.22 [35m[ 0:45:58<2 days, 1:19:26][39m
|
| 391 |
+
[titan] 2026-01-06 21:03:56,497 - root - INFO - [31mstep: 48 [32mloss: 7.8147 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.36 [35mmfu: 46.27%[39m
|
| 392 |
+
[titan] 2026-01-06 21:03:56,497 - root - INFO - [34mlr: 1.9141e-05 gnorm: 3.70 [35m[ 0:46:40<2 days, 1:00:26][39m
|
| 393 |
+
[titan] 2026-01-06 21:03:56,497 - root - INFO - Saving the checkpoint (or staging if async is enabled).
|
| 394 |
+
[titan] 2026-01-06 21:04:16,562 - root - INFO - [GC] GC collection invoked by checkpointer. 0.18 seconds.
|
| 395 |
+
[titan] 2026-01-06 21:04:16,562 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 20.07 seconds.
|
| 396 |
+
[titan] 2026-01-06 21:04:57,969 - root - INFO - [31mstep: 49 [32mloss: 7.6970 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,066 [36mtflops: 97.54 [35mmfu: 31.26%[39m
|
| 397 |
+
[titan] 2026-01-06 21:04:57,970 - root - INFO - [34mlr: 1.9531e-05 gnorm: 5.28 [35m[ 0:47:41<2 days, 1:02:41][39m
|
| 398 |
+
[titan] 2026-01-06 21:04:57,981 - root - INFO - [GC] Peforming periodical GC collection. 0.01 seconds.
|
| 399 |
+
[titan] 2026-01-06 21:05:39,421 - root - INFO - [31mstep: 50 [32mloss: 7.7536 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,581 [36mtflops: 144.65 [35mmfu: 46.36%[39m
|
| 400 |
+
[titan] 2026-01-06 21:05:39,421 - root - INFO - [34mlr: 1.9922e-05 gnorm: 4.06 [35m[ 0:48:23<2 days, 0:44:38][39m
|
| 401 |
+
[titan] 2026-01-06 21:06:20,891 - root - INFO - [31mstep: 51 [32mloss: 7.7578 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,580 [36mtflops: 144.59 [35mmfu: 46.34%[39m
|
| 402 |
+
[titan] 2026-01-06 21:06:20,891 - root - INFO - [34mlr: 2.0313e-05 gnorm: 5.03 [35m[ 0:49:04<2 days, 0:27:17][39m
|
| 403 |
+
[titan] 2026-01-06 21:07:02,402 - root - INFO - [31mstep: 52 [32mloss: 7.7586 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.45 [35mmfu: 46.30%[39m
|
| 404 |
+
[titan] 2026-01-06 21:07:02,402 - root - INFO - [34mlr: 2.0703e-05 gnorm: 2.52 [35m[ 0:49:46<2 days, 0:10:36][39m
|
| 405 |
+
[titan] 2026-01-06 21:07:43,930 - root - INFO - [31mstep: 53 [32mloss: 7.7823 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.38 [35mmfu: 46.28%[39m
|
| 406 |
+
[titan] 2026-01-06 21:07:43,930 - root - INFO - [34mlr: 2.1094e-05 gnorm: 11.69 [35m[ 0:50:27<1 day, 23:54:33][39m
|
| 407 |
+
[titan] 2026-01-06 21:08:25,460 - root - INFO - [31mstep: 54 [32mloss: 7.7454 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.38 [35mmfu: 46.28%[39m
|
| 408 |
+
[titan] 2026-01-06 21:08:25,460 - root - INFO - [34mlr: 2.1484e-05 gnorm: 10.25 [35m[ 0:51:09<1 day, 23:39:04][39m
|
| 409 |
+
[titan] 2026-01-06 21:09:07,002 - root - INFO - [31mstep: 55 [32mloss: 7.6959 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.34 [35mmfu: 46.26%[39m
|
| 410 |
+
[titan] 2026-01-06 21:09:07,002 - root - INFO - [34mlr: 2.1875e-05 gnorm: 3.77 [35m[ 0:51:50<1 day, 23:24:08][39m
|
| 411 |
+
[titan] 2026-01-06 21:09:48,536 - root - INFO - [31mstep: 56 [32mloss: 7.7100 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 412 |
+
[titan] 2026-01-06 21:09:48,536 - root - INFO - [34mlr: 2.2266e-05 gnorm: 5.50 [35m[ 0:52:32<1 day, 23:09:42][39m
|
| 413 |
+
[titan] 2026-01-06 21:10:30,084 - root - INFO - [31mstep: 57 [32mloss: 7.6427 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.32 [35mmfu: 46.26%[39m
|
| 414 |
+
[titan] 2026-01-06 21:10:30,084 - root - INFO - [34mlr: 2.2656e-05 gnorm: 3.45 [35m[ 0:53:14<1 day, 22:55:46][39m
|
| 415 |
+
[titan] 2026-01-06 21:11:11,627 - root - INFO - [31mstep: 58 [32mloss: 7.7081 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.33 [35mmfu: 46.26%[39m
|
| 416 |
+
[titan] 2026-01-06 21:11:11,628 - root - INFO - [34mlr: 2.3047e-05 gnorm: 7.88 [35m[ 0:53:55<1 day, 22:42:17][39m
|
| 417 |
+
[titan] 2026-01-06 21:11:53,169 - root - INFO - [31mstep: 59 [32mloss: 7.6955 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.34 [35mmfu: 46.26%[39m
|
| 418 |
+
[titan] 2026-01-06 21:11:53,169 - root - INFO - [34mlr: 2.3438e-05 gnorm: 7.16 [35m[ 0:54:37<1 day, 22:29:13][39m
|
| 419 |
+
[titan] 2026-01-06 21:12:34,708 - root - INFO - [31mstep: 60 [32mloss: 7.6458 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.35 [35mmfu: 46.27%[39m
|
| 420 |
+
[titan] 2026-01-06 21:12:34,709 - root - INFO - [34mlr: 2.3828e-05 gnorm: 3.22 [35m[ 0:55:18<1 day, 22:16:35][39m
|
| 421 |
+
[titan] 2026-01-06 21:13:16,244 - root - INFO - [31mstep: 61 [32mloss: 7.6709 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.36 [35mmfu: 46.27%[39m
|
| 422 |
+
[titan] 2026-01-06 21:13:16,244 - root - INFO - [34mlr: 2.4219e-05 gnorm: 7.56 [35m[ 0:56:00<1 day, 22:04:19][39m
|
| 423 |
+
[titan] 2026-01-06 21:13:57,793 - root - INFO - [31mstep: 62 [32mloss: 7.6777 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.31 [35mmfu: 46.25%[39m
|
| 424 |
+
[titan] 2026-01-06 21:13:57,793 - root - INFO - [34mlr: 2.4609e-05 gnorm: 5.00 [35m[ 0:56:41<1 day, 21:52:27][39m
|
| 425 |
+
[titan] 2026-01-06 21:14:39,339 - root - INFO - [31mstep: 63 [32mloss: 7.6421 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.32 [35mmfu: 46.26%[39m
|
| 426 |
+
[titan] 2026-01-06 21:14:39,340 - root - INFO - [34mlr: 2.5000e-05 gnorm: 6.81 [35m[ 0:57:23<1 day, 21:40:56][39m
|
| 427 |
+
[titan] 2026-01-06 21:15:20,872 - root - INFO - [31mstep: 64 [32mloss: 7.6401 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 428 |
+
[titan] 2026-01-06 21:15:20,872 - root - INFO - [34mlr: 2.5391e-05 gnorm: 6.72 [35m[ 0:58:04<1 day, 21:29:45][39m
|
| 429 |
+
[titan] 2026-01-06 21:15:20,872 - root - INFO - Saving the checkpoint (or staging if async is enabled).
|
| 430 |
+
[titan] 2026-01-06 21:15:41,936 - root - INFO - [GC] GC collection invoked by checkpointer. 0.17 seconds.
|
| 431 |
+
[titan] 2026-01-06 21:15:41,936 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 21.06 seconds.
|
| 432 |
+
[titan] 2026-01-06 21:16:23,249 - root - INFO - [31mstep: 65 [32mloss: 7.6475 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,051 [36mtflops: 96.13 [35mmfu: 30.81%[39m
|
| 433 |
+
[titan] 2026-01-06 21:16:23,249 - root - INFO - [34mlr: 2.5781e-05 gnorm: 5.00 [35m[ 0:59:07<1 day, 21:34:57][39m
|
| 434 |
+
[titan] 2026-01-06 21:17:04,689 - root - INFO - [31mstep: 66 [32mloss: 7.7008 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,581 [36mtflops: 144.69 [35mmfu: 46.38%[39m
|
| 435 |
+
[titan] 2026-01-06 21:17:04,689 - root - INFO - [34mlr: 2.6172e-05 gnorm: 9.69 [35m[ 0:59:48<1 day, 21:24:04][39m
|
| 436 |
+
[titan] 2026-01-06 21:17:46,152 - root - INFO - [31mstep: 67 [32mloss: 7.6772 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,581 [36mtflops: 144.61 [35mmfu: 46.35%[39m
|
| 437 |
+
[titan] 2026-01-06 21:17:46,153 - root - INFO - [34mlr: 2.6563e-05 gnorm: 8.06 [35m[ 1:00:30<1 day, 21:13:31][39m
|
| 438 |
+
[titan] 2026-01-06 21:18:27,650 - root - INFO - [31mstep: 68 [32mloss: 7.6251 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.49 [35mmfu: 46.31%[39m
|
| 439 |
+
[titan] 2026-01-06 21:18:27,651 - root - INFO - [34mlr: 2.6953e-05 gnorm: 7.88 [35m[ 1:01:11<1 day, 21:03:16][39m
|
| 440 |
+
[titan] 2026-01-06 21:19:09,166 - root - INFO - [31mstep: 69 [32mloss: 7.6183 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.43 [35mmfu: 46.29%[39m
|
| 441 |
+
[titan] 2026-01-06 21:19:09,166 - root - INFO - [34mlr: 2.7344e-05 gnorm: 4.00 [35m[ 1:01:53<1 day, 20:53:19][39m
|
| 442 |
+
[titan] 2026-01-06 21:19:50,686 - root - INFO - [31mstep: 70 [32mloss: 7.6535 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.41 [35mmfu: 46.29%[39m
|
| 443 |
+
[titan] 2026-01-06 21:19:50,686 - root - INFO - [34mlr: 2.7734e-05 gnorm: 17.75 [35m[ 1:02:34<1 day, 20:43:38][39m
|
| 444 |
+
[titan] 2026-01-06 21:20:32,220 - root - INFO - [31mstep: 71 [32mloss: 7.6713 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 445 |
+
[titan] 2026-01-06 21:20:32,221 - root - INFO - [34mlr: 2.8125e-05 gnorm: 15.69 [35m[ 1:03:16<1 day, 20:34:13][39m
|
| 446 |
+
[titan] 2026-01-06 21:21:13,759 - root - INFO - [31mstep: 72 [32mloss: 7.5969 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.35 [35mmfu: 46.27%[39m
|
| 447 |
+
[titan] 2026-01-06 21:21:13,759 - root - INFO - [34mlr: 2.8516e-05 gnorm: 5.00 [35m[ 1:03:57<1 day, 20:25:03][39m
|
| 448 |
+
[titan] 2026-01-06 21:21:55,296 - root - INFO - [31mstep: 73 [32mloss: 7.6514 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.35 [35mmfu: 46.27%[39m
|
| 449 |
+
[titan] 2026-01-06 21:21:55,296 - root - INFO - [34mlr: 2.8906e-05 gnorm: 7.84 [35m[ 1:04:39<1 day, 20:16:06][39m
|
| 450 |
+
[titan] 2026-01-06 21:22:36,834 - root - INFO - [31mstep: 74 [32mloss: 7.6118 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.35 [35mmfu: 46.27%[39m
|
| 451 |
+
[titan] 2026-01-06 21:22:36,834 - root - INFO - [34mlr: 2.9297e-05 gnorm: 5.53 [35m[ 1:05:20<1 day, 20:07:23][39m
|
| 452 |
+
[titan] 2026-01-06 21:23:18,373 - root - INFO - [31mstep: 75 [32mloss: 7.6545 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.35 [35mmfu: 46.26%[39m
|
| 453 |
+
[titan] 2026-01-06 21:23:18,373 - root - INFO - [34mlr: 2.9687e-05 gnorm: 14.88 [35m[ 1:06:02<1 day, 19:58:53][39m
|
| 454 |
+
[titan] 2026-01-06 21:23:59,909 - root - INFO - [31mstep: 76 [32mloss: 7.6091 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.36 [35mmfu: 46.27%[39m
|
| 455 |
+
[titan] 2026-01-06 21:23:59,909 - root - INFO - [34mlr: 3.0078e-05 gnorm: 15.25 [35m[ 1:06:43<1 day, 19:50:34][39m
|
| 456 |
+
[titan] 2026-01-06 21:24:41,441 - root - INFO - [31mstep: 77 [32mloss: 7.5815 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 457 |
+
[titan] 2026-01-06 21:24:41,442 - root - INFO - [34mlr: 3.0469e-05 gnorm: 4.84 [35m[ 1:07:25<1 day, 19:42:28][39m
|
| 458 |
+
[titan] 2026-01-06 21:25:22,983 - root - INFO - [31mstep: 78 [32mloss: 7.6119 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.34 [35mmfu: 46.26%[39m
|
| 459 |
+
[titan] 2026-01-06 21:25:22,983 - root - INFO - [34mlr: 3.0859e-05 gnorm: 9.06 [35m[ 1:08:06<1 day, 19:34:33][39m
|
| 460 |
+
[titan] 2026-01-06 21:26:04,516 - root - INFO - [31mstep: 79 [32mloss: 7.6418 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 461 |
+
[titan] 2026-01-06 21:26:04,516 - root - INFO - [34mlr: 3.1250e-05 gnorm: 8.25 [35m[ 1:08:48<1 day, 19:26:50][39m
|
| 462 |
+
[titan] 2026-01-06 21:26:46,049 - root - INFO - [31mstep: 80 [32mloss: 7.5575 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 463 |
+
[titan] 2026-01-06 21:26:46,050 - root - INFO - [34mlr: 3.1641e-05 gnorm: 6.97 [35m[ 1:09:29<1 day, 19:19:16][39m
|
| 464 |
+
[titan] 2026-01-06 21:26:46,050 - root - INFO - Saving the checkpoint (or staging if async is enabled).
|
| 465 |
+
[titan] 2026-01-06 21:27:08,317 - root - INFO - [GC] GC collection invoked by checkpointer. 0.19 seconds.
|
| 466 |
+
[titan] 2026-01-06 21:27:08,317 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 22.27 seconds.
|
| 467 |
+
[titan] 2026-01-06 21:27:49,686 - root - INFO - [31mstep: 81 [32mloss: 7.6005 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,030 [36mtflops: 94.22 [35mmfu: 30.20%[39m
|
| 468 |
+
[titan] 2026-01-06 21:27:49,686 - root - INFO - [34mlr: 3.2031e-05 gnorm: 7.19 [35m[ 1:10:33<1 day, 19:25:29][39m
|
| 469 |
+
[titan] 2026-01-06 21:28:31,108 - root - INFO - [31mstep: 82 [32mloss: 7.5774 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,582 [36mtflops: 144.76 [35mmfu: 46.40%[39m
|
| 470 |
+
[titan] 2026-01-06 21:28:31,108 - root - INFO - [34mlr: 3.2422e-05 gnorm: 5.62 [35m[ 1:11:15<1 day, 19:18:01][39m
|
| 471 |
+
[titan] 2026-01-06 21:29:12,555 - root - INFO - [31mstep: 83 [32mloss: 7.6207 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,581 [36mtflops: 144.67 [35mmfu: 46.37%[39m
|
| 472 |
+
[titan] 2026-01-06 21:29:12,555 - root - INFO - [34mlr: 3.2813e-05 gnorm: 4.69 [35m[ 1:11:56<1 day, 19:10:44][39m
|
| 473 |
+
[titan] 2026-01-06 21:29:54,024 - root - INFO - [31mstep: 84 [32mloss: 7.5734 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,580 [36mtflops: 144.59 [35mmfu: 46.34%[39m
|
| 474 |
+
[titan] 2026-01-06 21:29:54,024 - root - INFO - [34mlr: 3.3203e-05 gnorm: 10.75 [35m[ 1:12:37<1 day, 19:03:37][39m
|
| 475 |
+
[titan] 2026-01-06 21:30:35,519 - root - INFO - [31mstep: 85 [32mloss: 7.5241 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.50 [35mmfu: 46.31%[39m
|
| 476 |
+
[titan] 2026-01-06 21:30:35,520 - root - INFO - [34mlr: 3.3594e-05 gnorm: 8.69 [35m[ 1:13:19<1 day, 18:56:41][39m
|
| 477 |
+
[titan] 2026-01-06 21:31:17,030 - root - INFO - [31mstep: 86 [32mloss: 7.5827 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.45 [35mmfu: 46.30%[39m
|
| 478 |
+
[titan] 2026-01-06 21:31:17,030 - root - INFO - [34mlr: 3.3984e-05 gnorm: 7.22 [35m[ 1:14:00<1 day, 18:49:53][39m
|
| 479 |
+
[titan] 2026-01-06 21:31:58,543 - root - INFO - [31mstep: 87 [32mloss: 7.5505 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.44 [35mmfu: 46.29%[39m
|
| 480 |
+
[titan] 2026-01-06 21:31:58,543 - root - INFO - [34mlr: 3.4375e-05 gnorm: 7.91 [35m[ 1:14:42<1 day, 18:43:14][39m
|
| 481 |
+
[titan] 2026-01-06 21:32:40,071 - root - INFO - [31mstep: 88 [32mloss: 7.5143 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.39 [35mmfu: 46.28%[39m
|
| 482 |
+
[titan] 2026-01-06 21:32:40,071 - root - INFO - [34mlr: 3.4766e-05 gnorm: 8.00 [35m[ 1:15:23<1 day, 18:36:44][39m
|
| 483 |
+
[titan] 2026-01-06 21:33:21,599 - root - INFO - [31mstep: 89 [32mloss: 7.5199 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.39 [35mmfu: 46.28%[39m
|
| 484 |
+
[titan] 2026-01-06 21:33:21,599 - root - INFO - [34mlr: 3.5156e-05 gnorm: 8.62 [35m[ 1:16:05<1 day, 18:30:21][39m
|
| 485 |
+
[titan] 2026-01-06 21:34:03,122 - root - INFO - [31mstep: 90 [32mloss: 7.4785 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.40 [35mmfu: 46.28%[39m
|
| 486 |
+
[titan] 2026-01-06 21:34:03,122 - root - INFO - [34mlr: 3.5547e-05 gnorm: 8.12 [35m[ 1:16:47<1 day, 18:24:06][39m
|
| 487 |
+
[titan] 2026-01-06 21:34:44,655 - root - INFO - [31mstep: 91 [32mloss: 7.5003 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 488 |
+
[titan] 2026-01-06 21:34:44,655 - root - INFO - [34mlr: 3.5937e-05 gnorm: 6.97 [35m[ 1:17:28<1 day, 18:17:58][39m
|
| 489 |
+
[titan] 2026-01-06 21:35:26,183 - root - INFO - [31mstep: 92 [32mloss: 7.5113 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.39 [35mmfu: 46.28%[39m
|
| 490 |
+
[titan] 2026-01-06 21:35:26,183 - root - INFO - [34mlr: 3.6328e-05 gnorm: 10.19 [35m[ 1:18:10<1 day, 18:11:58][39m
|
| 491 |
+
[titan] 2026-01-06 21:36:07,712 - root - INFO - [31mstep: 93 [32mloss: 7.4875 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.38 [35mmfu: 46.28%[39m
|
| 492 |
+
[titan] 2026-01-06 21:36:07,712 - root - INFO - [34mlr: 3.6719e-05 gnorm: 4.59 [35m[ 1:18:51<1 day, 18:06:04][39m
|
| 493 |
+
[titan] 2026-01-06 21:36:49,202 - root - INFO - [31mstep: 94 [32mloss: 7.8691 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,580 [36mtflops: 144.52 [35mmfu: 46.32%[39m
|
| 494 |
+
[titan] 2026-01-06 21:36:49,202 - root - INFO - [34mlr: 3.7109e-05 gnorm: 86.50 [35m[ 1:19:33<1 day, 18:00:16][39m
|
| 495 |
+
[titan] 2026-01-06 21:37:30,710 - root - INFO - [31mstep: 95 [32mloss: 7.7993 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.45 [35mmfu: 46.30%[39m
|
| 496 |
+
[titan] 2026-01-06 21:37:30,710 - root - INFO - [34mlr: 3.7500e-05 gnorm: 62.50 [35m[ 1:20:14<1 day, 17:54:34][39m
|
| 497 |
+
[titan] 2026-01-06 21:38:12,247 - root - INFO - [31mstep: 96 [32mloss: 7.6230 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.35 [35mmfu: 46.27%[39m
|
| 498 |
+
[titan] 2026-01-06 21:38:12,248 - root - INFO - [34mlr: 3.7891e-05 gnorm: 17.38 [35m[ 1:20:56<1 day, 17:49:00][39m
|
| 499 |
+
[titan] 2026-01-06 21:38:12,248 - root - INFO - Saving the checkpoint (or staging if async is enabled).
|
| 500 |
+
[titan] 2026-01-06 21:38:32,938 - root - INFO - [GC] GC collection invoked by checkpointer. 0.19 seconds.
|
| 501 |
+
[titan] 2026-01-06 21:38:32,938 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 20.69 seconds.
|
| 502 |
+
[titan] 2026-01-06 21:39:14,269 - root - INFO - [31mstep: 97 [32mloss: 7.5778 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,057 [36mtflops: 96.68 [35mmfu: 30.99%[39m
|
| 503 |
+
[titan] 2026-01-06 21:39:14,269 - root - INFO - [34mlr: 3.8281e-05 gnorm: 17.75 [35m[ 1:21:58<1 day, 17:54:00][39m
|
| 504 |
+
[titan] 2026-01-06 21:39:55,690 - root - INFO - [31mstep: 98 [32mloss: 7.5438 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,582 [36mtflops: 144.76 [35mmfu: 46.40%[39m
|
| 505 |
+
[titan] 2026-01-06 21:39:55,690 - root - INFO - [34mlr: 3.8672e-05 gnorm: 11.75 [35m[ 1:22:39<1 day, 17:48:28][39m
|
| 506 |
+
[titan] 2026-01-06 21:40:37,179 - root - INFO - [31mstep: 99 [32mloss: 7.5091 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,580 [36mtflops: 144.52 [35mmfu: 46.32%[39m
|
| 507 |
+
[titan] 2026-01-06 21:40:37,180 - root - INFO - [34mlr: 3.9063e-05 gnorm: 7.81 [35m[ 1:23:21<1 day, 17:43:04][39m
|
| 508 |
+
[titan] 2026-01-06 21:40:37,202 - root - INFO - [GC] Peforming periodical GC collection. 0.02 seconds.
|
| 509 |
+
[titan] 2026-01-06 21:41:18,706 - root - INFO - [31mstep: 100 [32mloss: 7.4961 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.39 [35mmfu: 46.28%[39m
|
| 510 |
+
[titan] 2026-01-06 21:41:18,706 - root - INFO - [34mlr: 3.9453e-05 gnorm: 7.59 [35m[ 1:24:02<1 day, 17:37:46][39m
|
| 511 |
+
[titan] 2026-01-06 21:42:00,228 - root - INFO - [31mstep: 101 [32mloss: 7.4848 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.41 [35mmfu: 46.28%[39m
|
| 512 |
+
[titan] 2026-01-06 21:42:00,228 - root - INFO - [34mlr: 3.9844e-05 gnorm: 5.97 [35m[ 1:24:44<1 day, 17:32:34][39m
|
| 513 |
+
[titan] 2026-01-06 21:42:41,739 - root - INFO - [31mstep: 102 [32mloss: 7.5118 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.45 [35mmfu: 46.30%[39m
|
| 514 |
+
[titan] 2026-01-06 21:42:41,739 - root - INFO - [34mlr: 4.0234e-05 gnorm: 8.06 [35m[ 1:25:25<1 day, 17:27:26][39m
|
| 515 |
+
[titan] 2026-01-06 21:43:23,265 - root - INFO - [31mstep: 103 [32mloss: 7.4788 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.39 [35mmfu: 46.28%[39m
|
| 516 |
+
[titan] 2026-01-06 21:43:23,265 - root - INFO - [34mlr: 4.0625e-05 gnorm: 10.06 [35m[ 1:26:07<1 day, 17:22:24][39m
|
| 517 |
+
[titan] 2026-01-06 21:44:04,785 - root - INFO - [31mstep: 104 [32mloss: 7.4560 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.41 [35mmfu: 46.29%[39m
|
| 518 |
+
[titan] 2026-01-06 21:44:04,786 - root - INFO - [34mlr: 4.1016e-05 gnorm: 9.50 [35m[ 1:26:48<1 day, 17:17:27][39m
|
| 519 |
+
[titan] 2026-01-06 21:44:46,319 - root - INFO - [31mstep: 105 [32mloss: 7.4534 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 520 |
+
[titan] 2026-01-06 21:44:46,319 - root - INFO - [34mlr: 4.1406e-05 gnorm: 8.44 [35m[ 1:27:30<1 day, 17:12:36][39m
|
| 521 |
+
[titan] 2026-01-06 21:45:27,838 - root - INFO - [31mstep: 106 [32mloss: 7.4770 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.42 [35mmfu: 46.29%[39m
|
| 522 |
+
[titan] 2026-01-06 21:45:27,838 - root - INFO - [34mlr: 4.1797e-05 gnorm: 10.56 [35m[ 1:28:11<1 day, 17:07:48][39m
|
| 523 |
+
[titan] 2026-01-06 21:46:09,374 - root - INFO - [31mstep: 107 [32mloss: 7.4382 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.36 [35mmfu: 46.27%[39m
|
| 524 |
+
[titan] 2026-01-06 21:46:09,374 - root - INFO - [34mlr: 4.2188e-05 gnorm: 13.69 [35m[ 1:28:53<1 day, 17:03:06][39m
|
| 525 |
+
[titan] 2026-01-06 21:46:50,902 - root - INFO - [31mstep: 108 [32mloss: 7.4561 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.39 [35mmfu: 46.28%[39m
|
| 526 |
+
[titan] 2026-01-06 21:46:50,902 - root - INFO - [34mlr: 4.2578e-05 gnorm: 8.69 [35m[ 1:29:34<1 day, 16:58:28][39m
|
| 527 |
+
[titan] 2026-01-06 21:47:32,443 - root - INFO - [31mstep: 109 [32mloss: 7.3967 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.34 [35mmfu: 46.26%[39m
|
| 528 |
+
[titan] 2026-01-06 21:47:32,443 - root - INFO - [34mlr: 4.2969e-05 gnorm: 7.31 [35m[ 1:30:16<1 day, 16:53:55][39m
|
| 529 |
+
[titan] 2026-01-06 21:48:13,976 - root - INFO - [31mstep: 110 [32mloss: 7.4334 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 530 |
+
[titan] 2026-01-06 21:48:13,976 - root - INFO - [34mlr: 4.3359e-05 gnorm: 25.38 [35m[ 1:30:57<1 day, 16:49:25][39m
|
| 531 |
+
[titan] 2026-01-06 21:48:55,511 - root - INFO - [31mstep: 111 [32mloss: 7.4360 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.36 [35mmfu: 46.27%[39m
|
| 532 |
+
[titan] 2026-01-06 21:48:55,511 - root - INFO - [34mlr: 4.3750e-05 gnorm: 10.44 [35m[ 1:31:39<1 day, 16:45:00][39m
|
| 533 |
+
[titan] 2026-01-06 21:49:37,059 - root - INFO - [31mstep: 112 [32mloss: 7.5123 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.31 [35mmfu: 46.25%[39m
|
| 534 |
+
[titan] 2026-01-06 21:49:37,060 - root - INFO - [34mlr: 4.4141e-05 gnorm: 16.88 [35m[ 1:32:20<1 day, 16:40:39][39m
|
| 535 |
+
[titan] 2026-01-06 21:49:37,060 - root - INFO - Saving the checkpoint (or staging if async is enabled).
|
| 536 |
+
[titan] 2026-01-06 21:49:59,578 - root - INFO - [GC] GC collection invoked by checkpointer. 0.14 seconds.
|
| 537 |
+
[titan] 2026-01-06 21:49:59,579 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 22.52 seconds.
|
| 538 |
+
[titan] 2026-01-06 21:50:40,891 - root - INFO - [31mstep: 113 [32mloss: 7.4803 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,027 [36mtflops: 93.94 [35mmfu: 30.11%[39m
|
| 539 |
+
[titan] 2026-01-06 21:50:40,892 - root - INFO - [34mlr: 4.4531e-05 gnorm: 13.06 [35m[ 1:33:24<1 day, 16:46:06][39m
|
| 540 |
+
[titan] 2026-01-06 21:51:22,305 - root - INFO - [31mstep: 114 [32mloss: 7.4859 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,582 [36mtflops: 144.79 [35mmfu: 46.41%[39m
|
| 541 |
+
[titan] 2026-01-06 21:51:22,305 - root - INFO - [34mlr: 4.4922e-05 gnorm: 16.50 [35m[ 1:34:06<1 day, 16:41:44][39m
|
| 542 |
+
[titan] 2026-01-06 21:52:03,747 - root - INFO - [31mstep: 115 [32mloss: 7.4151 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,581 [36mtflops: 144.68 [35mmfu: 46.37%[39m
|
| 543 |
+
[titan] 2026-01-06 21:52:03,748 - root - INFO - [34mlr: 4.5313e-05 gnorm: 13.94 [35m[ 1:34:47<1 day, 16:37:26][39m
|
| 544 |
+
[titan] 2026-01-06 21:52:45,252 - root - INFO - [31mstep: 116 [32mloss: 7.3814 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.47 [35mmfu: 46.30%[39m
|
| 545 |
+
[titan] 2026-01-06 21:52:45,252 - root - INFO - [34mlr: 4.5703e-05 gnorm: 11.69 [35m[ 1:35:29<1 day, 16:33:14][39m
|
| 546 |
+
[titan] 2026-01-06 21:53:26,760 - root - INFO - [31mstep: 117 [32mloss: 7.4033 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,579 [36mtflops: 144.46 [35mmfu: 46.30%[39m
|
| 547 |
+
[titan] 2026-01-06 21:53:26,760 - root - INFO - [34mlr: 4.6094e-05 gnorm: 9.31 [35m[ 1:36:10<1 day, 16:29:06][39m
|
| 548 |
+
[titan] 2026-01-06 21:54:08,279 - root - INFO - [31mstep: 118 [32mloss: 7.4721 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.42 [35mmfu: 46.29%[39m
|
| 549 |
+
[titan] 2026-01-06 21:54:08,279 - root - INFO - [34mlr: 4.6484e-05 gnorm: 20.88 [35m[ 1:36:52<1 day, 16:25:01][39m
|
| 550 |
+
[titan] 2026-01-06 21:54:49,813 - root - INFO - [31mstep: 119 [32mloss: 7.4258 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 551 |
+
[titan] 2026-01-06 21:54:49,813 - root - INFO - [34mlr: 4.6875e-05 gnorm: 16.62 [35m[ 1:37:33<1 day, 16:21:00][39m
|
| 552 |
+
[titan] 2026-01-06 21:55:31,360 - root - INFO - [31mstep: 120 [32mloss: 7.3951 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.32 [35mmfu: 46.26%[39m
|
| 553 |
+
[titan] 2026-01-06 21:55:31,360 - root - INFO - [34mlr: 4.7266e-05 gnorm: 11.38 [35m[ 1:38:15<1 day, 16:17:03][39m
|
| 554 |
+
[titan] 2026-01-06 21:56:12,904 - root - INFO - [31mstep: 121 [32mloss: 7.3984 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.33 [35mmfu: 46.26%[39m
|
| 555 |
+
[titan] 2026-01-06 21:56:12,904 - root - INFO - [34mlr: 4.7656e-05 gnorm: 10.19 [35m[ 1:38:56<1 day, 16:13:09][39m
|
| 556 |
+
[titan] 2026-01-06 21:56:54,444 - root - INFO - [31mstep: 122 [32mloss: 7.5098 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.34 [35mmfu: 46.26%[39m
|
| 557 |
+
[titan] 2026-01-06 21:56:54,444 - root - INFO - [34mlr: 4.8047e-05 gnorm: 19.38 [35m[ 1:39:38<1 day, 16:09:18][39m
|
| 558 |
+
[titan] 2026-01-06 21:57:35,983 - root - INFO - [31mstep: 123 [32mloss: 7.4071 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.35 [35mmfu: 46.27%[39m
|
| 559 |
+
[titan] 2026-01-06 21:57:35,983 - root - INFO - [34mlr: 4.8438e-05 gnorm: 13.25 [35m[ 1:40:19<1 day, 16:05:30][39m
|
| 560 |
+
[titan] 2026-01-06 21:58:17,525 - root - INFO - [31mstep: 124 [32mloss: 7.4271 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.34 [35mmfu: 46.26%[39m
|
| 561 |
+
[titan] 2026-01-06 21:58:17,525 - root - INFO - [34mlr: 4.8828e-05 gnorm: 11.88 [35m[ 1:41:01<1 day, 16:01:45][39m
|
| 562 |
+
[titan] 2026-01-06 21:58:59,075 - root - INFO - [31mstep: 125 [32mloss: 7.3603 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.31 [35mmfu: 46.25%[39m
|
| 563 |
+
[titan] 2026-01-06 21:58:59,075 - root - INFO - [34mlr: 4.9219e-05 gnorm: 11.50 [35m[ 1:41:42<1 day, 15:58:03][39m
|
| 564 |
+
[titan] 2026-01-06 21:59:40,618 - root - INFO - [31mstep: 126 [32mloss: 7.3625 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.33 [35mmfu: 46.26%[39m
|
| 565 |
+
[titan] 2026-01-06 21:59:40,619 - root - INFO - [34mlr: 4.9609e-05 gnorm: 9.88 [35m[ 1:42:24<1 day, 15:54:24][39m
|
| 566 |
+
[titan] 2026-01-06 22:00:22,155 - root - INFO - [31mstep: 127 [32mloss: 7.3691 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.36 [35mmfu: 46.27%[39m
|
| 567 |
+
[titan] 2026-01-06 22:00:22,155 - root - INFO - [34mlr: 5.0000e-05 gnorm: 11.88 [35m[ 1:43:06<1 day, 15:50:48][39m
|
| 568 |
+
[titan] 2026-01-06 22:01:03,694 - root - INFO - [31mstep: 128 [32mloss: 7.3331 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.35 [35mmfu: 46.27%[39m
|
| 569 |
+
[titan] 2026-01-06 22:01:03,694 - root - INFO - [34mlr: 5.0391e-05 gnorm: 11.56 [35m[ 1:43:47<1 day, 15:47:14][39m
|
| 570 |
+
[titan] 2026-01-06 22:01:03,694 - root - INFO - Saving the checkpoint (or staging if async is enabled).
|
| 571 |
+
[titan] 2026-01-06 22:01:24,082 - root - INFO - [GC] GC collection invoked by checkpointer. 0.21 seconds.
|
| 572 |
+
[titan] 2026-01-06 22:01:24,082 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 20.39 seconds.
|
| 573 |
+
[titan] 2026-01-06 22:02:05,453 - root - INFO - [31mstep: 129 [32mloss: 7.2878 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,061 [36mtflops: 97.09 [35mmfu: 31.12%[39m
|
| 574 |
+
[titan] 2026-01-06 22:02:05,454 - root - INFO - [34mlr: 5.0781e-05 gnorm: 6.16 [35m[ 1:44:49<1 day, 15:51:24][39m
|
| 575 |
+
[titan] 2026-01-06 22:02:46,875 - root - INFO - [31mstep: 130 [32mloss: 7.7017 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,582 [36mtflops: 144.76 [35mmfu: 46.40%[39m
|
| 576 |
+
[titan] 2026-01-06 22:02:46,876 - root - INFO - [34mlr: 5.1172e-05 gnorm: 70.00 [35m[ 1:45:30<1 day, 15:47:50][39m
|
| 577 |
+
[titan] 2026-01-06 22:03:28,339 - root - INFO - [31mstep: 131 [32mloss: 7.5220 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,581 [36mtflops: 144.61 [35mmfu: 46.35%[39m
|
| 578 |
+
[titan] 2026-01-06 22:03:28,339 - root - INFO - [34mlr: 5.1562e-05 gnorm: 44.75 [35m[ 1:46:12<1 day, 15:44:19][39m
|
| 579 |
+
[titan] 2026-01-06 22:04:09,858 - root - INFO - [31mstep: 132 [32mloss: 7.4566 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.42 [35mmfu: 46.29%[39m
|
| 580 |
+
[titan] 2026-01-06 22:04:09,859 - root - INFO - [34mlr: 5.1953e-05 gnorm: 13.50 [35m[ 1:46:53<1 day, 15:40:51][39m
|
| 581 |
+
[titan] 2026-01-06 22:04:51,387 - root - INFO - [31mstep: 133 [32mloss: 7.4026 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.38 [35mmfu: 46.28%[39m
|
| 582 |
+
[titan] 2026-01-06 22:04:51,387 - root - INFO - [34mlr: 5.2344e-05 gnorm: 10.12 [35m[ 1:47:35<1 day, 15:37:27][39m
|
| 583 |
+
[titan] 2026-01-06 22:05:32,919 - root - INFO - [31mstep: 134 [32mloss: 7.4092 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 584 |
+
[titan] 2026-01-06 22:05:32,920 - root - INFO - [34mlr: 5.2734e-05 gnorm: 14.88 [35m[ 1:48:16<1 day, 15:34:04][39m
|
| 585 |
+
[titan] 2026-01-06 22:06:14,471 - root - INFO - [31mstep: 135 [32mloss: 7.3827 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.30 [35mmfu: 46.25%[39m
|
| 586 |
+
[titan] 2026-01-06 22:06:14,471 - root - INFO - [34mlr: 5.3125e-05 gnorm: 18.88 [35m[ 1:48:58<1 day, 15:30:45][39m
|
| 587 |
+
[titan] 2026-01-06 22:06:56,027 - root - INFO - [31mstep: 136 [32mloss: 7.4021 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.29 [35mmfu: 46.25%[39m
|
| 588 |
+
[titan] 2026-01-06 22:06:56,028 - root - INFO - [34mlr: 5.3516e-05 gnorm: 12.81 [35m[ 1:49:39<1 day, 15:27:28][39m
|
| 589 |
+
[titan] 2026-01-06 22:07:37,581 - root - INFO - [31mstep: 137 [32mloss: 7.4064 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.30 [35mmfu: 46.25%[39m
|
| 590 |
+
[titan] 2026-01-06 22:07:37,581 - root - INFO - [34mlr: 5.3906e-05 gnorm: 7.19 [35m[ 1:50:21<1 day, 15:24:14][39m
|
| 591 |
+
[titan] 2026-01-06 22:08:19,129 - root - INFO - [31mstep: 138 [32mloss: 7.4774 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.32 [35mmfu: 46.26%[39m
|
| 592 |
+
[titan] 2026-01-06 22:08:19,129 - root - INFO - [34mlr: 5.4297e-05 gnorm: 22.62 [35m[ 1:51:03<1 day, 15:21:01][39m
|
| 593 |
+
[titan] 2026-01-06 22:09:00,688 - root - INFO - [31mstep: 139 [32mloss: 7.4281 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.28 [35mmfu: 46.24%[39m
|
| 594 |
+
[titan] 2026-01-06 22:09:00,688 - root - INFO - [34mlr: 5.4688e-05 gnorm: 11.00 [35m[ 1:51:44<1 day, 15:17:51][39m
|
| 595 |
+
[titan] 2026-01-06 22:09:42,228 - root - INFO - [31mstep: 140 [32mloss: 7.5633 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.34 [35mmfu: 46.26%[39m
|
| 596 |
+
[titan] 2026-01-06 22:09:42,228 - root - INFO - [34mlr: 5.5078e-05 gnorm: 19.75 [35m[ 1:52:26<1 day, 15:14:42][39m
|
| 597 |
+
[titan] 2026-01-06 22:10:23,790 - root - INFO - [31mstep: 141 [32mloss: 7.5423 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.27 [35mmfu: 46.24%[39m
|
| 598 |
+
[titan] 2026-01-06 22:10:23,790 - root - INFO - [34mlr: 5.5469e-05 gnorm: 17.25 [35m[ 1:53:07<1 day, 15:11:37][39m
|
| 599 |
+
[titan] 2026-01-06 22:11:05,350 - root - INFO - [31mstep: 142 [32mloss: 7.4047 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.28 [35mmfu: 46.24%[39m
|
| 600 |
+
[titan] 2026-01-06 22:11:05,351 - root - INFO - [34mlr: 5.5859e-05 gnorm: 9.94 [35m[ 1:53:49<1 day, 15:08:33][39m
|
| 601 |
+
[titan] 2026-01-06 22:11:46,904 - root - INFO - [31mstep: 143 [32mloss: 7.5261 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.30 [35mmfu: 46.25%[39m
|
| 602 |
+
[titan] 2026-01-06 22:11:46,905 - root - INFO - [34mlr: 5.6250e-05 gnorm: 25.75 [35m[ 1:54:30<1 day, 15:05:31][39m
|
| 603 |
+
[titan] 2026-01-06 22:12:28,460 - root - INFO - [31mstep: 144 [32mloss: 7.4217 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.29 [35mmfu: 46.25%[39m
|
| 604 |
+
[titan] 2026-01-06 22:12:28,461 - root - INFO - [34mlr: 5.6641e-05 gnorm: 18.00 [35m[ 1:55:12<1 day, 15:02:31][39m
|
| 605 |
+
[titan] 2026-01-06 22:12:28,461 - root - INFO - Saving the checkpoint (or staging if async is enabled).
|
| 606 |
+
[titan] 2026-01-06 22:12:50,183 - root - INFO - [GC] GC collection invoked by checkpointer. 0.19 seconds.
|
| 607 |
+
[titan] 2026-01-06 22:12:50,183 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 21.72 seconds.
|
| 608 |
+
[titan] 2026-01-06 22:13:31,510 - root - INFO - [31mstep: 145 [32mloss: 7.3958 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,039 [36mtflops: 95.10 [35mmfu: 30.48%[39m
|
| 609 |
+
[titan] 2026-01-06 22:13:31,510 - root - INFO - [34mlr: 5.7031e-05 gnorm: 11.69 [35m[ 1:56:15<1 day, 15:06:46][39m
|
| 610 |
+
[titan] 2026-01-06 22:14:12,944 - root - INFO - [31mstep: 146 [32mloss: 7.4073 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,582 [36mtflops: 144.71 [35mmfu: 46.38%[39m
|
| 611 |
+
[titan] 2026-01-06 22:14:12,944 - root - INFO - [34mlr: 5.7422e-05 gnorm: 11.25 [35m[ 1:56:56<1 day, 15:03:44][39m
|
| 612 |
+
[titan] 2026-01-06 22:14:54,370 - root - INFO - [31mstep: 147 [32mloss: 7.3301 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,582 [36mtflops: 144.74 [35mmfu: 46.39%[39m
|
| 613 |
+
[titan] 2026-01-06 22:14:54,371 - root - INFO - [34mlr: 5.7813e-05 gnorm: 7.34 [35m[ 1:57:38<1 day, 15:00:44][39m
|
| 614 |
+
[titan] 2026-01-06 22:15:35,825 - root - INFO - [31mstep: 148 [32mloss: 7.3624 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,581 [36mtflops: 144.64 [35mmfu: 46.36%[39m
|
| 615 |
+
[titan] 2026-01-06 22:15:35,825 - root - INFO - [34mlr: 5.8203e-05 gnorm: 17.38 [35m[ 1:58:19<1 day, 14:57:47][39m
|
| 616 |
+
[titan] 2026-01-06 22:16:17,356 - root - INFO - [31mstep: 149 [32mloss: 7.2913 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,578 [36mtflops: 144.37 [35mmfu: 46.27%[39m
|
| 617 |
+
[titan] 2026-01-06 22:16:17,357 - root - INFO - [34mlr: 5.8594e-05 gnorm: 3.80 [35m[ 1:59:01<1 day, 14:54:52][39m
|
| 618 |
+
[titan] 2026-01-06 22:16:17,388 - root - INFO - [GC] Peforming periodical GC collection. 0.03 seconds.
|
| 619 |
+
[titan] 2026-01-06 22:16:58,923 - root - INFO - [31mstep: 150 [32mloss: 7.3146 [33mmemory: 71.94GiB(90.77%) [34mtps: 1,577 [36mtflops: 144.25 [35mmfu: 46.23%[39m
|
| 620 |
+
[titan] 2026-01-06 22:16:58,923 - root - INFO - [34mlr: 5.8984e-05 gnorm: 7.06 [35m[ 1:59:42<1 day, 14:52:01][39m
|
logs/none_4cvjdbqa/attempt_0/7/stdout.log
ADDED
|
File without changes
|
logs/none_rci5peh0/attempt_0/0/stderr.log
ADDED
|
@@ -0,0 +1,333 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[titan] 2026-01-02 12:21:12,073 - root - INFO - Starting job: default job
|
| 2 |
+
[titan] 2026-01-02 12:21:12,073 - root - INFO - [32m{
|
| 3 |
+
"activation_checkpoint": {
|
| 4 |
+
"mode": "none",
|
| 5 |
+
"selective_ac_option": "2"
|
| 6 |
+
},
|
| 7 |
+
"activation_offload": {
|
| 8 |
+
"mode": "none"
|
| 9 |
+
},
|
| 10 |
+
"checkpoint": {
|
| 11 |
+
"async_mode": "disabled",
|
| 12 |
+
"create_seed_checkpoint": false,
|
| 13 |
+
"enable_checkpoint": true,
|
| 14 |
+
"exclude_from_loading": [],
|
| 15 |
+
"export_dtype": "float32",
|
| 16 |
+
"folder": "checkpoint",
|
| 17 |
+
"initial_load_model_weights_only": true,
|
| 18 |
+
"initial_load_path": null,
|
| 19 |
+
"interval": 3072,
|
| 20 |
+
"interval_type": "steps",
|
| 21 |
+
"keep_latest_k": 0,
|
| 22 |
+
"last_save_model_weights_only": false,
|
| 23 |
+
"load_step": -1,
|
| 24 |
+
"model_weights_only": false
|
| 25 |
+
},
|
| 26 |
+
"comm": {
|
| 27 |
+
"init_timeout_seconds": 300,
|
| 28 |
+
"trace_buf_size": 20000,
|
| 29 |
+
"train_timeout_seconds": 100
|
| 30 |
+
},
|
| 31 |
+
"experimental": {
|
| 32 |
+
"context_parallel_degree": 1,
|
| 33 |
+
"context_parallel_rotate_method": "allgather",
|
| 34 |
+
"custom_model_path": "",
|
| 35 |
+
"enable_async_tensor_parallel": false,
|
| 36 |
+
"enable_compiled_autograd": false,
|
| 37 |
+
"pipeline_parallel_degree": 1,
|
| 38 |
+
"pipeline_parallel_microbatches": null,
|
| 39 |
+
"pipeline_parallel_schedule": "1F1B",
|
| 40 |
+
"pipeline_parallel_schedule_csv": "",
|
| 41 |
+
"pipeline_parallel_split_points": []
|
| 42 |
+
},
|
| 43 |
+
"fault_tolerance": {
|
| 44 |
+
"enable": false,
|
| 45 |
+
"group_size": 0,
|
| 46 |
+
"min_replica_size": 1,
|
| 47 |
+
"replica_id": 0
|
| 48 |
+
},
|
| 49 |
+
"float8": {
|
| 50 |
+
"enable_fsdp_float8_all_gather": false,
|
| 51 |
+
"force_recompute_fp8_weight_in_bwd": false,
|
| 52 |
+
"precompute_float8_dynamic_scale_for_fsdp": false,
|
| 53 |
+
"recipe_name": null
|
| 54 |
+
},
|
| 55 |
+
"job": {
|
| 56 |
+
"config_file": "flame/models/fla.toml",
|
| 57 |
+
"description": "default job",
|
| 58 |
+
"dump_folder": "/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B",
|
| 59 |
+
"print_args": true,
|
| 60 |
+
"use_for_integration_test": false
|
| 61 |
+
},
|
| 62 |
+
"lr_scheduler": {
|
| 63 |
+
"decay_ratio": null,
|
| 64 |
+
"decay_type": "cosine",
|
| 65 |
+
"lr_min": 0.1,
|
| 66 |
+
"warmup_steps": 1024
|
| 67 |
+
},
|
| 68 |
+
"memory_estimation": {
|
| 69 |
+
"disable_fake_mode": false,
|
| 70 |
+
"enabled": false
|
| 71 |
+
},
|
| 72 |
+
"metrics": {
|
| 73 |
+
"disable_color_printing": false,
|
| 74 |
+
"enable_tensorboard": false,
|
| 75 |
+
"enable_wandb": true,
|
| 76 |
+
"log_freq": 1,
|
| 77 |
+
"save_for_all_ranks": false,
|
| 78 |
+
"save_tb_folder": "tb"
|
| 79 |
+
},
|
| 80 |
+
"model": {
|
| 81 |
+
"config": "/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/config.json",
|
| 82 |
+
"converters": [],
|
| 83 |
+
"name": "fla",
|
| 84 |
+
"print_after_conversion": false,
|
| 85 |
+
"tokenizer_path": "/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B"
|
| 86 |
+
},
|
| 87 |
+
"optimizer": {
|
| 88 |
+
"beta1": 0.9,
|
| 89 |
+
"beta2": 0.95,
|
| 90 |
+
"early_step_in_backward": false,
|
| 91 |
+
"eps": 1e-15,
|
| 92 |
+
"implementation": "fused",
|
| 93 |
+
"lr": 0.0004,
|
| 94 |
+
"name": "AdamW",
|
| 95 |
+
"weight_decay": 0.1
|
| 96 |
+
},
|
| 97 |
+
"profiling": {
|
| 98 |
+
"enable_memory_snapshot": false,
|
| 99 |
+
"enable_profiling": true,
|
| 100 |
+
"profile_freq": 512,
|
| 101 |
+
"save_memory_snapshot_folder": "memory_snapshot",
|
| 102 |
+
"save_traces_folder": "profile_trace"
|
| 103 |
+
},
|
| 104 |
+
"training": {
|
| 105 |
+
"batch_size": 2,
|
| 106 |
+
"compile": true,
|
| 107 |
+
"context_len": 2048,
|
| 108 |
+
"data_dir": null,
|
| 109 |
+
"data_files": null,
|
| 110 |
+
"data_parallel_replicate_degree": 1,
|
| 111 |
+
"data_parallel_shard_degree": 8,
|
| 112 |
+
"data_probs": null,
|
| 113 |
+
"dataset": "/mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu",
|
| 114 |
+
"dataset_name": "default",
|
| 115 |
+
"dataset_split": "train",
|
| 116 |
+
"deterministic": false,
|
| 117 |
+
"disable_loss_parallel": true,
|
| 118 |
+
"enable_cpu_offload": false,
|
| 119 |
+
"fsdp_reshard_after_forward": "default",
|
| 120 |
+
"gc_freq": 50,
|
| 121 |
+
"gradient_accumulation_steps": 16,
|
| 122 |
+
"max_norm": 1.0,
|
| 123 |
+
"mixed_precision_param": "bfloat16",
|
| 124 |
+
"mixed_precision_reduce": "float32",
|
| 125 |
+
"num_workers": 8,
|
| 126 |
+
"persistent_workers": false,
|
| 127 |
+
"pin_memory": false,
|
| 128 |
+
"prefetch_factor": 2,
|
| 129 |
+
"seed": 42,
|
| 130 |
+
"seq_len": 2048,
|
| 131 |
+
"skip_nan_inf": true,
|
| 132 |
+
"steps": 30720,
|
| 133 |
+
"streaming": true,
|
| 134 |
+
"tensor_parallel_degree": 1,
|
| 135 |
+
"varlen": false
|
| 136 |
+
}
|
| 137 |
+
}[39m
|
| 138 |
+
[titan] 2026-01-02 12:21:12,073 - root - INFO - [GC] Initial GC collection. 0.00 seconds.
|
| 139 |
+
[titan] 2026-01-02 12:21:13,365 - root - WARNING - ENV[TORCH_NCCL_ASYNC_ERROR_HANDLING] = 1 will be overridden to 3 based on job config
|
| 140 |
+
[titan] 2026-01-02 12:21:13,367 - root - INFO - CUDA capacity: NVIDIA A100-SXM4-80GB with 79.25GiB memory
|
| 141 |
+
[titan] 2026-01-02 12:21:13,369 - root - WARNING - Error running lspci: [Errno 2] No such file or directory: 'lspci', fallback to use device_name
|
| 142 |
+
[titan] 2026-01-02 12:21:13,369 - root - INFO - Peak FLOPS used for computing MFU: 3.120e+14
|
| 143 |
+
[titan] 2026-01-02 12:21:13,369 - root - INFO - Building 1-D device mesh with ['dp_shard'], [8]
|
| 144 |
+
[titan] 2026-01-02 12:21:13,424 - root - INFO - Loading tokenizer...
|
| 145 |
+
The tokenizer you are loading from '/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.
|
| 146 |
+
[titan] 2026-01-02 12:21:13,765 - root - INFO - Qwen2TokenizerFast(name_or_path='/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B', vocab_size=151643, model_max_length=10000000000, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'eos_token': '<|im_end|>', 'pad_token': '<|endoftext|>', 'additional_special_tokens': ['<|im_start|>', '<|im_end|>', '<|object_ref_start|>', '<|object_ref_end|>', '<|box_start|>', '<|box_end|>', '<|quad_start|>', '<|quad_end|>', '<|vision_start|>', '<|vision_end|>', '<|vision_pad|>', '<|image_pad|>', '<|video_pad|>']}, clean_up_tokenization_spaces=False, added_tokens_decoder={
|
| 147 |
+
151643: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 148 |
+
151644: AddedToken("<|im_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 149 |
+
151645: AddedToken("<|im_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 150 |
+
151646: AddedToken("<|object_ref_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 151 |
+
151647: AddedToken("<|object_ref_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 152 |
+
151648: AddedToken("<|box_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 153 |
+
151649: AddedToken("<|box_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 154 |
+
151650: AddedToken("<|quad_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 155 |
+
151651: AddedToken("<|quad_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 156 |
+
151652: AddedToken("<|vision_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 157 |
+
151653: AddedToken("<|vision_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 158 |
+
151654: AddedToken("<|vision_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 159 |
+
151655: AddedToken("<|image_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 160 |
+
151656: AddedToken("<|video_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 161 |
+
151657: AddedToken("<tool_call>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 162 |
+
151658: AddedToken("</tool_call>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 163 |
+
151659: AddedToken("<|fim_prefix|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 164 |
+
151660: AddedToken("<|fim_middle|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 165 |
+
151661: AddedToken("<|fim_suffix|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 166 |
+
151662: AddedToken("<|fim_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 167 |
+
151663: AddedToken("<|repo_name|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 168 |
+
151664: AddedToken("<|file_sep|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 169 |
+
151665: AddedToken("<tool_response>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 170 |
+
151666: AddedToken("</tool_response>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 171 |
+
151667: AddedToken("<think>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 172 |
+
151668: AddedToken("</think>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 173 |
+
}
|
| 174 |
+
)
|
| 175 |
+
[titan] 2026-01-02 12:21:13,765 - root - INFO - Loading dataset /mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu:default
|
| 176 |
+
`trust_remote_code` is not supported anymore.
|
| 177 |
+
Please check that the Hugging Face dataset '/mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu' isn't based on a loading script and remove `trust_remote_code`.
|
| 178 |
+
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
|
| 179 |
+
[titan] 2026-01-02 12:21:13,765 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
|
| 180 |
+
Please check that the Hugging Face dataset '/mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu' isn't based on a loading script and remove `trust_remote_code`.
|
| 181 |
+
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
|
| 182 |
+
[titan] 2026-01-02 12:21:14,322 - root - INFO - Shuffling the dataset with seed 42
|
| 183 |
+
[titan] 2026-01-02 12:21:14,323 - root - INFO - IterableDataset({
|
| 184 |
+
features: ['text', 'id', 'dump', 'url', 'file_path', 'language', 'language_score', 'token_count', 'score', 'int_score'],
|
| 185 |
+
num_shards: 360
|
| 186 |
+
})
|
| 187 |
+
[titan] 2026-01-02 12:21:14,323 - root - INFO - Building dataloader...
|
| 188 |
+
[titan] 2026-01-02 12:21:14,325 - root - INFO - Loading model config from /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/config.json
|
| 189 |
+
[titan] 2026-01-02 12:21:14,326 - root - INFO - Building model from the config
|
| 190 |
+
[32mGSAConfig {
|
| 191 |
+
"architectures": [
|
| 192 |
+
"GSAForCausalLM"
|
| 193 |
+
],
|
| 194 |
+
"attn": null,
|
| 195 |
+
"bos_token_id": 151643,
|
| 196 |
+
"clamp_max": null,
|
| 197 |
+
"clamp_min": null,
|
| 198 |
+
"conv_size": 4,
|
| 199 |
+
"dtype": "bfloat16",
|
| 200 |
+
"elementwise_affine": false,
|
| 201 |
+
"eos_token_id": 151645,
|
| 202 |
+
"expand_k": 1,
|
| 203 |
+
"expand_v": 1,
|
| 204 |
+
"feature_map": "swish",
|
| 205 |
+
"fuse_cross_entropy": true,
|
| 206 |
+
"fuse_linear_cross_entropy": false,
|
| 207 |
+
"fuse_norm": true,
|
| 208 |
+
"fuse_swiglu": true,
|
| 209 |
+
"gate_logit_normalizer": 8,
|
| 210 |
+
"hidden_act": "swish",
|
| 211 |
+
"hidden_ratio": 4,
|
| 212 |
+
"hidden_size": 5120,
|
| 213 |
+
"initializer_range": 0.02,
|
| 214 |
+
"intermediate_size": 17408,
|
| 215 |
+
"max_position_embeddings": 40960,
|
| 216 |
+
"model_type": "gsa",
|
| 217 |
+
"norm_eps": 1e-06,
|
| 218 |
+
"num_heads": 40,
|
| 219 |
+
"num_hidden_layers": 40,
|
| 220 |
+
"num_kv_heads": 8,
|
| 221 |
+
"num_slots": 256,
|
| 222 |
+
"rope_theta": 1000000,
|
| 223 |
+
"share_conv_kernel": true,
|
| 224 |
+
"tie_word_embeddings": true,
|
| 225 |
+
"transformers_version": "4.57.3",
|
| 226 |
+
"use_cache": true,
|
| 227 |
+
"use_l2warp": false,
|
| 228 |
+
"use_norm": true,
|
| 229 |
+
"use_output_gate": true,
|
| 230 |
+
"use_rope": false,
|
| 231 |
+
"use_short_conv": false,
|
| 232 |
+
"vocab_size": 151936
|
| 233 |
+
}
|
| 234 |
+
[39m
|
| 235 |
+
[titan] 2026-01-02 12:21:14,479 - root - INFO - [34m
|
| 236 |
+
GSAForCausalLM(
|
| 237 |
+
(model): GSAModel(
|
| 238 |
+
(embeddings): Embedding(151936, 5120)
|
| 239 |
+
(layers): ModuleList(
|
| 240 |
+
(0-39): 40 x GSABlock(
|
| 241 |
+
(attn_norm): RMSNorm(5120, eps=1e-06)
|
| 242 |
+
(attn): GatedSlotAttention(
|
| 243 |
+
(feature_map): SwishFeatureMap()
|
| 244 |
+
(q_proj): Linear(in_features=5120, out_features=5120, bias=False)
|
| 245 |
+
(k_proj): Linear(in_features=5120, out_features=1024, bias=False)
|
| 246 |
+
(v_proj): Linear(in_features=5120, out_features=1024, bias=False)
|
| 247 |
+
(f_proj): Linear(in_features=5120, out_features=2048, bias=False)
|
| 248 |
+
(g_norm): RMSNorm(5120, elementwise_affine=False, eps=1e-06)
|
| 249 |
+
(o_proj): Linear(in_features=5120, out_features=5120, bias=False)
|
| 250 |
+
)
|
| 251 |
+
(mlp_norm): RMSNorm(5120, eps=1e-06)
|
| 252 |
+
(mlp): GatedMLP(
|
| 253 |
+
(gate_proj): Linear(in_features=5120, out_features=17408, bias=False)
|
| 254 |
+
(up_proj): Linear(in_features=5120, out_features=17408, bias=False)
|
| 255 |
+
(down_proj): Linear(in_features=17408, out_features=5120, bias=False)
|
| 256 |
+
(swiglu_linear): SwiGLULinear()
|
| 257 |
+
)
|
| 258 |
+
)
|
| 259 |
+
)
|
| 260 |
+
(norm): RMSNorm(5120, eps=1e-06)
|
| 261 |
+
)
|
| 262 |
+
(lm_head): Linear(in_features=5120, out_features=151936, bias=False)
|
| 263 |
+
)[39m
|
| 264 |
+
|
| 265 |
+
[titan] 2026-01-02 12:21:14,534 - root - INFO - Compiling each block with torch.compile
|
| 266 |
+
[titan] 2026-01-02 12:21:14,534 - root - INFO - Compiling the embedding, norm, and lm_head layers with torch.compile
|
| 267 |
+
[titan] 2026-01-02 12:21:14,535 - root - INFO - Compiling the entire model with torch.compile
|
| 268 |
+
[titan] 2026-01-02 12:21:14,676 - root - INFO - Applied FSDP to the model
|
| 269 |
+
[titan] 2026-01-02 12:21:15,135 - root - INFO - CUDA memory usage for model: 3.56GiB(4.49%)
|
| 270 |
+
[titan] 2026-01-02 12:21:15,180 - root - INFO - Checkpointing active. Checkpoints will be loaded from and saved to /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/checkpoint
|
| 271 |
+
[titan] 2026-01-02 12:21:15,181 - root - INFO - Loading the checkpoint from /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/checkpoint/step-0.
|
| 272 |
+
[titan] 2026-01-02 12:24:11,332 - root - INFO - [GC] GC collection for checkpoint loading. 0.01 seconds.
|
| 273 |
+
[titan] 2026-01-02 12:24:11,332 - root - INFO - Finished loading the checkpoint in 176.15 seconds.
|
| 274 |
+
[titan] 2026-01-02 12:24:18,265 - root - ERROR - Failed to create WandB logger: No API key configured. Use `wandb login` to log in.
|
| 275 |
+
[titan] 2026-01-02 12:24:18,271 - root - INFO - CUDA capacity: NVIDIA A100-SXM4-80GB with 79.25GiB memory
|
| 276 |
+
[titan] 2026-01-02 12:24:18,274 - root - WARNING - Error running lspci: [Errno 2] No such file or directory: 'lspci', fallback to use device_name
|
| 277 |
+
[titan] 2026-01-02 12:24:18,506 - root - INFO - Mixed precision training is handled by fully_shard
|
| 278 |
+
[titan] 2026-01-02 12:24:18,507 - root - INFO - [31m***** Running training *****[39m
|
| 279 |
+
[titan] 2026-01-02 12:24:18,507 - root - INFO - [32m Training starts at step 1
|
| 280 |
+
[titan] 2026-01-02 12:24:18,507 - root - INFO - [32m Number of tokens per sequence = 2,048
|
| 281 |
+
[titan] 2026-01-02 12:24:18,507 - root - INFO - [32m Gradient Accumulation steps = 16
|
| 282 |
+
[titan] 2026-01-02 12:24:18,507 - root - INFO - [32m Instantaneous batch size (per device) = 2
|
| 283 |
+
[titan] 2026-01-02 12:24:18,507 - root - INFO - [32m Global batch size (w. parallel, distributed & accumulation) = 256 (524,288 tokens)
|
| 284 |
+
[titan] 2026-01-02 12:24:18,507 - root - INFO - [32m Total optimization steps = 30,720 (16,106,127,360 tokens)
|
| 285 |
+
[titan] 2026-01-02 12:24:18,507 - root - INFO - [32m Warmup steps = 1,024 (536,870,912 tokens)
|
| 286 |
+
[titan] 2026-01-02 12:24:18,507 - root - INFO - [32m Number of parameters = 14,409,815,040 [39m
|
| 287 |
+
[titan] 2026-01-02 12:24:18,508 - root - INFO - Profiling active. Traces will be saved at /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/profile_trace
|
| 288 |
+
/mnt/scratch/home/carlos/leman-labs/flame/.venv/lib/python3.13/site-packages/torch/_dynamo/variables/functions.py:1692: UserWarning: Dynamo detected a call to a `functools.lru_cache`-wrapped function. Dynamo ignores the cache wrapper and directly traces the wrapped function. Silent incorrectness is only a *potential* risk, not something we have observed. Enable TORCH_LOGS="+dynamo" for a DEBUG stack trace.
|
| 289 |
+
torch._dynamo.utils.warn_once(msg)
|
| 290 |
+
/mnt/scratch/home/carlos/leman-labs/flame/.venv/lib/python3.13/site-packages/torch/_dynamo/variables/functions.py:1598: UserWarning: Dynamo does not know how to trace the builtin `cuda_utils.get_device_properties.` This function is either a Python builtin (e.g. _warnings.warn) or a third-party C/C++ Python extension (perhaps created with pybind).
|
| 291 |
+
If it is a Python builtin, please file an issue on GitHub so the PyTorch team can add support for it and see the next case for a workaround.
|
| 292 |
+
If it is a third-party C/C++ Python extension, please either wrap it into a PyTorch-understood custom operator (see https://pytorch.org/tutorials/advanced/custom_ops_landing_page.html for more details) or, if it is traceable, use `torch.compiler.allow_in_graph`.
|
| 293 |
+
torch._dynamo.utils.warn_once(explanation + "\n" + "\n".join(hints))
|
| 294 |
+
[titan] 2026-01-02 12:31:16,003 - root - INFO - [31mstep: 1 [32mloss: 14.3857 [33mmemory: 65.22GiB(82.29%) [34mtps: 157 [36mtflops: 14.35 [35mmfu: 4.60%[39m
|
| 295 |
+
[titan] 2026-01-02 12:31:16,003 - root - INFO - [34mlr: 7.8125e-07 gnorm: 129.00 [35m[ 0:06:57<148 days, 12:29:41][39m
|
| 296 |
+
[titan] 2026-01-02 12:31:16,003 - root - INFO - Saving the checkpoint (or staging if async is enabled).
|
| 297 |
+
[titan] 2026-01-02 12:31:45,774 - root - INFO - [GC] GC collection invoked by checkpointer. 0.40 seconds.
|
| 298 |
+
[titan] 2026-01-02 12:31:45,775 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 29.77 seconds.
|
| 299 |
+
[titan] 2026-01-02 12:31:45,775 - root - INFO - Synchronizing and adjusting timeout for all ProcessGroups to 0:01:40
|
| 300 |
+
[titan] 2026-01-02 12:32:27,287 - root - INFO - [31mstep: 2 [32mloss: 14.3989 [33mmemory: 69.12GiB(87.21%) [34mtps: 919 [36mtflops: 84.11 [35mmfu: 26.96%[39m
|
| 301 |
+
[titan] 2026-01-02 12:32:27,288 - root - INFO - [34mlr: 1.1719e-06 gnorm: 127.00 [35m[ 0:08:09<86 days, 22:18:58][39m
|
| 302 |
+
[titan] 2026-01-02 12:33:08,758 - root - INFO - [31mstep: 3 [32mloss: 14.3929 [33mmemory: 69.12GiB(87.21%) [34mtps: 1,580 [36mtflops: 144.59 [35mmfu: 46.34%[39m
|
| 303 |
+
[titan] 2026-01-02 12:33:08,758 - root - INFO - [34mlr: 1.5625e-06 gnorm: 126.00 [35m[ 0:08:50<62 days, 20:46:52][39m
|
| 304 |
+
[titan] 2026-01-02 12:33:50,300 - root - INFO - [31mstep: 4 [32mloss: 14.2932 [33mmemory: 69.12GiB(87.21%) [34mtps: 1,578 [36mtflops: 144.34 [35mmfu: 46.26%[39m
|
| 305 |
+
[titan] 2026-01-02 12:33:50,300 - root - INFO - [34mlr: 1.9531e-06 gnorm: 128.00 [35m[ 0:09:32<50 days, 20:09:32][39m
|
| 306 |
+
[titan] 2026-01-02 12:34:31,898 - root - INFO - [31mstep: 5 [32mloss: 14.2689 [33mmemory: 69.12GiB(87.21%) [34mtps: 1,575 [36mtflops: 144.14 [35mmfu: 46.20%[39m
|
| 307 |
+
[titan] 2026-01-02 12:34:31,898 - root - INFO - [34mlr: 2.3438e-06 gnorm: 124.00 [35m[ 0:10:13<43 days, 15:04:36][39m
|
| 308 |
+
[titan] 2026-01-02 12:35:13,509 - root - INFO - [31mstep: 6 [32mloss: 13.9979 [33mmemory: 69.12GiB(87.21%) [34mtps: 1,575 [36mtflops: 144.10 [35mmfu: 46.18%[39m
|
| 309 |
+
[titan] 2026-01-02 12:35:13,509 - root - INFO - [34mlr: 2.7344e-06 gnorm: 117.00 [35m[ 0:10:55<38 days, 19:42:14][39m
|
| 310 |
+
[titan] 2026-01-02 12:35:55,155 - root - INFO - [31mstep: 7 [32mloss: 13.8167 [33mmemory: 69.12GiB(87.21%) [34mtps: 1,574 [36mtflops: 143.98 [35mmfu: 46.15%[39m
|
| 311 |
+
[titan] 2026-01-02 12:35:55,155 - root - INFO - [34mlr: 3.1250e-06 gnorm: 113.00 [35m[ 0:11:36<35 days, 9:20:01][39m
|
| 312 |
+
[titan] 2026-01-02 12:36:36,792 - root - INFO - [31mstep: 8 [32mloss: 13.5683 [33mmemory: 69.12GiB(87.21%) [34mtps: 1,574 [36mtflops: 144.01 [35mmfu: 46.16%[39m
|
| 313 |
+
[titan] 2026-01-02 12:36:36,792 - root - INFO - [34mlr: 3.5156e-06 gnorm: 106.50 [35m[ 0:12:18<32 days, 19:32:37][39m
|
| 314 |
+
[titan] 2026-01-02 12:37:18,455 - root - INFO - [31mstep: 9 [32mloss: 13.3760 [33mmemory: 69.12GiB(87.21%) [34mtps: 1,573 [36mtflops: 143.92 [35mmfu: 46.13%[39m
|
| 315 |
+
[titan] 2026-01-02 12:37:18,456 - root - INFO - [34mlr: 3.9063e-06 gnorm: 101.00 [35m[ 0:13:00<30 days, 19:30:24][39m
|
| 316 |
+
[titan] 2026-01-02 12:38:00,109 - root - INFO - [31mstep: 10 [32mloss: 13.1097 [33mmemory: 69.12GiB(87.21%) [34mtps: 1,573 [36mtflops: 143.95 [35mmfu: 46.14%[39m
|
| 317 |
+
[titan] 2026-01-02 12:38:00,110 - root - INFO - [34mlr: 4.2969e-06 gnorm: 94.50 [35m[ 0:13:41<29 days, 5:04:02][39m
|
| 318 |
+
[titan] 2026-01-02 12:38:41,790 - root - INFO - [31mstep: 11 [32mloss: 12.5536 [33mmemory: 69.12GiB(87.21%) [34mtps: 1,572 [36mtflops: 143.86 [35mmfu: 46.11%[39m
|
| 319 |
+
[titan] 2026-01-02 12:38:41,790 - root - INFO - [34mlr: 4.6875e-06 gnorm: 82.00 [35m[ 0:14:23<27 days, 21:38:05][39m
|
| 320 |
+
[titan] 2026-01-02 12:39:23,448 - root - INFO - [31mstep: 12 [32mloss: 12.0247 [33mmemory: 69.12GiB(87.21%) [34mtps: 1,573 [36mtflops: 143.93 [35mmfu: 46.13%[39m
|
| 321 |
+
[titan] 2026-01-02 12:39:23,449 - root - INFO - [34mlr: 5.0781e-06 gnorm: 71.50 [35m[ 0:15:05<26 days, 19:25:26][39m
|
| 322 |
+
[titan] 2026-01-02 12:40:05,132 - root - INFO - [31mstep: 13 [32mloss: 11.6076 [33mmemory: 69.12GiB(87.21%) [34mtps: 1,572 [36mtflops: 143.85 [35mmfu: 46.11%[39m
|
| 323 |
+
[titan] 2026-01-02 12:40:05,132 - root - INFO - [34mlr: 5.4687e-06 gnorm: 68.50 [35m[ 0:15:46<25 days, 21:15:36][39m
|
| 324 |
+
[titan] 2026-01-02 12:40:46,797 - root - INFO - [31mstep: 14 [32mloss: 11.2488 [33mmemory: 69.12GiB(87.21%) [34mtps: 1,573 [36mtflops: 143.91 [35mmfu: 46.13%[39m
|
| 325 |
+
[titan] 2026-01-02 12:40:46,797 - root - INFO - [34mlr: 5.8594e-06 gnorm: 63.75 [35m[ 0:16:28<25 days, 2:14:58][39m
|
| 326 |
+
[titan] 2026-01-02 12:41:28,477 - root - INFO - [31mstep: 15 [32mloss: 10.9254 [33mmemory: 69.12GiB(87.21%) [34mtps: 1,572 [36mtflops: 143.86 [35mmfu: 46.11%[39m
|
| 327 |
+
[titan] 2026-01-02 12:41:28,477 - root - INFO - [34mlr: 6.2500e-06 gnorm: 55.50 [35m[ 0:17:10<24 days, 9:46:51][39m
|
| 328 |
+
[titan] 2026-01-02 12:42:10,134 - root - INFO - [31mstep: 16 [32mloss: 10.6961 [33mmemory: 69.12GiB(87.21%) [34mtps: 1,573 [36mtflops: 143.94 [35mmfu: 46.13%[39m
|
| 329 |
+
[titan] 2026-01-02 12:42:10,134 - root - INFO - [34mlr: 6.6406e-06 gnorm: 56.50 [35m[ 0:17:51<23 days, 19:21:25][39m
|
| 330 |
+
[titan] 2026-01-02 12:42:51,815 - root - INFO - [31mstep: 17 [32mloss: 10.3915 [33mmemory: 69.12GiB(87.21%) [34mtps: 1,572 [36mtflops: 143.86 [35mmfu: 46.11%[39m
|
| 331 |
+
[titan] 2026-01-02 12:42:51,815 - root - INFO - [34mlr: 7.0313e-06 gnorm: 42.75 [35m[ 0:18:33<23 days, 6:38:27][39m
|
| 332 |
+
[titan] 2026-01-02 12:43:33,461 - root - INFO - [31mstep: 18 [32mloss: 10.1740 [33mmemory: 69.12GiB(87.21%) [34mtps: 1,574 [36mtflops: 143.98 [35mmfu: 46.15%[39m
|
| 333 |
+
[titan] 2026-01-02 12:43:33,461 - root - INFO - [34mlr: 7.4219e-06 gnorm: 32.75 [35m[ 0:19:15<22 days, 19:19:10][39m
|
logs/none_rci5peh0/attempt_0/1/stderr.log
ADDED
|
@@ -0,0 +1,332 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[titan] 2026-01-02 12:21:12,075 - root - INFO - Starting job: default job
|
| 2 |
+
[titan] 2026-01-02 12:21:12,075 - root - INFO - [32m{
|
| 3 |
+
"activation_checkpoint": {
|
| 4 |
+
"mode": "none",
|
| 5 |
+
"selective_ac_option": "2"
|
| 6 |
+
},
|
| 7 |
+
"activation_offload": {
|
| 8 |
+
"mode": "none"
|
| 9 |
+
},
|
| 10 |
+
"checkpoint": {
|
| 11 |
+
"async_mode": "disabled",
|
| 12 |
+
"create_seed_checkpoint": false,
|
| 13 |
+
"enable_checkpoint": true,
|
| 14 |
+
"exclude_from_loading": [],
|
| 15 |
+
"export_dtype": "float32",
|
| 16 |
+
"folder": "checkpoint",
|
| 17 |
+
"initial_load_model_weights_only": true,
|
| 18 |
+
"initial_load_path": null,
|
| 19 |
+
"interval": 3072,
|
| 20 |
+
"interval_type": "steps",
|
| 21 |
+
"keep_latest_k": 0,
|
| 22 |
+
"last_save_model_weights_only": false,
|
| 23 |
+
"load_step": -1,
|
| 24 |
+
"model_weights_only": false
|
| 25 |
+
},
|
| 26 |
+
"comm": {
|
| 27 |
+
"init_timeout_seconds": 300,
|
| 28 |
+
"trace_buf_size": 20000,
|
| 29 |
+
"train_timeout_seconds": 100
|
| 30 |
+
},
|
| 31 |
+
"experimental": {
|
| 32 |
+
"context_parallel_degree": 1,
|
| 33 |
+
"context_parallel_rotate_method": "allgather",
|
| 34 |
+
"custom_model_path": "",
|
| 35 |
+
"enable_async_tensor_parallel": false,
|
| 36 |
+
"enable_compiled_autograd": false,
|
| 37 |
+
"pipeline_parallel_degree": 1,
|
| 38 |
+
"pipeline_parallel_microbatches": null,
|
| 39 |
+
"pipeline_parallel_schedule": "1F1B",
|
| 40 |
+
"pipeline_parallel_schedule_csv": "",
|
| 41 |
+
"pipeline_parallel_split_points": []
|
| 42 |
+
},
|
| 43 |
+
"fault_tolerance": {
|
| 44 |
+
"enable": false,
|
| 45 |
+
"group_size": 0,
|
| 46 |
+
"min_replica_size": 1,
|
| 47 |
+
"replica_id": 0
|
| 48 |
+
},
|
| 49 |
+
"float8": {
|
| 50 |
+
"enable_fsdp_float8_all_gather": false,
|
| 51 |
+
"force_recompute_fp8_weight_in_bwd": false,
|
| 52 |
+
"precompute_float8_dynamic_scale_for_fsdp": false,
|
| 53 |
+
"recipe_name": null
|
| 54 |
+
},
|
| 55 |
+
"job": {
|
| 56 |
+
"config_file": "flame/models/fla.toml",
|
| 57 |
+
"description": "default job",
|
| 58 |
+
"dump_folder": "/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B",
|
| 59 |
+
"print_args": true,
|
| 60 |
+
"use_for_integration_test": false
|
| 61 |
+
},
|
| 62 |
+
"lr_scheduler": {
|
| 63 |
+
"decay_ratio": null,
|
| 64 |
+
"decay_type": "cosine",
|
| 65 |
+
"lr_min": 0.1,
|
| 66 |
+
"warmup_steps": 1024
|
| 67 |
+
},
|
| 68 |
+
"memory_estimation": {
|
| 69 |
+
"disable_fake_mode": false,
|
| 70 |
+
"enabled": false
|
| 71 |
+
},
|
| 72 |
+
"metrics": {
|
| 73 |
+
"disable_color_printing": false,
|
| 74 |
+
"enable_tensorboard": false,
|
| 75 |
+
"enable_wandb": true,
|
| 76 |
+
"log_freq": 1,
|
| 77 |
+
"save_for_all_ranks": false,
|
| 78 |
+
"save_tb_folder": "tb"
|
| 79 |
+
},
|
| 80 |
+
"model": {
|
| 81 |
+
"config": "/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/config.json",
|
| 82 |
+
"converters": [],
|
| 83 |
+
"name": "fla",
|
| 84 |
+
"print_after_conversion": false,
|
| 85 |
+
"tokenizer_path": "/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B"
|
| 86 |
+
},
|
| 87 |
+
"optimizer": {
|
| 88 |
+
"beta1": 0.9,
|
| 89 |
+
"beta2": 0.95,
|
| 90 |
+
"early_step_in_backward": false,
|
| 91 |
+
"eps": 1e-15,
|
| 92 |
+
"implementation": "fused",
|
| 93 |
+
"lr": 0.0004,
|
| 94 |
+
"name": "AdamW",
|
| 95 |
+
"weight_decay": 0.1
|
| 96 |
+
},
|
| 97 |
+
"profiling": {
|
| 98 |
+
"enable_memory_snapshot": false,
|
| 99 |
+
"enable_profiling": true,
|
| 100 |
+
"profile_freq": 512,
|
| 101 |
+
"save_memory_snapshot_folder": "memory_snapshot",
|
| 102 |
+
"save_traces_folder": "profile_trace"
|
| 103 |
+
},
|
| 104 |
+
"training": {
|
| 105 |
+
"batch_size": 2,
|
| 106 |
+
"compile": true,
|
| 107 |
+
"context_len": 2048,
|
| 108 |
+
"data_dir": null,
|
| 109 |
+
"data_files": null,
|
| 110 |
+
"data_parallel_replicate_degree": 1,
|
| 111 |
+
"data_parallel_shard_degree": 8,
|
| 112 |
+
"data_probs": null,
|
| 113 |
+
"dataset": "/mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu",
|
| 114 |
+
"dataset_name": "default",
|
| 115 |
+
"dataset_split": "train",
|
| 116 |
+
"deterministic": false,
|
| 117 |
+
"disable_loss_parallel": true,
|
| 118 |
+
"enable_cpu_offload": false,
|
| 119 |
+
"fsdp_reshard_after_forward": "default",
|
| 120 |
+
"gc_freq": 50,
|
| 121 |
+
"gradient_accumulation_steps": 16,
|
| 122 |
+
"max_norm": 1.0,
|
| 123 |
+
"mixed_precision_param": "bfloat16",
|
| 124 |
+
"mixed_precision_reduce": "float32",
|
| 125 |
+
"num_workers": 8,
|
| 126 |
+
"persistent_workers": false,
|
| 127 |
+
"pin_memory": false,
|
| 128 |
+
"prefetch_factor": 2,
|
| 129 |
+
"seed": 42,
|
| 130 |
+
"seq_len": 2048,
|
| 131 |
+
"skip_nan_inf": true,
|
| 132 |
+
"steps": 30720,
|
| 133 |
+
"streaming": true,
|
| 134 |
+
"tensor_parallel_degree": 1,
|
| 135 |
+
"varlen": false
|
| 136 |
+
}
|
| 137 |
+
}[39m
|
| 138 |
+
[titan] 2026-01-02 12:21:12,076 - root - INFO - [GC] Initial GC collection. 0.00 seconds.
|
| 139 |
+
[titan] 2026-01-02 12:21:13,372 - root - WARNING - ENV[TORCH_NCCL_ASYNC_ERROR_HANDLING] = 1 will be overridden to 3 based on job config
|
| 140 |
+
[titan] 2026-01-02 12:21:13,376 - root - INFO - CUDA capacity: NVIDIA A100-SXM4-80GB with 79.25GiB memory
|
| 141 |
+
[titan] 2026-01-02 12:21:13,378 - root - WARNING - Error running lspci: [Errno 2] No such file or directory: 'lspci', fallback to use device_name
|
| 142 |
+
[titan] 2026-01-02 12:21:13,378 - root - INFO - Peak FLOPS used for computing MFU: 3.120e+14
|
| 143 |
+
[titan] 2026-01-02 12:21:13,378 - root - INFO - Building 1-D device mesh with ['dp_shard'], [8]
|
| 144 |
+
[titan] 2026-01-02 12:21:13,424 - root - INFO - Loading tokenizer...
|
| 145 |
+
The tokenizer you are loading from '/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.
|
| 146 |
+
[titan] 2026-01-02 12:21:13,768 - root - INFO - Qwen2TokenizerFast(name_or_path='/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B', vocab_size=151643, model_max_length=10000000000, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'eos_token': '<|im_end|>', 'pad_token': '<|endoftext|>', 'additional_special_tokens': ['<|im_start|>', '<|im_end|>', '<|object_ref_start|>', '<|object_ref_end|>', '<|box_start|>', '<|box_end|>', '<|quad_start|>', '<|quad_end|>', '<|vision_start|>', '<|vision_end|>', '<|vision_pad|>', '<|image_pad|>', '<|video_pad|>']}, clean_up_tokenization_spaces=False, added_tokens_decoder={
|
| 147 |
+
151643: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 148 |
+
151644: AddedToken("<|im_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 149 |
+
151645: AddedToken("<|im_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 150 |
+
151646: AddedToken("<|object_ref_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 151 |
+
151647: AddedToken("<|object_ref_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 152 |
+
151648: AddedToken("<|box_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 153 |
+
151649: AddedToken("<|box_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 154 |
+
151650: AddedToken("<|quad_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 155 |
+
151651: AddedToken("<|quad_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 156 |
+
151652: AddedToken("<|vision_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 157 |
+
151653: AddedToken("<|vision_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 158 |
+
151654: AddedToken("<|vision_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 159 |
+
151655: AddedToken("<|image_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 160 |
+
151656: AddedToken("<|video_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 161 |
+
151657: AddedToken("<tool_call>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 162 |
+
151658: AddedToken("</tool_call>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 163 |
+
151659: AddedToken("<|fim_prefix|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 164 |
+
151660: AddedToken("<|fim_middle|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 165 |
+
151661: AddedToken("<|fim_suffix|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 166 |
+
151662: AddedToken("<|fim_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 167 |
+
151663: AddedToken("<|repo_name|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 168 |
+
151664: AddedToken("<|file_sep|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 169 |
+
151665: AddedToken("<tool_response>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 170 |
+
151666: AddedToken("</tool_response>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 171 |
+
151667: AddedToken("<think>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 172 |
+
151668: AddedToken("</think>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 173 |
+
}
|
| 174 |
+
)
|
| 175 |
+
[titan] 2026-01-02 12:21:13,768 - root - INFO - Loading dataset /mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu:default
|
| 176 |
+
`trust_remote_code` is not supported anymore.
|
| 177 |
+
Please check that the Hugging Face dataset '/mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu' isn't based on a loading script and remove `trust_remote_code`.
|
| 178 |
+
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
|
| 179 |
+
[titan] 2026-01-02 12:21:13,768 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
|
| 180 |
+
Please check that the Hugging Face dataset '/mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu' isn't based on a loading script and remove `trust_remote_code`.
|
| 181 |
+
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
|
| 182 |
+
[titan] 2026-01-02 12:21:14,322 - root - INFO - Shuffling the dataset with seed 42
|
| 183 |
+
[titan] 2026-01-02 12:21:14,324 - root - INFO - IterableDataset({
|
| 184 |
+
features: ['text', 'id', 'dump', 'url', 'file_path', 'language', 'language_score', 'token_count', 'score', 'int_score'],
|
| 185 |
+
num_shards: 360
|
| 186 |
+
})
|
| 187 |
+
[titan] 2026-01-02 12:21:14,324 - root - INFO - Building dataloader...
|
| 188 |
+
[titan] 2026-01-02 12:21:14,325 - root - INFO - Loading model config from /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/config.json
|
| 189 |
+
[titan] 2026-01-02 12:21:14,327 - root - INFO - Building model from the config
|
| 190 |
+
[32mGSAConfig {
|
| 191 |
+
"architectures": [
|
| 192 |
+
"GSAForCausalLM"
|
| 193 |
+
],
|
| 194 |
+
"attn": null,
|
| 195 |
+
"bos_token_id": 151643,
|
| 196 |
+
"clamp_max": null,
|
| 197 |
+
"clamp_min": null,
|
| 198 |
+
"conv_size": 4,
|
| 199 |
+
"dtype": "bfloat16",
|
| 200 |
+
"elementwise_affine": false,
|
| 201 |
+
"eos_token_id": 151645,
|
| 202 |
+
"expand_k": 1,
|
| 203 |
+
"expand_v": 1,
|
| 204 |
+
"feature_map": "swish",
|
| 205 |
+
"fuse_cross_entropy": true,
|
| 206 |
+
"fuse_linear_cross_entropy": false,
|
| 207 |
+
"fuse_norm": true,
|
| 208 |
+
"fuse_swiglu": true,
|
| 209 |
+
"gate_logit_normalizer": 8,
|
| 210 |
+
"hidden_act": "swish",
|
| 211 |
+
"hidden_ratio": 4,
|
| 212 |
+
"hidden_size": 5120,
|
| 213 |
+
"initializer_range": 0.02,
|
| 214 |
+
"intermediate_size": 17408,
|
| 215 |
+
"max_position_embeddings": 40960,
|
| 216 |
+
"model_type": "gsa",
|
| 217 |
+
"norm_eps": 1e-06,
|
| 218 |
+
"num_heads": 40,
|
| 219 |
+
"num_hidden_layers": 40,
|
| 220 |
+
"num_kv_heads": 8,
|
| 221 |
+
"num_slots": 256,
|
| 222 |
+
"rope_theta": 1000000,
|
| 223 |
+
"share_conv_kernel": true,
|
| 224 |
+
"tie_word_embeddings": true,
|
| 225 |
+
"transformers_version": "4.57.3",
|
| 226 |
+
"use_cache": true,
|
| 227 |
+
"use_l2warp": false,
|
| 228 |
+
"use_norm": true,
|
| 229 |
+
"use_output_gate": true,
|
| 230 |
+
"use_rope": false,
|
| 231 |
+
"use_short_conv": false,
|
| 232 |
+
"vocab_size": 151936
|
| 233 |
+
}
|
| 234 |
+
[39m
|
| 235 |
+
[titan] 2026-01-02 12:21:14,480 - root - INFO - [34m
|
| 236 |
+
GSAForCausalLM(
|
| 237 |
+
(model): GSAModel(
|
| 238 |
+
(embeddings): Embedding(151936, 5120)
|
| 239 |
+
(layers): ModuleList(
|
| 240 |
+
(0-39): 40 x GSABlock(
|
| 241 |
+
(attn_norm): RMSNorm(5120, eps=1e-06)
|
| 242 |
+
(attn): GatedSlotAttention(
|
| 243 |
+
(feature_map): SwishFeatureMap()
|
| 244 |
+
(q_proj): Linear(in_features=5120, out_features=5120, bias=False)
|
| 245 |
+
(k_proj): Linear(in_features=5120, out_features=1024, bias=False)
|
| 246 |
+
(v_proj): Linear(in_features=5120, out_features=1024, bias=False)
|
| 247 |
+
(f_proj): Linear(in_features=5120, out_features=2048, bias=False)
|
| 248 |
+
(g_norm): RMSNorm(5120, elementwise_affine=False, eps=1e-06)
|
| 249 |
+
(o_proj): Linear(in_features=5120, out_features=5120, bias=False)
|
| 250 |
+
)
|
| 251 |
+
(mlp_norm): RMSNorm(5120, eps=1e-06)
|
| 252 |
+
(mlp): GatedMLP(
|
| 253 |
+
(gate_proj): Linear(in_features=5120, out_features=17408, bias=False)
|
| 254 |
+
(up_proj): Linear(in_features=5120, out_features=17408, bias=False)
|
| 255 |
+
(down_proj): Linear(in_features=17408, out_features=5120, bias=False)
|
| 256 |
+
(swiglu_linear): SwiGLULinear()
|
| 257 |
+
)
|
| 258 |
+
)
|
| 259 |
+
)
|
| 260 |
+
(norm): RMSNorm(5120, eps=1e-06)
|
| 261 |
+
)
|
| 262 |
+
(lm_head): Linear(in_features=5120, out_features=151936, bias=False)
|
| 263 |
+
)[39m
|
| 264 |
+
|
| 265 |
+
[titan] 2026-01-02 12:21:14,537 - root - INFO - Compiling each block with torch.compile
|
| 266 |
+
[titan] 2026-01-02 12:21:14,537 - root - INFO - Compiling the embedding, norm, and lm_head layers with torch.compile
|
| 267 |
+
[titan] 2026-01-02 12:21:14,538 - root - INFO - Compiling the entire model with torch.compile
|
| 268 |
+
[titan] 2026-01-02 12:21:14,682 - root - INFO - Applied FSDP to the model
|
| 269 |
+
[titan] 2026-01-02 12:21:15,137 - root - INFO - CUDA memory usage for model: 3.56GiB(4.49%)
|
| 270 |
+
[titan] 2026-01-02 12:21:15,184 - root - INFO - Checkpointing active. Checkpoints will be loaded from and saved to /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/checkpoint
|
| 271 |
+
[titan] 2026-01-02 12:21:15,184 - root - INFO - Loading the checkpoint from /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/checkpoint/step-0.
|
| 272 |
+
[titan] 2026-01-02 12:24:11,331 - root - INFO - [GC] GC collection for checkpoint loading. 0.01 seconds.
|
| 273 |
+
[titan] 2026-01-02 12:24:11,331 - root - INFO - Finished loading the checkpoint in 176.15 seconds.
|
| 274 |
+
[titan] 2026-01-02 12:24:11,349 - root - INFO - CUDA capacity: NVIDIA A100-SXM4-80GB with 79.25GiB memory
|
| 275 |
+
[titan] 2026-01-02 12:24:11,351 - root - WARNING - Error running lspci: [Errno 2] No such file or directory: 'lspci', fallback to use device_name
|
| 276 |
+
[titan] 2026-01-02 12:24:11,553 - root - INFO - Mixed precision training is handled by fully_shard
|
| 277 |
+
[titan] 2026-01-02 12:24:11,553 - root - INFO - [31m***** Running training *****[39m
|
| 278 |
+
[titan] 2026-01-02 12:24:11,553 - root - INFO - [32m Training starts at step 1
|
| 279 |
+
[titan] 2026-01-02 12:24:11,553 - root - INFO - [32m Number of tokens per sequence = 2,048
|
| 280 |
+
[titan] 2026-01-02 12:24:11,554 - root - INFO - [32m Gradient Accumulation steps = 16
|
| 281 |
+
[titan] 2026-01-02 12:24:11,554 - root - INFO - [32m Instantaneous batch size (per device) = 2
|
| 282 |
+
[titan] 2026-01-02 12:24:11,554 - root - INFO - [32m Global batch size (w. parallel, distributed & accumulation) = 256 (524,288 tokens)
|
| 283 |
+
[titan] 2026-01-02 12:24:11,554 - root - INFO - [32m Total optimization steps = 30,720 (16,106,127,360 tokens)
|
| 284 |
+
[titan] 2026-01-02 12:24:11,554 - root - INFO - [32m Warmup steps = 1,024 (536,870,912 tokens)
|
| 285 |
+
[titan] 2026-01-02 12:24:11,554 - root - INFO - [32m Number of parameters = 14,409,815,040 [39m
|
| 286 |
+
[titan] 2026-01-02 12:24:11,554 - root - INFO - Profiling active. Traces will be saved at /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/profile_trace
|
| 287 |
+
/mnt/scratch/home/carlos/leman-labs/flame/.venv/lib/python3.13/site-packages/torch/_dynamo/variables/functions.py:1692: UserWarning: Dynamo detected a call to a `functools.lru_cache`-wrapped function. Dynamo ignores the cache wrapper and directly traces the wrapped function. Silent incorrectness is only a *potential* risk, not something we have observed. Enable TORCH_LOGS="+dynamo" for a DEBUG stack trace.
|
| 288 |
+
torch._dynamo.utils.warn_once(msg)
|
| 289 |
+
/mnt/scratch/home/carlos/leman-labs/flame/.venv/lib/python3.13/site-packages/torch/_dynamo/variables/functions.py:1598: UserWarning: Dynamo does not know how to trace the builtin `cuda_utils.get_device_properties.` This function is either a Python builtin (e.g. _warnings.warn) or a third-party C/C++ Python extension (perhaps created with pybind).
|
| 290 |
+
If it is a Python builtin, please file an issue on GitHub so the PyTorch team can add support for it and see the next case for a workaround.
|
| 291 |
+
If it is a third-party C/C++ Python extension, please either wrap it into a PyTorch-understood custom operator (see https://pytorch.org/tutorials/advanced/custom_ops_landing_page.html for more details) or, if it is traceable, use `torch.compiler.allow_in_graph`.
|
| 292 |
+
torch._dynamo.utils.warn_once(explanation + "\n" + "\n".join(hints))
|
| 293 |
+
[titan] 2026-01-02 12:31:16,003 - root - INFO - [31mstep: 1 [32mloss: 14.3857 [33mmemory: 65.22GiB(82.29%) [34mtps: 154 [36mtflops: 14.12 [35mmfu: 4.53%[39m
|
| 294 |
+
[titan] 2026-01-02 12:31:16,003 - root - INFO - [34mlr: 7.8125e-07 gnorm: 129.00 [35m[ 0:07:04<150 days, 23:34:10][39m
|
| 295 |
+
[titan] 2026-01-02 12:31:16,003 - root - INFO - Saving the checkpoint (or staging if async is enabled).
|
| 296 |
+
[titan] 2026-01-02 12:31:45,818 - root - INFO - [GC] GC collection invoked by checkpointer. 0.44 seconds.
|
| 297 |
+
[titan] 2026-01-02 12:31:45,818 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 29.82 seconds.
|
| 298 |
+
[titan] 2026-01-02 12:31:45,819 - root - INFO - Synchronizing and adjusting timeout for all ProcessGroups to 0:01:40
|
| 299 |
+
[titan] 2026-01-02 12:32:27,287 - root - INFO - [31mstep: 2 [32mloss: 14.3989 [33mmemory: 69.11GiB(87.20%) [34mtps: 919 [36mtflops: 84.11 [35mmfu: 26.96%[39m
|
| 300 |
+
[titan] 2026-01-02 12:32:27,288 - root - INFO - [34mlr: 1.1719e-06 gnorm: 127.00 [35m[ 0:08:15<88 days, 3:51:10][39m
|
| 301 |
+
[titan] 2026-01-02 12:33:08,758 - root - INFO - [31mstep: 3 [32mloss: 14.3929 [33mmemory: 69.11GiB(87.20%) [34mtps: 1,580 [36mtflops: 144.59 [35mmfu: 46.34%[39m
|
| 302 |
+
[titan] 2026-01-02 12:33:08,759 - root - INFO - [34mlr: 1.5625e-06 gnorm: 126.00 [35m[ 0:08:57<63 days, 16:28:16][39m
|
| 303 |
+
[titan] 2026-01-02 12:33:50,300 - root - INFO - [31mstep: 4 [32mloss: 14.2932 [33mmemory: 69.11GiB(87.20%) [34mtps: 1,578 [36mtflops: 144.34 [35mmfu: 46.26%[39m
|
| 304 |
+
[titan] 2026-01-02 12:33:50,300 - root - INFO - [34mlr: 1.9531e-06 gnorm: 128.00 [35m[ 0:09:38<51 days, 10:55:33][39m
|
| 305 |
+
[titan] 2026-01-02 12:34:31,898 - root - INFO - [31mstep: 5 [32mloss: 14.2689 [33mmemory: 69.11GiB(87.20%) [34mtps: 1,575 [36mtflops: 144.14 [35mmfu: 46.20%[39m
|
| 306 |
+
[titan] 2026-01-02 12:34:31,898 - root - INFO - [34mlr: 2.3438e-06 gnorm: 124.00 [35m[ 0:10:20<44 days, 2:53:23][39m
|
| 307 |
+
[titan] 2026-01-02 12:35:13,509 - root - INFO - [31mstep: 6 [32mloss: 13.9979 [33mmemory: 69.11GiB(87.20%) [34mtps: 1,575 [36mtflops: 144.10 [35mmfu: 46.19%[39m
|
| 308 |
+
[titan] 2026-01-02 12:35:13,509 - root - INFO - [34mlr: 2.7344e-06 gnorm: 117.00 [35m[ 0:11:02<39 days, 5:32:51][39m
|
| 309 |
+
[titan] 2026-01-02 12:35:55,155 - root - INFO - [31mstep: 7 [32mloss: 13.8167 [33mmemory: 69.11GiB(87.20%) [34mtps: 1,574 [36mtflops: 143.98 [35mmfu: 46.15%[39m
|
| 310 |
+
[titan] 2026-01-02 12:35:55,155 - root - INFO - [34mlr: 3.1250e-06 gnorm: 113.00 [35m[ 0:11:43<35 days, 17:46:14][39m
|
| 311 |
+
[titan] 2026-01-02 12:36:36,792 - root - INFO - [31mstep: 8 [32mloss: 13.5683 [33mmemory: 69.11GiB(87.20%) [34mtps: 1,574 [36mtflops: 144.01 [35mmfu: 46.16%[39m
|
| 312 |
+
[titan] 2026-01-02 12:36:36,792 - root - INFO - [34mlr: 3.5156e-06 gnorm: 106.50 [35m[ 0:12:25<33 days, 2:55:32][39m
|
| 313 |
+
[titan] 2026-01-02 12:37:18,455 - root - INFO - [31mstep: 9 [32mloss: 13.3760 [33mmemory: 69.11GiB(87.20%) [34mtps: 1,573 [36mtflops: 143.92 [35mmfu: 46.13%[39m
|
| 314 |
+
[titan] 2026-01-02 12:37:18,455 - root - INFO - [34mlr: 3.9063e-06 gnorm: 101.00 [35m[ 0:13:07<31 days, 2:04:07][39m
|
| 315 |
+
[titan] 2026-01-02 12:38:00,109 - root - INFO - [31mstep: 10 [32mloss: 13.1097 [33mmemory: 69.11GiB(87.20%) [34mtps: 1,573 [36mtflops: 143.95 [35mmfu: 46.14%[39m
|
| 316 |
+
[titan] 2026-01-02 12:38:00,110 - root - INFO - [34mlr: 4.2969e-06 gnorm: 94.50 [35m[ 0:13:48<29 days, 10:58:22][39m
|
| 317 |
+
[titan] 2026-01-02 12:38:41,790 - root - INFO - [31mstep: 11 [32mloss: 12.5536 [33mmemory: 69.11GiB(87.20%) [34mtps: 1,572 [36mtflops: 143.86 [35mmfu: 46.11%[39m
|
| 318 |
+
[titan] 2026-01-02 12:38:41,790 - root - INFO - [34mlr: 4.6875e-06 gnorm: 82.00 [35m[ 0:14:30<28 days, 3:00:12][39m
|
| 319 |
+
[titan] 2026-01-02 12:39:23,448 - root - INFO - [31mstep: 12 [32mloss: 12.0247 [33mmemory: 69.11GiB(87.20%) [34mtps: 1,573 [36mtflops: 143.93 [35mmfu: 46.13%[39m
|
| 320 |
+
[titan] 2026-01-02 12:39:23,448 - root - INFO - [34mlr: 5.0781e-06 gnorm: 71.50 [35m[ 0:15:12<27 days, 0:20:42][39m
|
| 321 |
+
[titan] 2026-01-02 12:40:05,132 - root - INFO - [31mstep: 13 [32mloss: 11.6076 [33mmemory: 69.11GiB(87.20%) [34mtps: 1,572 [36mtflops: 143.85 [35mmfu: 46.10%[39m
|
| 322 |
+
[titan] 2026-01-02 12:40:05,132 - root - INFO - [34mlr: 5.4687e-06 gnorm: 68.50 [35m[ 0:15:53<26 days, 1:48:09][39m
|
| 323 |
+
[titan] 2026-01-02 12:40:46,797 - root - INFO - [31mstep: 14 [32mloss: 11.2488 [33mmemory: 69.11GiB(87.20%) [34mtps: 1,573 [36mtflops: 143.91 [35mmfu: 46.13%[39m
|
| 324 |
+
[titan] 2026-01-02 12:40:46,797 - root - INFO - [34mlr: 5.8594e-06 gnorm: 63.75 [35m[ 0:16:35<25 days, 6:28:03][39m
|
| 325 |
+
[titan] 2026-01-02 12:41:28,477 - root - INFO - [31mstep: 15 [32mloss: 10.9254 [33mmemory: 69.11GiB(87.20%) [34mtps: 1,572 [36mtflops: 143.86 [35mmfu: 46.11%[39m
|
| 326 |
+
[titan] 2026-01-02 12:41:28,477 - root - INFO - [34mlr: 6.2500e-06 gnorm: 55.50 [35m[ 0:17:17<24 days, 13:43:02][39m
|
| 327 |
+
[titan] 2026-01-02 12:42:10,134 - root - INFO - [31mstep: 16 [32mloss: 10.6961 [33mmemory: 69.11GiB(87.20%) [34mtps: 1,573 [36mtflops: 143.94 [35mmfu: 46.13%[39m
|
| 328 |
+
[titan] 2026-01-02 12:42:10,134 - root - INFO - [34mlr: 6.6406e-06 gnorm: 56.50 [35m[ 0:17:58<23 days, 23:02:50][39m
|
| 329 |
+
[titan] 2026-01-02 12:42:51,815 - root - INFO - [31mstep: 17 [32mloss: 10.3915 [33mmemory: 69.11GiB(87.20%) [34mtps: 1,572 [36mtflops: 143.86 [35mmfu: 46.11%[39m
|
| 330 |
+
[titan] 2026-01-02 12:42:51,815 - root - INFO - [34mlr: 7.0313e-06 gnorm: 42.75 [35m[ 0:18:40<23 days, 10:06:50][39m
|
| 331 |
+
[titan] 2026-01-02 12:43:33,461 - root - INFO - [31mstep: 18 [32mloss: 10.1740 [33mmemory: 69.11GiB(87.20%) [34mtps: 1,574 [36mtflops: 143.98 [35mmfu: 46.15%[39m
|
| 332 |
+
[titan] 2026-01-02 12:43:33,461 - root - INFO - [34mlr: 7.4219e-06 gnorm: 32.75 [35m[ 0:19:22<22 days, 22:35:57][39m
|
logs/none_rci5peh0/attempt_0/1/stdout.log
ADDED
|
File without changes
|
logs/none_rci5peh0/attempt_0/2/stderr.log
ADDED
|
@@ -0,0 +1,332 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[titan] 2026-01-02 12:21:12,073 - root - INFO - Starting job: default job
|
| 2 |
+
[titan] 2026-01-02 12:21:12,073 - root - INFO - [32m{
|
| 3 |
+
"activation_checkpoint": {
|
| 4 |
+
"mode": "none",
|
| 5 |
+
"selective_ac_option": "2"
|
| 6 |
+
},
|
| 7 |
+
"activation_offload": {
|
| 8 |
+
"mode": "none"
|
| 9 |
+
},
|
| 10 |
+
"checkpoint": {
|
| 11 |
+
"async_mode": "disabled",
|
| 12 |
+
"create_seed_checkpoint": false,
|
| 13 |
+
"enable_checkpoint": true,
|
| 14 |
+
"exclude_from_loading": [],
|
| 15 |
+
"export_dtype": "float32",
|
| 16 |
+
"folder": "checkpoint",
|
| 17 |
+
"initial_load_model_weights_only": true,
|
| 18 |
+
"initial_load_path": null,
|
| 19 |
+
"interval": 3072,
|
| 20 |
+
"interval_type": "steps",
|
| 21 |
+
"keep_latest_k": 0,
|
| 22 |
+
"last_save_model_weights_only": false,
|
| 23 |
+
"load_step": -1,
|
| 24 |
+
"model_weights_only": false
|
| 25 |
+
},
|
| 26 |
+
"comm": {
|
| 27 |
+
"init_timeout_seconds": 300,
|
| 28 |
+
"trace_buf_size": 20000,
|
| 29 |
+
"train_timeout_seconds": 100
|
| 30 |
+
},
|
| 31 |
+
"experimental": {
|
| 32 |
+
"context_parallel_degree": 1,
|
| 33 |
+
"context_parallel_rotate_method": "allgather",
|
| 34 |
+
"custom_model_path": "",
|
| 35 |
+
"enable_async_tensor_parallel": false,
|
| 36 |
+
"enable_compiled_autograd": false,
|
| 37 |
+
"pipeline_parallel_degree": 1,
|
| 38 |
+
"pipeline_parallel_microbatches": null,
|
| 39 |
+
"pipeline_parallel_schedule": "1F1B",
|
| 40 |
+
"pipeline_parallel_schedule_csv": "",
|
| 41 |
+
"pipeline_parallel_split_points": []
|
| 42 |
+
},
|
| 43 |
+
"fault_tolerance": {
|
| 44 |
+
"enable": false,
|
| 45 |
+
"group_size": 0,
|
| 46 |
+
"min_replica_size": 1,
|
| 47 |
+
"replica_id": 0
|
| 48 |
+
},
|
| 49 |
+
"float8": {
|
| 50 |
+
"enable_fsdp_float8_all_gather": false,
|
| 51 |
+
"force_recompute_fp8_weight_in_bwd": false,
|
| 52 |
+
"precompute_float8_dynamic_scale_for_fsdp": false,
|
| 53 |
+
"recipe_name": null
|
| 54 |
+
},
|
| 55 |
+
"job": {
|
| 56 |
+
"config_file": "flame/models/fla.toml",
|
| 57 |
+
"description": "default job",
|
| 58 |
+
"dump_folder": "/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B",
|
| 59 |
+
"print_args": true,
|
| 60 |
+
"use_for_integration_test": false
|
| 61 |
+
},
|
| 62 |
+
"lr_scheduler": {
|
| 63 |
+
"decay_ratio": null,
|
| 64 |
+
"decay_type": "cosine",
|
| 65 |
+
"lr_min": 0.1,
|
| 66 |
+
"warmup_steps": 1024
|
| 67 |
+
},
|
| 68 |
+
"memory_estimation": {
|
| 69 |
+
"disable_fake_mode": false,
|
| 70 |
+
"enabled": false
|
| 71 |
+
},
|
| 72 |
+
"metrics": {
|
| 73 |
+
"disable_color_printing": false,
|
| 74 |
+
"enable_tensorboard": false,
|
| 75 |
+
"enable_wandb": true,
|
| 76 |
+
"log_freq": 1,
|
| 77 |
+
"save_for_all_ranks": false,
|
| 78 |
+
"save_tb_folder": "tb"
|
| 79 |
+
},
|
| 80 |
+
"model": {
|
| 81 |
+
"config": "/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/config.json",
|
| 82 |
+
"converters": [],
|
| 83 |
+
"name": "fla",
|
| 84 |
+
"print_after_conversion": false,
|
| 85 |
+
"tokenizer_path": "/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B"
|
| 86 |
+
},
|
| 87 |
+
"optimizer": {
|
| 88 |
+
"beta1": 0.9,
|
| 89 |
+
"beta2": 0.95,
|
| 90 |
+
"early_step_in_backward": false,
|
| 91 |
+
"eps": 1e-15,
|
| 92 |
+
"implementation": "fused",
|
| 93 |
+
"lr": 0.0004,
|
| 94 |
+
"name": "AdamW",
|
| 95 |
+
"weight_decay": 0.1
|
| 96 |
+
},
|
| 97 |
+
"profiling": {
|
| 98 |
+
"enable_memory_snapshot": false,
|
| 99 |
+
"enable_profiling": true,
|
| 100 |
+
"profile_freq": 512,
|
| 101 |
+
"save_memory_snapshot_folder": "memory_snapshot",
|
| 102 |
+
"save_traces_folder": "profile_trace"
|
| 103 |
+
},
|
| 104 |
+
"training": {
|
| 105 |
+
"batch_size": 2,
|
| 106 |
+
"compile": true,
|
| 107 |
+
"context_len": 2048,
|
| 108 |
+
"data_dir": null,
|
| 109 |
+
"data_files": null,
|
| 110 |
+
"data_parallel_replicate_degree": 1,
|
| 111 |
+
"data_parallel_shard_degree": 8,
|
| 112 |
+
"data_probs": null,
|
| 113 |
+
"dataset": "/mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu",
|
| 114 |
+
"dataset_name": "default",
|
| 115 |
+
"dataset_split": "train",
|
| 116 |
+
"deterministic": false,
|
| 117 |
+
"disable_loss_parallel": true,
|
| 118 |
+
"enable_cpu_offload": false,
|
| 119 |
+
"fsdp_reshard_after_forward": "default",
|
| 120 |
+
"gc_freq": 50,
|
| 121 |
+
"gradient_accumulation_steps": 16,
|
| 122 |
+
"max_norm": 1.0,
|
| 123 |
+
"mixed_precision_param": "bfloat16",
|
| 124 |
+
"mixed_precision_reduce": "float32",
|
| 125 |
+
"num_workers": 8,
|
| 126 |
+
"persistent_workers": false,
|
| 127 |
+
"pin_memory": false,
|
| 128 |
+
"prefetch_factor": 2,
|
| 129 |
+
"seed": 42,
|
| 130 |
+
"seq_len": 2048,
|
| 131 |
+
"skip_nan_inf": true,
|
| 132 |
+
"steps": 30720,
|
| 133 |
+
"streaming": true,
|
| 134 |
+
"tensor_parallel_degree": 1,
|
| 135 |
+
"varlen": false
|
| 136 |
+
}
|
| 137 |
+
}[39m
|
| 138 |
+
[titan] 2026-01-02 12:21:12,073 - root - INFO - [GC] Initial GC collection. 0.00 seconds.
|
| 139 |
+
[titan] 2026-01-02 12:21:13,343 - root - WARNING - ENV[TORCH_NCCL_ASYNC_ERROR_HANDLING] = 1 will be overridden to 3 based on job config
|
| 140 |
+
[titan] 2026-01-02 12:21:13,353 - root - INFO - CUDA capacity: NVIDIA A100-SXM4-80GB with 79.25GiB memory
|
| 141 |
+
[titan] 2026-01-02 12:21:13,355 - root - WARNING - Error running lspci: [Errno 2] No such file or directory: 'lspci', fallback to use device_name
|
| 142 |
+
[titan] 2026-01-02 12:21:13,355 - root - INFO - Peak FLOPS used for computing MFU: 3.120e+14
|
| 143 |
+
[titan] 2026-01-02 12:21:13,356 - root - INFO - Building 1-D device mesh with ['dp_shard'], [8]
|
| 144 |
+
[titan] 2026-01-02 12:21:13,419 - root - INFO - Loading tokenizer...
|
| 145 |
+
The tokenizer you are loading from '/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.
|
| 146 |
+
[titan] 2026-01-02 12:21:13,761 - root - INFO - Qwen2TokenizerFast(name_or_path='/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B', vocab_size=151643, model_max_length=10000000000, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'eos_token': '<|im_end|>', 'pad_token': '<|endoftext|>', 'additional_special_tokens': ['<|im_start|>', '<|im_end|>', '<|object_ref_start|>', '<|object_ref_end|>', '<|box_start|>', '<|box_end|>', '<|quad_start|>', '<|quad_end|>', '<|vision_start|>', '<|vision_end|>', '<|vision_pad|>', '<|image_pad|>', '<|video_pad|>']}, clean_up_tokenization_spaces=False, added_tokens_decoder={
|
| 147 |
+
151643: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 148 |
+
151644: AddedToken("<|im_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 149 |
+
151645: AddedToken("<|im_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 150 |
+
151646: AddedToken("<|object_ref_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 151 |
+
151647: AddedToken("<|object_ref_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 152 |
+
151648: AddedToken("<|box_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 153 |
+
151649: AddedToken("<|box_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 154 |
+
151650: AddedToken("<|quad_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 155 |
+
151651: AddedToken("<|quad_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 156 |
+
151652: AddedToken("<|vision_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 157 |
+
151653: AddedToken("<|vision_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 158 |
+
151654: AddedToken("<|vision_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 159 |
+
151655: AddedToken("<|image_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 160 |
+
151656: AddedToken("<|video_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 161 |
+
151657: AddedToken("<tool_call>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 162 |
+
151658: AddedToken("</tool_call>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 163 |
+
151659: AddedToken("<|fim_prefix|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 164 |
+
151660: AddedToken("<|fim_middle|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 165 |
+
151661: AddedToken("<|fim_suffix|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 166 |
+
151662: AddedToken("<|fim_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 167 |
+
151663: AddedToken("<|repo_name|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 168 |
+
151664: AddedToken("<|file_sep|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 169 |
+
151665: AddedToken("<tool_response>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 170 |
+
151666: AddedToken("</tool_response>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 171 |
+
151667: AddedToken("<think>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 172 |
+
151668: AddedToken("</think>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 173 |
+
}
|
| 174 |
+
)
|
| 175 |
+
[titan] 2026-01-02 12:21:13,761 - root - INFO - Loading dataset /mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu:default
|
| 176 |
+
`trust_remote_code` is not supported anymore.
|
| 177 |
+
Please check that the Hugging Face dataset '/mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu' isn't based on a loading script and remove `trust_remote_code`.
|
| 178 |
+
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
|
| 179 |
+
[titan] 2026-01-02 12:21:13,761 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
|
| 180 |
+
Please check that the Hugging Face dataset '/mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu' isn't based on a loading script and remove `trust_remote_code`.
|
| 181 |
+
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
|
| 182 |
+
[titan] 2026-01-02 12:21:14,322 - root - INFO - Shuffling the dataset with seed 42
|
| 183 |
+
[titan] 2026-01-02 12:21:14,324 - root - INFO - IterableDataset({
|
| 184 |
+
features: ['text', 'id', 'dump', 'url', 'file_path', 'language', 'language_score', 'token_count', 'score', 'int_score'],
|
| 185 |
+
num_shards: 360
|
| 186 |
+
})
|
| 187 |
+
[titan] 2026-01-02 12:21:14,324 - root - INFO - Building dataloader...
|
| 188 |
+
[titan] 2026-01-02 12:21:14,325 - root - INFO - Loading model config from /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/config.json
|
| 189 |
+
[titan] 2026-01-02 12:21:14,327 - root - INFO - Building model from the config
|
| 190 |
+
[32mGSAConfig {
|
| 191 |
+
"architectures": [
|
| 192 |
+
"GSAForCausalLM"
|
| 193 |
+
],
|
| 194 |
+
"attn": null,
|
| 195 |
+
"bos_token_id": 151643,
|
| 196 |
+
"clamp_max": null,
|
| 197 |
+
"clamp_min": null,
|
| 198 |
+
"conv_size": 4,
|
| 199 |
+
"dtype": "bfloat16",
|
| 200 |
+
"elementwise_affine": false,
|
| 201 |
+
"eos_token_id": 151645,
|
| 202 |
+
"expand_k": 1,
|
| 203 |
+
"expand_v": 1,
|
| 204 |
+
"feature_map": "swish",
|
| 205 |
+
"fuse_cross_entropy": true,
|
| 206 |
+
"fuse_linear_cross_entropy": false,
|
| 207 |
+
"fuse_norm": true,
|
| 208 |
+
"fuse_swiglu": true,
|
| 209 |
+
"gate_logit_normalizer": 8,
|
| 210 |
+
"hidden_act": "swish",
|
| 211 |
+
"hidden_ratio": 4,
|
| 212 |
+
"hidden_size": 5120,
|
| 213 |
+
"initializer_range": 0.02,
|
| 214 |
+
"intermediate_size": 17408,
|
| 215 |
+
"max_position_embeddings": 40960,
|
| 216 |
+
"model_type": "gsa",
|
| 217 |
+
"norm_eps": 1e-06,
|
| 218 |
+
"num_heads": 40,
|
| 219 |
+
"num_hidden_layers": 40,
|
| 220 |
+
"num_kv_heads": 8,
|
| 221 |
+
"num_slots": 256,
|
| 222 |
+
"rope_theta": 1000000,
|
| 223 |
+
"share_conv_kernel": true,
|
| 224 |
+
"tie_word_embeddings": true,
|
| 225 |
+
"transformers_version": "4.57.3",
|
| 226 |
+
"use_cache": true,
|
| 227 |
+
"use_l2warp": false,
|
| 228 |
+
"use_norm": true,
|
| 229 |
+
"use_output_gate": true,
|
| 230 |
+
"use_rope": false,
|
| 231 |
+
"use_short_conv": false,
|
| 232 |
+
"vocab_size": 151936
|
| 233 |
+
}
|
| 234 |
+
[39m
|
| 235 |
+
[titan] 2026-01-02 12:21:14,479 - root - INFO - [34m
|
| 236 |
+
GSAForCausalLM(
|
| 237 |
+
(model): GSAModel(
|
| 238 |
+
(embeddings): Embedding(151936, 5120)
|
| 239 |
+
(layers): ModuleList(
|
| 240 |
+
(0-39): 40 x GSABlock(
|
| 241 |
+
(attn_norm): RMSNorm(5120, eps=1e-06)
|
| 242 |
+
(attn): GatedSlotAttention(
|
| 243 |
+
(feature_map): SwishFeatureMap()
|
| 244 |
+
(q_proj): Linear(in_features=5120, out_features=5120, bias=False)
|
| 245 |
+
(k_proj): Linear(in_features=5120, out_features=1024, bias=False)
|
| 246 |
+
(v_proj): Linear(in_features=5120, out_features=1024, bias=False)
|
| 247 |
+
(f_proj): Linear(in_features=5120, out_features=2048, bias=False)
|
| 248 |
+
(g_norm): RMSNorm(5120, elementwise_affine=False, eps=1e-06)
|
| 249 |
+
(o_proj): Linear(in_features=5120, out_features=5120, bias=False)
|
| 250 |
+
)
|
| 251 |
+
(mlp_norm): RMSNorm(5120, eps=1e-06)
|
| 252 |
+
(mlp): GatedMLP(
|
| 253 |
+
(gate_proj): Linear(in_features=5120, out_features=17408, bias=False)
|
| 254 |
+
(up_proj): Linear(in_features=5120, out_features=17408, bias=False)
|
| 255 |
+
(down_proj): Linear(in_features=17408, out_features=5120, bias=False)
|
| 256 |
+
(swiglu_linear): SwiGLULinear()
|
| 257 |
+
)
|
| 258 |
+
)
|
| 259 |
+
)
|
| 260 |
+
(norm): RMSNorm(5120, eps=1e-06)
|
| 261 |
+
)
|
| 262 |
+
(lm_head): Linear(in_features=5120, out_features=151936, bias=False)
|
| 263 |
+
)[39m
|
| 264 |
+
|
| 265 |
+
[titan] 2026-01-02 12:21:14,535 - root - INFO - Compiling each block with torch.compile
|
| 266 |
+
[titan] 2026-01-02 12:21:14,535 - root - INFO - Compiling the embedding, norm, and lm_head layers with torch.compile
|
| 267 |
+
[titan] 2026-01-02 12:21:14,536 - root - INFO - Compiling the entire model with torch.compile
|
| 268 |
+
[titan] 2026-01-02 12:21:14,677 - root - INFO - Applied FSDP to the model
|
| 269 |
+
[titan] 2026-01-02 12:21:15,133 - root - INFO - CUDA memory usage for model: 3.56GiB(4.49%)
|
| 270 |
+
[titan] 2026-01-02 12:21:15,180 - root - INFO - Checkpointing active. Checkpoints will be loaded from and saved to /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/checkpoint
|
| 271 |
+
[titan] 2026-01-02 12:21:15,181 - root - INFO - Loading the checkpoint from /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/checkpoint/step-0.
|
| 272 |
+
[titan] 2026-01-02 12:24:11,330 - root - INFO - [GC] GC collection for checkpoint loading. 0.01 seconds.
|
| 273 |
+
[titan] 2026-01-02 12:24:11,330 - root - INFO - Finished loading the checkpoint in 176.15 seconds.
|
| 274 |
+
[titan] 2026-01-02 12:24:11,348 - root - INFO - CUDA capacity: NVIDIA A100-SXM4-80GB with 79.25GiB memory
|
| 275 |
+
[titan] 2026-01-02 12:24:11,351 - root - WARNING - Error running lspci: [Errno 2] No such file or directory: 'lspci', fallback to use device_name
|
| 276 |
+
[titan] 2026-01-02 12:24:11,564 - root - INFO - Mixed precision training is handled by fully_shard
|
| 277 |
+
[titan] 2026-01-02 12:24:11,565 - root - INFO - [31m***** Running training *****[39m
|
| 278 |
+
[titan] 2026-01-02 12:24:11,565 - root - INFO - [32m Training starts at step 1
|
| 279 |
+
[titan] 2026-01-02 12:24:11,565 - root - INFO - [32m Number of tokens per sequence = 2,048
|
| 280 |
+
[titan] 2026-01-02 12:24:11,565 - root - INFO - [32m Gradient Accumulation steps = 16
|
| 281 |
+
[titan] 2026-01-02 12:24:11,565 - root - INFO - [32m Instantaneous batch size (per device) = 2
|
| 282 |
+
[titan] 2026-01-02 12:24:11,565 - root - INFO - [32m Global batch size (w. parallel, distributed & accumulation) = 256 (524,288 tokens)
|
| 283 |
+
[titan] 2026-01-02 12:24:11,565 - root - INFO - [32m Total optimization steps = 30,720 (16,106,127,360 tokens)
|
| 284 |
+
[titan] 2026-01-02 12:24:11,565 - root - INFO - [32m Warmup steps = 1,024 (536,870,912 tokens)
|
| 285 |
+
[titan] 2026-01-02 12:24:11,565 - root - INFO - [32m Number of parameters = 14,409,815,040 [39m
|
| 286 |
+
[titan] 2026-01-02 12:24:11,566 - root - INFO - Profiling active. Traces will be saved at /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/profile_trace
|
| 287 |
+
/mnt/scratch/home/carlos/leman-labs/flame/.venv/lib/python3.13/site-packages/torch/_dynamo/variables/functions.py:1692: UserWarning: Dynamo detected a call to a `functools.lru_cache`-wrapped function. Dynamo ignores the cache wrapper and directly traces the wrapped function. Silent incorrectness is only a *potential* risk, not something we have observed. Enable TORCH_LOGS="+dynamo" for a DEBUG stack trace.
|
| 288 |
+
torch._dynamo.utils.warn_once(msg)
|
| 289 |
+
/mnt/scratch/home/carlos/leman-labs/flame/.venv/lib/python3.13/site-packages/torch/_dynamo/variables/functions.py:1598: UserWarning: Dynamo does not know how to trace the builtin `cuda_utils.get_device_properties.` This function is either a Python builtin (e.g. _warnings.warn) or a third-party C/C++ Python extension (perhaps created with pybind).
|
| 290 |
+
If it is a Python builtin, please file an issue on GitHub so the PyTorch team can add support for it and see the next case for a workaround.
|
| 291 |
+
If it is a third-party C/C++ Python extension, please either wrap it into a PyTorch-understood custom operator (see https://pytorch.org/tutorials/advanced/custom_ops_landing_page.html for more details) or, if it is traceable, use `torch.compiler.allow_in_graph`.
|
| 292 |
+
torch._dynamo.utils.warn_once(explanation + "\n" + "\n".join(hints))
|
| 293 |
+
[titan] 2026-01-02 12:31:16,003 - root - INFO - [31mstep: 1 [32mloss: 14.3857 [33mmemory: 65.22GiB(82.29%) [34mtps: 154 [36mtflops: 14.12 [35mmfu: 4.53%[39m
|
| 294 |
+
[titan] 2026-01-02 12:31:16,003 - root - INFO - [34mlr: 7.8125e-07 gnorm: 129.00 [35m[ 0:07:04<150 days, 23:34:19][39m
|
| 295 |
+
[titan] 2026-01-02 12:31:16,003 - root - INFO - Saving the checkpoint (or staging if async is enabled).
|
| 296 |
+
[titan] 2026-01-02 12:31:45,830 - root - INFO - [GC] GC collection invoked by checkpointer. 0.45 seconds.
|
| 297 |
+
[titan] 2026-01-02 12:31:45,830 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 29.83 seconds.
|
| 298 |
+
[titan] 2026-01-02 12:31:45,830 - root - INFO - Synchronizing and adjusting timeout for all ProcessGroups to 0:01:40
|
| 299 |
+
[titan] 2026-01-02 12:32:27,287 - root - INFO - [31mstep: 2 [32mloss: 14.3989 [33mmemory: 69.11GiB(87.20%) [34mtps: 919 [36mtflops: 84.11 [35mmfu: 26.96%[39m
|
| 300 |
+
[titan] 2026-01-02 12:32:27,288 - root - INFO - [34mlr: 1.1719e-06 gnorm: 127.00 [35m[ 0:08:15<88 days, 3:51:14][39m
|
| 301 |
+
[titan] 2026-01-02 12:33:08,758 - root - INFO - [31mstep: 3 [32mloss: 14.3929 [33mmemory: 69.11GiB(87.20%) [34mtps: 1,580 [36mtflops: 144.59 [35mmfu: 46.34%[39m
|
| 302 |
+
[titan] 2026-01-02 12:33:08,759 - root - INFO - [34mlr: 1.5625e-06 gnorm: 126.00 [35m[ 0:08:57<63 days, 16:28:19][39m
|
| 303 |
+
[titan] 2026-01-02 12:33:50,300 - root - INFO - [31mstep: 4 [32mloss: 14.2932 [33mmemory: 69.11GiB(87.20%) [34mtps: 1,578 [36mtflops: 144.34 [35mmfu: 46.26%[39m
|
| 304 |
+
[titan] 2026-01-02 12:33:50,300 - root - INFO - [34mlr: 1.9531e-06 gnorm: 128.00 [35m[ 0:09:38<51 days, 10:55:35][39m
|
| 305 |
+
[titan] 2026-01-02 12:34:31,898 - root - INFO - [31mstep: 5 [32mloss: 14.2689 [33mmemory: 69.11GiB(87.20%) [34mtps: 1,575 [36mtflops: 144.14 [35mmfu: 46.20%[39m
|
| 306 |
+
[titan] 2026-01-02 12:34:31,898 - root - INFO - [34mlr: 2.3438e-06 gnorm: 124.00 [35m[ 0:10:20<44 days, 2:53:25][39m
|
| 307 |
+
[titan] 2026-01-02 12:35:13,509 - root - INFO - [31mstep: 6 [32mloss: 13.9979 [33mmemory: 69.11GiB(87.20%) [34mtps: 1,575 [36mtflops: 144.10 [35mmfu: 46.19%[39m
|
| 308 |
+
[titan] 2026-01-02 12:35:13,509 - root - INFO - [34mlr: 2.7344e-06 gnorm: 117.00 [35m[ 0:11:02<39 days, 5:32:53][39m
|
| 309 |
+
[titan] 2026-01-02 12:35:55,155 - root - INFO - [31mstep: 7 [32mloss: 13.8167 [33mmemory: 69.11GiB(87.20%) [34mtps: 1,574 [36mtflops: 143.98 [35mmfu: 46.15%[39m
|
| 310 |
+
[titan] 2026-01-02 12:35:55,155 - root - INFO - [34mlr: 3.1250e-06 gnorm: 113.00 [35m[ 0:11:43<35 days, 17:46:17][39m
|
| 311 |
+
[titan] 2026-01-02 12:36:36,792 - root - INFO - [31mstep: 8 [32mloss: 13.5683 [33mmemory: 69.11GiB(87.20%) [34mtps: 1,574 [36mtflops: 144.01 [35mmfu: 46.16%[39m
|
| 312 |
+
[titan] 2026-01-02 12:36:36,792 - root - INFO - [34mlr: 3.5156e-06 gnorm: 106.50 [35m[ 0:12:25<33 days, 2:55:35][39m
|
| 313 |
+
[titan] 2026-01-02 12:37:18,455 - root - INFO - [31mstep: 9 [32mloss: 13.3760 [33mmemory: 69.11GiB(87.20%) [34mtps: 1,573 [36mtflops: 143.92 [35mmfu: 46.13%[39m
|
| 314 |
+
[titan] 2026-01-02 12:37:18,455 - root - INFO - [34mlr: 3.9063e-06 gnorm: 101.00 [35m[ 0:13:07<31 days, 2:04:10][39m
|
| 315 |
+
[titan] 2026-01-02 12:38:00,109 - root - INFO - [31mstep: 10 [32mloss: 13.1097 [33mmemory: 69.11GiB(87.20%) [34mtps: 1,573 [36mtflops: 143.95 [35mmfu: 46.14%[39m
|
| 316 |
+
[titan] 2026-01-02 12:38:00,110 - root - INFO - [34mlr: 4.2969e-06 gnorm: 94.50 [35m[ 0:13:48<29 days, 10:58:25][39m
|
| 317 |
+
[titan] 2026-01-02 12:38:41,790 - root - INFO - [31mstep: 11 [32mloss: 12.5536 [33mmemory: 69.11GiB(87.20%) [34mtps: 1,572 [36mtflops: 143.86 [35mmfu: 46.11%[39m
|
| 318 |
+
[titan] 2026-01-02 12:38:41,790 - root - INFO - [34mlr: 4.6875e-06 gnorm: 82.00 [35m[ 0:14:30<28 days, 3:00:16][39m
|
| 319 |
+
[titan] 2026-01-02 12:39:23,449 - root - INFO - [31mstep: 12 [32mloss: 12.0247 [33mmemory: 69.11GiB(87.20%) [34mtps: 1,573 [36mtflops: 143.93 [35mmfu: 46.13%[39m
|
| 320 |
+
[titan] 2026-01-02 12:39:23,449 - root - INFO - [34mlr: 5.0781e-06 gnorm: 71.50 [35m[ 0:15:12<27 days, 0:20:45][39m
|
| 321 |
+
[titan] 2026-01-02 12:40:05,132 - root - INFO - [31mstep: 13 [32mloss: 11.6076 [33mmemory: 69.11GiB(87.20%) [34mtps: 1,572 [36mtflops: 143.85 [35mmfu: 46.11%[39m
|
| 322 |
+
[titan] 2026-01-02 12:40:05,132 - root - INFO - [34mlr: 5.4687e-06 gnorm: 68.50 [35m[ 0:15:53<26 days, 1:48:11][39m
|
| 323 |
+
[titan] 2026-01-02 12:40:46,797 - root - INFO - [31mstep: 14 [32mloss: 11.2488 [33mmemory: 69.11GiB(87.20%) [34mtps: 1,573 [36mtflops: 143.91 [35mmfu: 46.13%[39m
|
| 324 |
+
[titan] 2026-01-02 12:40:46,797 - root - INFO - [34mlr: 5.8594e-06 gnorm: 63.75 [35m[ 0:16:35<25 days, 6:28:05][39m
|
| 325 |
+
[titan] 2026-01-02 12:41:28,477 - root - INFO - [31mstep: 15 [32mloss: 10.9254 [33mmemory: 69.11GiB(87.20%) [34mtps: 1,572 [36mtflops: 143.86 [35mmfu: 46.11%[39m
|
| 326 |
+
[titan] 2026-01-02 12:41:28,477 - root - INFO - [34mlr: 6.2500e-06 gnorm: 55.50 [35m[ 0:17:17<24 days, 13:43:04][39m
|
| 327 |
+
[titan] 2026-01-02 12:42:10,134 - root - INFO - [31mstep: 16 [32mloss: 10.6961 [33mmemory: 69.11GiB(87.20%) [34mtps: 1,573 [36mtflops: 143.94 [35mmfu: 46.13%[39m
|
| 328 |
+
[titan] 2026-01-02 12:42:10,134 - root - INFO - [34mlr: 6.6406e-06 gnorm: 56.50 [35m[ 0:17:58<23 days, 23:02:52][39m
|
| 329 |
+
[titan] 2026-01-02 12:42:51,815 - root - INFO - [31mstep: 17 [32mloss: 10.3915 [33mmemory: 69.11GiB(87.20%) [34mtps: 1,572 [36mtflops: 143.85 [35mmfu: 46.11%[39m
|
| 330 |
+
[titan] 2026-01-02 12:42:51,816 - root - INFO - [34mlr: 7.0313e-06 gnorm: 42.75 [35m[ 0:18:40<23 days, 10:06:52][39m
|
| 331 |
+
[titan] 2026-01-02 12:43:33,461 - root - INFO - [31mstep: 18 [32mloss: 10.1740 [33mmemory: 69.11GiB(87.20%) [34mtps: 1,574 [36mtflops: 143.98 [35mmfu: 46.15%[39m
|
| 332 |
+
[titan] 2026-01-02 12:43:33,461 - root - INFO - [34mlr: 7.4219e-06 gnorm: 32.75 [35m[ 0:19:22<22 days, 22:36:00][39m
|
logs/none_rci5peh0/attempt_0/2/stdout.log
ADDED
|
File without changes
|
logs/none_rci5peh0/attempt_0/3/stderr.log
ADDED
|
@@ -0,0 +1,332 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[titan] 2026-01-02 12:21:12,073 - root - INFO - Starting job: default job
|
| 2 |
+
[titan] 2026-01-02 12:21:12,073 - root - INFO - [32m{
|
| 3 |
+
"activation_checkpoint": {
|
| 4 |
+
"mode": "none",
|
| 5 |
+
"selective_ac_option": "2"
|
| 6 |
+
},
|
| 7 |
+
"activation_offload": {
|
| 8 |
+
"mode": "none"
|
| 9 |
+
},
|
| 10 |
+
"checkpoint": {
|
| 11 |
+
"async_mode": "disabled",
|
| 12 |
+
"create_seed_checkpoint": false,
|
| 13 |
+
"enable_checkpoint": true,
|
| 14 |
+
"exclude_from_loading": [],
|
| 15 |
+
"export_dtype": "float32",
|
| 16 |
+
"folder": "checkpoint",
|
| 17 |
+
"initial_load_model_weights_only": true,
|
| 18 |
+
"initial_load_path": null,
|
| 19 |
+
"interval": 3072,
|
| 20 |
+
"interval_type": "steps",
|
| 21 |
+
"keep_latest_k": 0,
|
| 22 |
+
"last_save_model_weights_only": false,
|
| 23 |
+
"load_step": -1,
|
| 24 |
+
"model_weights_only": false
|
| 25 |
+
},
|
| 26 |
+
"comm": {
|
| 27 |
+
"init_timeout_seconds": 300,
|
| 28 |
+
"trace_buf_size": 20000,
|
| 29 |
+
"train_timeout_seconds": 100
|
| 30 |
+
},
|
| 31 |
+
"experimental": {
|
| 32 |
+
"context_parallel_degree": 1,
|
| 33 |
+
"context_parallel_rotate_method": "allgather",
|
| 34 |
+
"custom_model_path": "",
|
| 35 |
+
"enable_async_tensor_parallel": false,
|
| 36 |
+
"enable_compiled_autograd": false,
|
| 37 |
+
"pipeline_parallel_degree": 1,
|
| 38 |
+
"pipeline_parallel_microbatches": null,
|
| 39 |
+
"pipeline_parallel_schedule": "1F1B",
|
| 40 |
+
"pipeline_parallel_schedule_csv": "",
|
| 41 |
+
"pipeline_parallel_split_points": []
|
| 42 |
+
},
|
| 43 |
+
"fault_tolerance": {
|
| 44 |
+
"enable": false,
|
| 45 |
+
"group_size": 0,
|
| 46 |
+
"min_replica_size": 1,
|
| 47 |
+
"replica_id": 0
|
| 48 |
+
},
|
| 49 |
+
"float8": {
|
| 50 |
+
"enable_fsdp_float8_all_gather": false,
|
| 51 |
+
"force_recompute_fp8_weight_in_bwd": false,
|
| 52 |
+
"precompute_float8_dynamic_scale_for_fsdp": false,
|
| 53 |
+
"recipe_name": null
|
| 54 |
+
},
|
| 55 |
+
"job": {
|
| 56 |
+
"config_file": "flame/models/fla.toml",
|
| 57 |
+
"description": "default job",
|
| 58 |
+
"dump_folder": "/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B",
|
| 59 |
+
"print_args": true,
|
| 60 |
+
"use_for_integration_test": false
|
| 61 |
+
},
|
| 62 |
+
"lr_scheduler": {
|
| 63 |
+
"decay_ratio": null,
|
| 64 |
+
"decay_type": "cosine",
|
| 65 |
+
"lr_min": 0.1,
|
| 66 |
+
"warmup_steps": 1024
|
| 67 |
+
},
|
| 68 |
+
"memory_estimation": {
|
| 69 |
+
"disable_fake_mode": false,
|
| 70 |
+
"enabled": false
|
| 71 |
+
},
|
| 72 |
+
"metrics": {
|
| 73 |
+
"disable_color_printing": false,
|
| 74 |
+
"enable_tensorboard": false,
|
| 75 |
+
"enable_wandb": true,
|
| 76 |
+
"log_freq": 1,
|
| 77 |
+
"save_for_all_ranks": false,
|
| 78 |
+
"save_tb_folder": "tb"
|
| 79 |
+
},
|
| 80 |
+
"model": {
|
| 81 |
+
"config": "/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/config.json",
|
| 82 |
+
"converters": [],
|
| 83 |
+
"name": "fla",
|
| 84 |
+
"print_after_conversion": false,
|
| 85 |
+
"tokenizer_path": "/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B"
|
| 86 |
+
},
|
| 87 |
+
"optimizer": {
|
| 88 |
+
"beta1": 0.9,
|
| 89 |
+
"beta2": 0.95,
|
| 90 |
+
"early_step_in_backward": false,
|
| 91 |
+
"eps": 1e-15,
|
| 92 |
+
"implementation": "fused",
|
| 93 |
+
"lr": 0.0004,
|
| 94 |
+
"name": "AdamW",
|
| 95 |
+
"weight_decay": 0.1
|
| 96 |
+
},
|
| 97 |
+
"profiling": {
|
| 98 |
+
"enable_memory_snapshot": false,
|
| 99 |
+
"enable_profiling": true,
|
| 100 |
+
"profile_freq": 512,
|
| 101 |
+
"save_memory_snapshot_folder": "memory_snapshot",
|
| 102 |
+
"save_traces_folder": "profile_trace"
|
| 103 |
+
},
|
| 104 |
+
"training": {
|
| 105 |
+
"batch_size": 2,
|
| 106 |
+
"compile": true,
|
| 107 |
+
"context_len": 2048,
|
| 108 |
+
"data_dir": null,
|
| 109 |
+
"data_files": null,
|
| 110 |
+
"data_parallel_replicate_degree": 1,
|
| 111 |
+
"data_parallel_shard_degree": 8,
|
| 112 |
+
"data_probs": null,
|
| 113 |
+
"dataset": "/mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu",
|
| 114 |
+
"dataset_name": "default",
|
| 115 |
+
"dataset_split": "train",
|
| 116 |
+
"deterministic": false,
|
| 117 |
+
"disable_loss_parallel": true,
|
| 118 |
+
"enable_cpu_offload": false,
|
| 119 |
+
"fsdp_reshard_after_forward": "default",
|
| 120 |
+
"gc_freq": 50,
|
| 121 |
+
"gradient_accumulation_steps": 16,
|
| 122 |
+
"max_norm": 1.0,
|
| 123 |
+
"mixed_precision_param": "bfloat16",
|
| 124 |
+
"mixed_precision_reduce": "float32",
|
| 125 |
+
"num_workers": 8,
|
| 126 |
+
"persistent_workers": false,
|
| 127 |
+
"pin_memory": false,
|
| 128 |
+
"prefetch_factor": 2,
|
| 129 |
+
"seed": 42,
|
| 130 |
+
"seq_len": 2048,
|
| 131 |
+
"skip_nan_inf": true,
|
| 132 |
+
"steps": 30720,
|
| 133 |
+
"streaming": true,
|
| 134 |
+
"tensor_parallel_degree": 1,
|
| 135 |
+
"varlen": false
|
| 136 |
+
}
|
| 137 |
+
}[39m
|
| 138 |
+
[titan] 2026-01-02 12:21:12,073 - root - INFO - [GC] Initial GC collection. 0.00 seconds.
|
| 139 |
+
[titan] 2026-01-02 12:21:13,356 - root - WARNING - ENV[TORCH_NCCL_ASYNC_ERROR_HANDLING] = 1 will be overridden to 3 based on job config
|
| 140 |
+
[titan] 2026-01-02 12:21:13,358 - root - INFO - CUDA capacity: NVIDIA A100-SXM4-80GB with 79.25GiB memory
|
| 141 |
+
[titan] 2026-01-02 12:21:13,360 - root - WARNING - Error running lspci: [Errno 2] No such file or directory: 'lspci', fallback to use device_name
|
| 142 |
+
[titan] 2026-01-02 12:21:13,360 - root - INFO - Peak FLOPS used for computing MFU: 3.120e+14
|
| 143 |
+
[titan] 2026-01-02 12:21:13,360 - root - INFO - Building 1-D device mesh with ['dp_shard'], [8]
|
| 144 |
+
[titan] 2026-01-02 12:21:13,419 - root - INFO - Loading tokenizer...
|
| 145 |
+
The tokenizer you are loading from '/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.
|
| 146 |
+
[titan] 2026-01-02 12:21:13,765 - root - INFO - Qwen2TokenizerFast(name_or_path='/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B', vocab_size=151643, model_max_length=10000000000, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'eos_token': '<|im_end|>', 'pad_token': '<|endoftext|>', 'additional_special_tokens': ['<|im_start|>', '<|im_end|>', '<|object_ref_start|>', '<|object_ref_end|>', '<|box_start|>', '<|box_end|>', '<|quad_start|>', '<|quad_end|>', '<|vision_start|>', '<|vision_end|>', '<|vision_pad|>', '<|image_pad|>', '<|video_pad|>']}, clean_up_tokenization_spaces=False, added_tokens_decoder={
|
| 147 |
+
151643: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 148 |
+
151644: AddedToken("<|im_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 149 |
+
151645: AddedToken("<|im_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 150 |
+
151646: AddedToken("<|object_ref_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 151 |
+
151647: AddedToken("<|object_ref_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 152 |
+
151648: AddedToken("<|box_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 153 |
+
151649: AddedToken("<|box_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 154 |
+
151650: AddedToken("<|quad_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 155 |
+
151651: AddedToken("<|quad_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 156 |
+
151652: AddedToken("<|vision_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 157 |
+
151653: AddedToken("<|vision_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 158 |
+
151654: AddedToken("<|vision_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 159 |
+
151655: AddedToken("<|image_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 160 |
+
151656: AddedToken("<|video_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 161 |
+
151657: AddedToken("<tool_call>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 162 |
+
151658: AddedToken("</tool_call>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 163 |
+
151659: AddedToken("<|fim_prefix|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 164 |
+
151660: AddedToken("<|fim_middle|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 165 |
+
151661: AddedToken("<|fim_suffix|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 166 |
+
151662: AddedToken("<|fim_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 167 |
+
151663: AddedToken("<|repo_name|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 168 |
+
151664: AddedToken("<|file_sep|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 169 |
+
151665: AddedToken("<tool_response>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 170 |
+
151666: AddedToken("</tool_response>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 171 |
+
151667: AddedToken("<think>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 172 |
+
151668: AddedToken("</think>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 173 |
+
}
|
| 174 |
+
)
|
| 175 |
+
[titan] 2026-01-02 12:21:13,765 - root - INFO - Loading dataset /mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu:default
|
| 176 |
+
`trust_remote_code` is not supported anymore.
|
| 177 |
+
Please check that the Hugging Face dataset '/mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu' isn't based on a loading script and remove `trust_remote_code`.
|
| 178 |
+
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
|
| 179 |
+
[titan] 2026-01-02 12:21:13,765 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
|
| 180 |
+
Please check that the Hugging Face dataset '/mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu' isn't based on a loading script and remove `trust_remote_code`.
|
| 181 |
+
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
|
| 182 |
+
[titan] 2026-01-02 12:21:14,322 - root - INFO - Shuffling the dataset with seed 42
|
| 183 |
+
[titan] 2026-01-02 12:21:14,323 - root - INFO - IterableDataset({
|
| 184 |
+
features: ['text', 'id', 'dump', 'url', 'file_path', 'language', 'language_score', 'token_count', 'score', 'int_score'],
|
| 185 |
+
num_shards: 360
|
| 186 |
+
})
|
| 187 |
+
[titan] 2026-01-02 12:21:14,323 - root - INFO - Building dataloader...
|
| 188 |
+
[titan] 2026-01-02 12:21:14,325 - root - INFO - Loading model config from /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/config.json
|
| 189 |
+
[titan] 2026-01-02 12:21:14,326 - root - INFO - Building model from the config
|
| 190 |
+
[32mGSAConfig {
|
| 191 |
+
"architectures": [
|
| 192 |
+
"GSAForCausalLM"
|
| 193 |
+
],
|
| 194 |
+
"attn": null,
|
| 195 |
+
"bos_token_id": 151643,
|
| 196 |
+
"clamp_max": null,
|
| 197 |
+
"clamp_min": null,
|
| 198 |
+
"conv_size": 4,
|
| 199 |
+
"dtype": "bfloat16",
|
| 200 |
+
"elementwise_affine": false,
|
| 201 |
+
"eos_token_id": 151645,
|
| 202 |
+
"expand_k": 1,
|
| 203 |
+
"expand_v": 1,
|
| 204 |
+
"feature_map": "swish",
|
| 205 |
+
"fuse_cross_entropy": true,
|
| 206 |
+
"fuse_linear_cross_entropy": false,
|
| 207 |
+
"fuse_norm": true,
|
| 208 |
+
"fuse_swiglu": true,
|
| 209 |
+
"gate_logit_normalizer": 8,
|
| 210 |
+
"hidden_act": "swish",
|
| 211 |
+
"hidden_ratio": 4,
|
| 212 |
+
"hidden_size": 5120,
|
| 213 |
+
"initializer_range": 0.02,
|
| 214 |
+
"intermediate_size": 17408,
|
| 215 |
+
"max_position_embeddings": 40960,
|
| 216 |
+
"model_type": "gsa",
|
| 217 |
+
"norm_eps": 1e-06,
|
| 218 |
+
"num_heads": 40,
|
| 219 |
+
"num_hidden_layers": 40,
|
| 220 |
+
"num_kv_heads": 8,
|
| 221 |
+
"num_slots": 256,
|
| 222 |
+
"rope_theta": 1000000,
|
| 223 |
+
"share_conv_kernel": true,
|
| 224 |
+
"tie_word_embeddings": true,
|
| 225 |
+
"transformers_version": "4.57.3",
|
| 226 |
+
"use_cache": true,
|
| 227 |
+
"use_l2warp": false,
|
| 228 |
+
"use_norm": true,
|
| 229 |
+
"use_output_gate": true,
|
| 230 |
+
"use_rope": false,
|
| 231 |
+
"use_short_conv": false,
|
| 232 |
+
"vocab_size": 151936
|
| 233 |
+
}
|
| 234 |
+
[39m
|
| 235 |
+
[titan] 2026-01-02 12:21:14,479 - root - INFO - [34m
|
| 236 |
+
GSAForCausalLM(
|
| 237 |
+
(model): GSAModel(
|
| 238 |
+
(embeddings): Embedding(151936, 5120)
|
| 239 |
+
(layers): ModuleList(
|
| 240 |
+
(0-39): 40 x GSABlock(
|
| 241 |
+
(attn_norm): RMSNorm(5120, eps=1e-06)
|
| 242 |
+
(attn): GatedSlotAttention(
|
| 243 |
+
(feature_map): SwishFeatureMap()
|
| 244 |
+
(q_proj): Linear(in_features=5120, out_features=5120, bias=False)
|
| 245 |
+
(k_proj): Linear(in_features=5120, out_features=1024, bias=False)
|
| 246 |
+
(v_proj): Linear(in_features=5120, out_features=1024, bias=False)
|
| 247 |
+
(f_proj): Linear(in_features=5120, out_features=2048, bias=False)
|
| 248 |
+
(g_norm): RMSNorm(5120, elementwise_affine=False, eps=1e-06)
|
| 249 |
+
(o_proj): Linear(in_features=5120, out_features=5120, bias=False)
|
| 250 |
+
)
|
| 251 |
+
(mlp_norm): RMSNorm(5120, eps=1e-06)
|
| 252 |
+
(mlp): GatedMLP(
|
| 253 |
+
(gate_proj): Linear(in_features=5120, out_features=17408, bias=False)
|
| 254 |
+
(up_proj): Linear(in_features=5120, out_features=17408, bias=False)
|
| 255 |
+
(down_proj): Linear(in_features=17408, out_features=5120, bias=False)
|
| 256 |
+
(swiglu_linear): SwiGLULinear()
|
| 257 |
+
)
|
| 258 |
+
)
|
| 259 |
+
)
|
| 260 |
+
(norm): RMSNorm(5120, eps=1e-06)
|
| 261 |
+
)
|
| 262 |
+
(lm_head): Linear(in_features=5120, out_features=151936, bias=False)
|
| 263 |
+
)[39m
|
| 264 |
+
|
| 265 |
+
[titan] 2026-01-02 12:21:14,535 - root - INFO - Compiling each block with torch.compile
|
| 266 |
+
[titan] 2026-01-02 12:21:14,535 - root - INFO - Compiling the embedding, norm, and lm_head layers with torch.compile
|
| 267 |
+
[titan] 2026-01-02 12:21:14,536 - root - INFO - Compiling the entire model with torch.compile
|
| 268 |
+
[titan] 2026-01-02 12:21:14,677 - root - INFO - Applied FSDP to the model
|
| 269 |
+
[titan] 2026-01-02 12:21:15,135 - root - INFO - CUDA memory usage for model: 3.56GiB(4.49%)
|
| 270 |
+
[titan] 2026-01-02 12:21:15,181 - root - INFO - Checkpointing active. Checkpoints will be loaded from and saved to /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/checkpoint
|
| 271 |
+
[titan] 2026-01-02 12:21:15,182 - root - INFO - Loading the checkpoint from /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/checkpoint/step-0.
|
| 272 |
+
[titan] 2026-01-02 12:24:11,330 - root - INFO - [GC] GC collection for checkpoint loading. 0.01 seconds.
|
| 273 |
+
[titan] 2026-01-02 12:24:11,330 - root - INFO - Finished loading the checkpoint in 176.15 seconds.
|
| 274 |
+
[titan] 2026-01-02 12:24:11,348 - root - INFO - CUDA capacity: NVIDIA A100-SXM4-80GB with 79.25GiB memory
|
| 275 |
+
[titan] 2026-01-02 12:24:11,351 - root - WARNING - Error running lspci: [Errno 2] No such file or directory: 'lspci', fallback to use device_name
|
| 276 |
+
[titan] 2026-01-02 12:24:11,564 - root - INFO - Mixed precision training is handled by fully_shard
|
| 277 |
+
[titan] 2026-01-02 12:24:11,565 - root - INFO - [31m***** Running training *****[39m
|
| 278 |
+
[titan] 2026-01-02 12:24:11,565 - root - INFO - [32m Training starts at step 1
|
| 279 |
+
[titan] 2026-01-02 12:24:11,565 - root - INFO - [32m Number of tokens per sequence = 2,048
|
| 280 |
+
[titan] 2026-01-02 12:24:11,565 - root - INFO - [32m Gradient Accumulation steps = 16
|
| 281 |
+
[titan] 2026-01-02 12:24:11,565 - root - INFO - [32m Instantaneous batch size (per device) = 2
|
| 282 |
+
[titan] 2026-01-02 12:24:11,565 - root - INFO - [32m Global batch size (w. parallel, distributed & accumulation) = 256 (524,288 tokens)
|
| 283 |
+
[titan] 2026-01-02 12:24:11,565 - root - INFO - [32m Total optimization steps = 30,720 (16,106,127,360 tokens)
|
| 284 |
+
[titan] 2026-01-02 12:24:11,565 - root - INFO - [32m Warmup steps = 1,024 (536,870,912 tokens)
|
| 285 |
+
[titan] 2026-01-02 12:24:11,565 - root - INFO - [32m Number of parameters = 14,409,815,040 [39m
|
| 286 |
+
[titan] 2026-01-02 12:24:11,565 - root - INFO - Profiling active. Traces will be saved at /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/profile_trace
|
| 287 |
+
/mnt/scratch/home/carlos/leman-labs/flame/.venv/lib/python3.13/site-packages/torch/_dynamo/variables/functions.py:1692: UserWarning: Dynamo detected a call to a `functools.lru_cache`-wrapped function. Dynamo ignores the cache wrapper and directly traces the wrapped function. Silent incorrectness is only a *potential* risk, not something we have observed. Enable TORCH_LOGS="+dynamo" for a DEBUG stack trace.
|
| 288 |
+
torch._dynamo.utils.warn_once(msg)
|
| 289 |
+
/mnt/scratch/home/carlos/leman-labs/flame/.venv/lib/python3.13/site-packages/torch/_dynamo/variables/functions.py:1598: UserWarning: Dynamo does not know how to trace the builtin `cuda_utils.get_device_properties.` This function is either a Python builtin (e.g. _warnings.warn) or a third-party C/C++ Python extension (perhaps created with pybind).
|
| 290 |
+
If it is a Python builtin, please file an issue on GitHub so the PyTorch team can add support for it and see the next case for a workaround.
|
| 291 |
+
If it is a third-party C/C++ Python extension, please either wrap it into a PyTorch-understood custom operator (see https://pytorch.org/tutorials/advanced/custom_ops_landing_page.html for more details) or, if it is traceable, use `torch.compiler.allow_in_graph`.
|
| 292 |
+
torch._dynamo.utils.warn_once(explanation + "\n" + "\n".join(hints))
|
| 293 |
+
[titan] 2026-01-02 12:31:16,003 - root - INFO - [31mstep: 1 [32mloss: 14.3857 [33mmemory: 65.22GiB(82.29%) [34mtps: 154 [36mtflops: 14.12 [35mmfu: 4.53%[39m
|
| 294 |
+
[titan] 2026-01-02 12:31:16,003 - root - INFO - [34mlr: 7.8125e-07 gnorm: 129.00 [35m[ 0:07:04<150 days, 23:34:11][39m
|
| 295 |
+
[titan] 2026-01-02 12:31:16,003 - root - INFO - Saving the checkpoint (or staging if async is enabled).
|
| 296 |
+
[titan] 2026-01-02 12:31:45,805 - root - INFO - [GC] GC collection invoked by checkpointer. 0.42 seconds.
|
| 297 |
+
[titan] 2026-01-02 12:31:45,806 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 29.80 seconds.
|
| 298 |
+
[titan] 2026-01-02 12:31:45,806 - root - INFO - Synchronizing and adjusting timeout for all ProcessGroups to 0:01:40
|
| 299 |
+
[titan] 2026-01-02 12:32:27,288 - root - INFO - [31mstep: 2 [32mloss: 14.3989 [33mmemory: 69.11GiB(87.20%) [34mtps: 919 [36mtflops: 84.11 [35mmfu: 26.96%[39m
|
| 300 |
+
[titan] 2026-01-02 12:32:27,288 - root - INFO - [34mlr: 1.1719e-06 gnorm: 127.00 [35m[ 0:08:15<88 days, 3:51:11][39m
|
| 301 |
+
[titan] 2026-01-02 12:33:08,758 - root - INFO - [31mstep: 3 [32mloss: 14.3929 [33mmemory: 69.11GiB(87.20%) [34mtps: 1,580 [36mtflops: 144.59 [35mmfu: 46.34%[39m
|
| 302 |
+
[titan] 2026-01-02 12:33:08,759 - root - INFO - [34mlr: 1.5625e-06 gnorm: 126.00 [35m[ 0:08:57<63 days, 16:28:17][39m
|
| 303 |
+
[titan] 2026-01-02 12:33:50,300 - root - INFO - [31mstep: 4 [32mloss: 14.2932 [33mmemory: 69.11GiB(87.20%) [34mtps: 1,578 [36mtflops: 144.34 [35mmfu: 46.26%[39m
|
| 304 |
+
[titan] 2026-01-02 12:33:50,300 - root - INFO - [34mlr: 1.9531e-06 gnorm: 128.00 [35m[ 0:09:38<51 days, 10:55:33][39m
|
| 305 |
+
[titan] 2026-01-02 12:34:31,898 - root - INFO - [31mstep: 5 [32mloss: 14.2689 [33mmemory: 69.11GiB(87.20%) [34mtps: 1,575 [36mtflops: 144.14 [35mmfu: 46.20%[39m
|
| 306 |
+
[titan] 2026-01-02 12:34:31,898 - root - INFO - [34mlr: 2.3438e-06 gnorm: 124.00 [35m[ 0:10:20<44 days, 2:53:23][39m
|
| 307 |
+
[titan] 2026-01-02 12:35:13,509 - root - INFO - [31mstep: 6 [32mloss: 13.9979 [33mmemory: 69.11GiB(87.20%) [34mtps: 1,575 [36mtflops: 144.10 [35mmfu: 46.19%[39m
|
| 308 |
+
[titan] 2026-01-02 12:35:13,509 - root - INFO - [34mlr: 2.7344e-06 gnorm: 117.00 [35m[ 0:11:02<39 days, 5:32:52][39m
|
| 309 |
+
[titan] 2026-01-02 12:35:55,155 - root - INFO - [31mstep: 7 [32mloss: 13.8167 [33mmemory: 69.11GiB(87.20%) [34mtps: 1,574 [36mtflops: 143.98 [35mmfu: 46.15%[39m
|
| 310 |
+
[titan] 2026-01-02 12:35:55,155 - root - INFO - [34mlr: 3.1250e-06 gnorm: 113.00 [35m[ 0:11:43<35 days, 17:46:15][39m
|
| 311 |
+
[titan] 2026-01-02 12:36:36,792 - root - INFO - [31mstep: 8 [32mloss: 13.5683 [33mmemory: 69.11GiB(87.20%) [34mtps: 1,574 [36mtflops: 144.01 [35mmfu: 46.16%[39m
|
| 312 |
+
[titan] 2026-01-02 12:36:36,792 - root - INFO - [34mlr: 3.5156e-06 gnorm: 106.50 [35m[ 0:12:25<33 days, 2:55:33][39m
|
| 313 |
+
[titan] 2026-01-02 12:37:18,455 - root - INFO - [31mstep: 9 [32mloss: 13.3760 [33mmemory: 69.11GiB(87.20%) [34mtps: 1,573 [36mtflops: 143.92 [35mmfu: 46.13%[39m
|
| 314 |
+
[titan] 2026-01-02 12:37:18,456 - root - INFO - [34mlr: 3.9063e-06 gnorm: 101.00 [35m[ 0:13:07<31 days, 2:04:08][39m
|
| 315 |
+
[titan] 2026-01-02 12:38:00,109 - root - INFO - [31mstep: 10 [32mloss: 13.1097 [33mmemory: 69.11GiB(87.20%) [34mtps: 1,573 [36mtflops: 143.95 [35mmfu: 46.14%[39m
|
| 316 |
+
[titan] 2026-01-02 12:38:00,110 - root - INFO - [34mlr: 4.2969e-06 gnorm: 94.50 [35m[ 0:13:48<29 days, 10:58:23][39m
|
| 317 |
+
[titan] 2026-01-02 12:38:41,790 - root - INFO - [31mstep: 11 [32mloss: 12.5536 [33mmemory: 69.11GiB(87.20%) [34mtps: 1,572 [36mtflops: 143.86 [35mmfu: 46.11%[39m
|
| 318 |
+
[titan] 2026-01-02 12:38:41,790 - root - INFO - [34mlr: 4.6875e-06 gnorm: 82.00 [35m[ 0:14:30<28 days, 3:00:13][39m
|
| 319 |
+
[titan] 2026-01-02 12:39:23,448 - root - INFO - [31mstep: 12 [32mloss: 12.0247 [33mmemory: 69.11GiB(87.20%) [34mtps: 1,573 [36mtflops: 143.93 [35mmfu: 46.13%[39m
|
| 320 |
+
[titan] 2026-01-02 12:39:23,449 - root - INFO - [34mlr: 5.0781e-06 gnorm: 71.50 [35m[ 0:15:12<27 days, 0:20:43][39m
|
| 321 |
+
[titan] 2026-01-02 12:40:05,132 - root - INFO - [31mstep: 13 [32mloss: 11.6076 [33mmemory: 69.11GiB(87.20%) [34mtps: 1,572 [36mtflops: 143.85 [35mmfu: 46.10%[39m
|
| 322 |
+
[titan] 2026-01-02 12:40:05,132 - root - INFO - [34mlr: 5.4687e-06 gnorm: 68.50 [35m[ 0:15:53<26 days, 1:48:10][39m
|
| 323 |
+
[titan] 2026-01-02 12:40:46,797 - root - INFO - [31mstep: 14 [32mloss: 11.2488 [33mmemory: 69.11GiB(87.20%) [34mtps: 1,573 [36mtflops: 143.91 [35mmfu: 46.13%[39m
|
| 324 |
+
[titan] 2026-01-02 12:40:46,797 - root - INFO - [34mlr: 5.8594e-06 gnorm: 63.75 [35m[ 0:16:35<25 days, 6:28:03][39m
|
| 325 |
+
[titan] 2026-01-02 12:41:28,477 - root - INFO - [31mstep: 15 [32mloss: 10.9254 [33mmemory: 69.11GiB(87.20%) [34mtps: 1,572 [36mtflops: 143.86 [35mmfu: 46.11%[39m
|
| 326 |
+
[titan] 2026-01-02 12:41:28,477 - root - INFO - [34mlr: 6.2500e-06 gnorm: 55.50 [35m[ 0:17:17<24 days, 13:43:02][39m
|
| 327 |
+
[titan] 2026-01-02 12:42:10,134 - root - INFO - [31mstep: 16 [32mloss: 10.6961 [33mmemory: 69.11GiB(87.20%) [34mtps: 1,573 [36mtflops: 143.94 [35mmfu: 46.13%[39m
|
| 328 |
+
[titan] 2026-01-02 12:42:10,134 - root - INFO - [34mlr: 6.6406e-06 gnorm: 56.50 [35m[ 0:17:58<23 days, 23:02:50][39m
|
| 329 |
+
[titan] 2026-01-02 12:42:51,815 - root - INFO - [31mstep: 17 [32mloss: 10.3915 [33mmemory: 69.11GiB(87.20%) [34mtps: 1,572 [36mtflops: 143.86 [35mmfu: 46.11%[39m
|
| 330 |
+
[titan] 2026-01-02 12:42:51,815 - root - INFO - [34mlr: 7.0313e-06 gnorm: 42.75 [35m[ 0:18:40<23 days, 10:06:50][39m
|
| 331 |
+
[titan] 2026-01-02 12:43:33,461 - root - INFO - [31mstep: 18 [32mloss: 10.1740 [33mmemory: 69.11GiB(87.20%) [34mtps: 1,574 [36mtflops: 143.98 [35mmfu: 46.15%[39m
|
| 332 |
+
[titan] 2026-01-02 12:43:33,461 - root - INFO - [34mlr: 7.4219e-06 gnorm: 32.75 [35m[ 0:19:22<22 days, 22:35:58][39m
|
logs/none_rci5peh0/attempt_0/3/stdout.log
ADDED
|
File without changes
|
logs/none_rci5peh0/attempt_0/4/stderr.log
ADDED
|
@@ -0,0 +1,332 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[titan] 2026-01-02 12:21:12,073 - root - INFO - Starting job: default job
|
| 2 |
+
[titan] 2026-01-02 12:21:12,073 - root - INFO - [32m{
|
| 3 |
+
"activation_checkpoint": {
|
| 4 |
+
"mode": "none",
|
| 5 |
+
"selective_ac_option": "2"
|
| 6 |
+
},
|
| 7 |
+
"activation_offload": {
|
| 8 |
+
"mode": "none"
|
| 9 |
+
},
|
| 10 |
+
"checkpoint": {
|
| 11 |
+
"async_mode": "disabled",
|
| 12 |
+
"create_seed_checkpoint": false,
|
| 13 |
+
"enable_checkpoint": true,
|
| 14 |
+
"exclude_from_loading": [],
|
| 15 |
+
"export_dtype": "float32",
|
| 16 |
+
"folder": "checkpoint",
|
| 17 |
+
"initial_load_model_weights_only": true,
|
| 18 |
+
"initial_load_path": null,
|
| 19 |
+
"interval": 3072,
|
| 20 |
+
"interval_type": "steps",
|
| 21 |
+
"keep_latest_k": 0,
|
| 22 |
+
"last_save_model_weights_only": false,
|
| 23 |
+
"load_step": -1,
|
| 24 |
+
"model_weights_only": false
|
| 25 |
+
},
|
| 26 |
+
"comm": {
|
| 27 |
+
"init_timeout_seconds": 300,
|
| 28 |
+
"trace_buf_size": 20000,
|
| 29 |
+
"train_timeout_seconds": 100
|
| 30 |
+
},
|
| 31 |
+
"experimental": {
|
| 32 |
+
"context_parallel_degree": 1,
|
| 33 |
+
"context_parallel_rotate_method": "allgather",
|
| 34 |
+
"custom_model_path": "",
|
| 35 |
+
"enable_async_tensor_parallel": false,
|
| 36 |
+
"enable_compiled_autograd": false,
|
| 37 |
+
"pipeline_parallel_degree": 1,
|
| 38 |
+
"pipeline_parallel_microbatches": null,
|
| 39 |
+
"pipeline_parallel_schedule": "1F1B",
|
| 40 |
+
"pipeline_parallel_schedule_csv": "",
|
| 41 |
+
"pipeline_parallel_split_points": []
|
| 42 |
+
},
|
| 43 |
+
"fault_tolerance": {
|
| 44 |
+
"enable": false,
|
| 45 |
+
"group_size": 0,
|
| 46 |
+
"min_replica_size": 1,
|
| 47 |
+
"replica_id": 0
|
| 48 |
+
},
|
| 49 |
+
"float8": {
|
| 50 |
+
"enable_fsdp_float8_all_gather": false,
|
| 51 |
+
"force_recompute_fp8_weight_in_bwd": false,
|
| 52 |
+
"precompute_float8_dynamic_scale_for_fsdp": false,
|
| 53 |
+
"recipe_name": null
|
| 54 |
+
},
|
| 55 |
+
"job": {
|
| 56 |
+
"config_file": "flame/models/fla.toml",
|
| 57 |
+
"description": "default job",
|
| 58 |
+
"dump_folder": "/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B",
|
| 59 |
+
"print_args": true,
|
| 60 |
+
"use_for_integration_test": false
|
| 61 |
+
},
|
| 62 |
+
"lr_scheduler": {
|
| 63 |
+
"decay_ratio": null,
|
| 64 |
+
"decay_type": "cosine",
|
| 65 |
+
"lr_min": 0.1,
|
| 66 |
+
"warmup_steps": 1024
|
| 67 |
+
},
|
| 68 |
+
"memory_estimation": {
|
| 69 |
+
"disable_fake_mode": false,
|
| 70 |
+
"enabled": false
|
| 71 |
+
},
|
| 72 |
+
"metrics": {
|
| 73 |
+
"disable_color_printing": false,
|
| 74 |
+
"enable_tensorboard": false,
|
| 75 |
+
"enable_wandb": true,
|
| 76 |
+
"log_freq": 1,
|
| 77 |
+
"save_for_all_ranks": false,
|
| 78 |
+
"save_tb_folder": "tb"
|
| 79 |
+
},
|
| 80 |
+
"model": {
|
| 81 |
+
"config": "/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/config.json",
|
| 82 |
+
"converters": [],
|
| 83 |
+
"name": "fla",
|
| 84 |
+
"print_after_conversion": false,
|
| 85 |
+
"tokenizer_path": "/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B"
|
| 86 |
+
},
|
| 87 |
+
"optimizer": {
|
| 88 |
+
"beta1": 0.9,
|
| 89 |
+
"beta2": 0.95,
|
| 90 |
+
"early_step_in_backward": false,
|
| 91 |
+
"eps": 1e-15,
|
| 92 |
+
"implementation": "fused",
|
| 93 |
+
"lr": 0.0004,
|
| 94 |
+
"name": "AdamW",
|
| 95 |
+
"weight_decay": 0.1
|
| 96 |
+
},
|
| 97 |
+
"profiling": {
|
| 98 |
+
"enable_memory_snapshot": false,
|
| 99 |
+
"enable_profiling": true,
|
| 100 |
+
"profile_freq": 512,
|
| 101 |
+
"save_memory_snapshot_folder": "memory_snapshot",
|
| 102 |
+
"save_traces_folder": "profile_trace"
|
| 103 |
+
},
|
| 104 |
+
"training": {
|
| 105 |
+
"batch_size": 2,
|
| 106 |
+
"compile": true,
|
| 107 |
+
"context_len": 2048,
|
| 108 |
+
"data_dir": null,
|
| 109 |
+
"data_files": null,
|
| 110 |
+
"data_parallel_replicate_degree": 1,
|
| 111 |
+
"data_parallel_shard_degree": 8,
|
| 112 |
+
"data_probs": null,
|
| 113 |
+
"dataset": "/mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu",
|
| 114 |
+
"dataset_name": "default",
|
| 115 |
+
"dataset_split": "train",
|
| 116 |
+
"deterministic": false,
|
| 117 |
+
"disable_loss_parallel": true,
|
| 118 |
+
"enable_cpu_offload": false,
|
| 119 |
+
"fsdp_reshard_after_forward": "default",
|
| 120 |
+
"gc_freq": 50,
|
| 121 |
+
"gradient_accumulation_steps": 16,
|
| 122 |
+
"max_norm": 1.0,
|
| 123 |
+
"mixed_precision_param": "bfloat16",
|
| 124 |
+
"mixed_precision_reduce": "float32",
|
| 125 |
+
"num_workers": 8,
|
| 126 |
+
"persistent_workers": false,
|
| 127 |
+
"pin_memory": false,
|
| 128 |
+
"prefetch_factor": 2,
|
| 129 |
+
"seed": 42,
|
| 130 |
+
"seq_len": 2048,
|
| 131 |
+
"skip_nan_inf": true,
|
| 132 |
+
"steps": 30720,
|
| 133 |
+
"streaming": true,
|
| 134 |
+
"tensor_parallel_degree": 1,
|
| 135 |
+
"varlen": false
|
| 136 |
+
}
|
| 137 |
+
}[39m
|
| 138 |
+
[titan] 2026-01-02 12:21:12,074 - root - INFO - [GC] Initial GC collection. 0.00 seconds.
|
| 139 |
+
[titan] 2026-01-02 12:21:13,345 - root - WARNING - ENV[TORCH_NCCL_ASYNC_ERROR_HANDLING] = 1 will be overridden to 3 based on job config
|
| 140 |
+
[titan] 2026-01-02 12:21:13,353 - root - INFO - CUDA capacity: NVIDIA A100-SXM4-80GB with 79.25GiB memory
|
| 141 |
+
[titan] 2026-01-02 12:21:13,355 - root - WARNING - Error running lspci: [Errno 2] No such file or directory: 'lspci', fallback to use device_name
|
| 142 |
+
[titan] 2026-01-02 12:21:13,355 - root - INFO - Peak FLOPS used for computing MFU: 3.120e+14
|
| 143 |
+
[titan] 2026-01-02 12:21:13,355 - root - INFO - Building 1-D device mesh with ['dp_shard'], [8]
|
| 144 |
+
[titan] 2026-01-02 12:21:13,419 - root - INFO - Loading tokenizer...
|
| 145 |
+
The tokenizer you are loading from '/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.
|
| 146 |
+
[titan] 2026-01-02 12:21:13,768 - root - INFO - Qwen2TokenizerFast(name_or_path='/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B', vocab_size=151643, model_max_length=10000000000, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'eos_token': '<|im_end|>', 'pad_token': '<|endoftext|>', 'additional_special_tokens': ['<|im_start|>', '<|im_end|>', '<|object_ref_start|>', '<|object_ref_end|>', '<|box_start|>', '<|box_end|>', '<|quad_start|>', '<|quad_end|>', '<|vision_start|>', '<|vision_end|>', '<|vision_pad|>', '<|image_pad|>', '<|video_pad|>']}, clean_up_tokenization_spaces=False, added_tokens_decoder={
|
| 147 |
+
151643: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 148 |
+
151644: AddedToken("<|im_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 149 |
+
151645: AddedToken("<|im_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 150 |
+
151646: AddedToken("<|object_ref_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 151 |
+
151647: AddedToken("<|object_ref_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 152 |
+
151648: AddedToken("<|box_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 153 |
+
151649: AddedToken("<|box_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 154 |
+
151650: AddedToken("<|quad_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 155 |
+
151651: AddedToken("<|quad_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 156 |
+
151652: AddedToken("<|vision_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 157 |
+
151653: AddedToken("<|vision_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 158 |
+
151654: AddedToken("<|vision_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 159 |
+
151655: AddedToken("<|image_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 160 |
+
151656: AddedToken("<|video_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 161 |
+
151657: AddedToken("<tool_call>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 162 |
+
151658: AddedToken("</tool_call>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 163 |
+
151659: AddedToken("<|fim_prefix|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 164 |
+
151660: AddedToken("<|fim_middle|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 165 |
+
151661: AddedToken("<|fim_suffix|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 166 |
+
151662: AddedToken("<|fim_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 167 |
+
151663: AddedToken("<|repo_name|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 168 |
+
151664: AddedToken("<|file_sep|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 169 |
+
151665: AddedToken("<tool_response>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 170 |
+
151666: AddedToken("</tool_response>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 171 |
+
151667: AddedToken("<think>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 172 |
+
151668: AddedToken("</think>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 173 |
+
}
|
| 174 |
+
)
|
| 175 |
+
[titan] 2026-01-02 12:21:13,768 - root - INFO - Loading dataset /mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu:default
|
| 176 |
+
`trust_remote_code` is not supported anymore.
|
| 177 |
+
Please check that the Hugging Face dataset '/mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu' isn't based on a loading script and remove `trust_remote_code`.
|
| 178 |
+
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
|
| 179 |
+
[titan] 2026-01-02 12:21:13,768 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
|
| 180 |
+
Please check that the Hugging Face dataset '/mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu' isn't based on a loading script and remove `trust_remote_code`.
|
| 181 |
+
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
|
| 182 |
+
[titan] 2026-01-02 12:21:14,322 - root - INFO - Shuffling the dataset with seed 42
|
| 183 |
+
[titan] 2026-01-02 12:21:14,324 - root - INFO - IterableDataset({
|
| 184 |
+
features: ['text', 'id', 'dump', 'url', 'file_path', 'language', 'language_score', 'token_count', 'score', 'int_score'],
|
| 185 |
+
num_shards: 360
|
| 186 |
+
})
|
| 187 |
+
[titan] 2026-01-02 12:21:14,324 - root - INFO - Building dataloader...
|
| 188 |
+
[titan] 2026-01-02 12:21:14,325 - root - INFO - Loading model config from /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/config.json
|
| 189 |
+
[titan] 2026-01-02 12:21:14,327 - root - INFO - Building model from the config
|
| 190 |
+
[32mGSAConfig {
|
| 191 |
+
"architectures": [
|
| 192 |
+
"GSAForCausalLM"
|
| 193 |
+
],
|
| 194 |
+
"attn": null,
|
| 195 |
+
"bos_token_id": 151643,
|
| 196 |
+
"clamp_max": null,
|
| 197 |
+
"clamp_min": null,
|
| 198 |
+
"conv_size": 4,
|
| 199 |
+
"dtype": "bfloat16",
|
| 200 |
+
"elementwise_affine": false,
|
| 201 |
+
"eos_token_id": 151645,
|
| 202 |
+
"expand_k": 1,
|
| 203 |
+
"expand_v": 1,
|
| 204 |
+
"feature_map": "swish",
|
| 205 |
+
"fuse_cross_entropy": true,
|
| 206 |
+
"fuse_linear_cross_entropy": false,
|
| 207 |
+
"fuse_norm": true,
|
| 208 |
+
"fuse_swiglu": true,
|
| 209 |
+
"gate_logit_normalizer": 8,
|
| 210 |
+
"hidden_act": "swish",
|
| 211 |
+
"hidden_ratio": 4,
|
| 212 |
+
"hidden_size": 5120,
|
| 213 |
+
"initializer_range": 0.02,
|
| 214 |
+
"intermediate_size": 17408,
|
| 215 |
+
"max_position_embeddings": 40960,
|
| 216 |
+
"model_type": "gsa",
|
| 217 |
+
"norm_eps": 1e-06,
|
| 218 |
+
"num_heads": 40,
|
| 219 |
+
"num_hidden_layers": 40,
|
| 220 |
+
"num_kv_heads": 8,
|
| 221 |
+
"num_slots": 256,
|
| 222 |
+
"rope_theta": 1000000,
|
| 223 |
+
"share_conv_kernel": true,
|
| 224 |
+
"tie_word_embeddings": true,
|
| 225 |
+
"transformers_version": "4.57.3",
|
| 226 |
+
"use_cache": true,
|
| 227 |
+
"use_l2warp": false,
|
| 228 |
+
"use_norm": true,
|
| 229 |
+
"use_output_gate": true,
|
| 230 |
+
"use_rope": false,
|
| 231 |
+
"use_short_conv": false,
|
| 232 |
+
"vocab_size": 151936
|
| 233 |
+
}
|
| 234 |
+
[39m
|
| 235 |
+
[titan] 2026-01-02 12:21:14,478 - root - INFO - [34m
|
| 236 |
+
GSAForCausalLM(
|
| 237 |
+
(model): GSAModel(
|
| 238 |
+
(embeddings): Embedding(151936, 5120)
|
| 239 |
+
(layers): ModuleList(
|
| 240 |
+
(0-39): 40 x GSABlock(
|
| 241 |
+
(attn_norm): RMSNorm(5120, eps=1e-06)
|
| 242 |
+
(attn): GatedSlotAttention(
|
| 243 |
+
(feature_map): SwishFeatureMap()
|
| 244 |
+
(q_proj): Linear(in_features=5120, out_features=5120, bias=False)
|
| 245 |
+
(k_proj): Linear(in_features=5120, out_features=1024, bias=False)
|
| 246 |
+
(v_proj): Linear(in_features=5120, out_features=1024, bias=False)
|
| 247 |
+
(f_proj): Linear(in_features=5120, out_features=2048, bias=False)
|
| 248 |
+
(g_norm): RMSNorm(5120, elementwise_affine=False, eps=1e-06)
|
| 249 |
+
(o_proj): Linear(in_features=5120, out_features=5120, bias=False)
|
| 250 |
+
)
|
| 251 |
+
(mlp_norm): RMSNorm(5120, eps=1e-06)
|
| 252 |
+
(mlp): GatedMLP(
|
| 253 |
+
(gate_proj): Linear(in_features=5120, out_features=17408, bias=False)
|
| 254 |
+
(up_proj): Linear(in_features=5120, out_features=17408, bias=False)
|
| 255 |
+
(down_proj): Linear(in_features=17408, out_features=5120, bias=False)
|
| 256 |
+
(swiglu_linear): SwiGLULinear()
|
| 257 |
+
)
|
| 258 |
+
)
|
| 259 |
+
)
|
| 260 |
+
(norm): RMSNorm(5120, eps=1e-06)
|
| 261 |
+
)
|
| 262 |
+
(lm_head): Linear(in_features=5120, out_features=151936, bias=False)
|
| 263 |
+
)[39m
|
| 264 |
+
|
| 265 |
+
[titan] 2026-01-02 12:21:14,535 - root - INFO - Compiling each block with torch.compile
|
| 266 |
+
[titan] 2026-01-02 12:21:14,535 - root - INFO - Compiling the embedding, norm, and lm_head layers with torch.compile
|
| 267 |
+
[titan] 2026-01-02 12:21:14,536 - root - INFO - Compiling the entire model with torch.compile
|
| 268 |
+
[titan] 2026-01-02 12:21:14,677 - root - INFO - Applied FSDP to the model
|
| 269 |
+
[titan] 2026-01-02 12:21:15,135 - root - INFO - CUDA memory usage for model: 3.56GiB(4.49%)
|
| 270 |
+
[titan] 2026-01-02 12:21:15,182 - root - INFO - Checkpointing active. Checkpoints will be loaded from and saved to /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/checkpoint
|
| 271 |
+
[titan] 2026-01-02 12:21:15,182 - root - INFO - Loading the checkpoint from /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/checkpoint/step-0.
|
| 272 |
+
[titan] 2026-01-02 12:24:11,331 - root - INFO - [GC] GC collection for checkpoint loading. 0.01 seconds.
|
| 273 |
+
[titan] 2026-01-02 12:24:11,331 - root - INFO - Finished loading the checkpoint in 176.15 seconds.
|
| 274 |
+
[titan] 2026-01-02 12:24:11,349 - root - INFO - CUDA capacity: NVIDIA A100-SXM4-80GB with 79.25GiB memory
|
| 275 |
+
[titan] 2026-01-02 12:24:11,351 - root - WARNING - Error running lspci: [Errno 2] No such file or directory: 'lspci', fallback to use device_name
|
| 276 |
+
[titan] 2026-01-02 12:24:11,560 - root - INFO - Mixed precision training is handled by fully_shard
|
| 277 |
+
[titan] 2026-01-02 12:24:11,561 - root - INFO - [31m***** Running training *****[39m
|
| 278 |
+
[titan] 2026-01-02 12:24:11,561 - root - INFO - [32m Training starts at step 1
|
| 279 |
+
[titan] 2026-01-02 12:24:11,561 - root - INFO - [32m Number of tokens per sequence = 2,048
|
| 280 |
+
[titan] 2026-01-02 12:24:11,561 - root - INFO - [32m Gradient Accumulation steps = 16
|
| 281 |
+
[titan] 2026-01-02 12:24:11,561 - root - INFO - [32m Instantaneous batch size (per device) = 2
|
| 282 |
+
[titan] 2026-01-02 12:24:11,561 - root - INFO - [32m Global batch size (w. parallel, distributed & accumulation) = 256 (524,288 tokens)
|
| 283 |
+
[titan] 2026-01-02 12:24:11,561 - root - INFO - [32m Total optimization steps = 30,720 (16,106,127,360 tokens)
|
| 284 |
+
[titan] 2026-01-02 12:24:11,561 - root - INFO - [32m Warmup steps = 1,024 (536,870,912 tokens)
|
| 285 |
+
[titan] 2026-01-02 12:24:11,561 - root - INFO - [32m Number of parameters = 14,409,815,040 [39m
|
| 286 |
+
[titan] 2026-01-02 12:24:11,562 - root - INFO - Profiling active. Traces will be saved at /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/profile_trace
|
| 287 |
+
/mnt/scratch/home/carlos/leman-labs/flame/.venv/lib/python3.13/site-packages/torch/_dynamo/variables/functions.py:1692: UserWarning: Dynamo detected a call to a `functools.lru_cache`-wrapped function. Dynamo ignores the cache wrapper and directly traces the wrapped function. Silent incorrectness is only a *potential* risk, not something we have observed. Enable TORCH_LOGS="+dynamo" for a DEBUG stack trace.
|
| 288 |
+
torch._dynamo.utils.warn_once(msg)
|
| 289 |
+
/mnt/scratch/home/carlos/leman-labs/flame/.venv/lib/python3.13/site-packages/torch/_dynamo/variables/functions.py:1598: UserWarning: Dynamo does not know how to trace the builtin `cuda_utils.get_device_properties.` This function is either a Python builtin (e.g. _warnings.warn) or a third-party C/C++ Python extension (perhaps created with pybind).
|
| 290 |
+
If it is a Python builtin, please file an issue on GitHub so the PyTorch team can add support for it and see the next case for a workaround.
|
| 291 |
+
If it is a third-party C/C++ Python extension, please either wrap it into a PyTorch-understood custom operator (see https://pytorch.org/tutorials/advanced/custom_ops_landing_page.html for more details) or, if it is traceable, use `torch.compiler.allow_in_graph`.
|
| 292 |
+
torch._dynamo.utils.warn_once(explanation + "\n" + "\n".join(hints))
|
| 293 |
+
[titan] 2026-01-02 12:31:16,003 - root - INFO - [31mstep: 1 [32mloss: 14.3857 [33mmemory: 65.22GiB(82.29%) [34mtps: 154 [36mtflops: 14.12 [35mmfu: 4.53%[39m
|
| 294 |
+
[titan] 2026-01-02 12:31:16,003 - root - INFO - [34mlr: 7.8125e-07 gnorm: 129.00 [35m[ 0:07:04<150 days, 23:34:08][39m
|
| 295 |
+
[titan] 2026-01-02 12:31:16,003 - root - INFO - Saving the checkpoint (or staging if async is enabled).
|
| 296 |
+
[titan] 2026-01-02 12:31:45,811 - root - INFO - [GC] GC collection invoked by checkpointer. 0.43 seconds.
|
| 297 |
+
[titan] 2026-01-02 12:31:45,811 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 29.81 seconds.
|
| 298 |
+
[titan] 2026-01-02 12:31:45,811 - root - INFO - Synchronizing and adjusting timeout for all ProcessGroups to 0:01:40
|
| 299 |
+
[titan] 2026-01-02 12:32:27,288 - root - INFO - [31mstep: 2 [32mloss: 14.3989 [33mmemory: 69.11GiB(87.20%) [34mtps: 919 [36mtflops: 84.11 [35mmfu: 26.96%[39m
|
| 300 |
+
[titan] 2026-01-02 12:32:27,288 - root - INFO - [34mlr: 1.1719e-06 gnorm: 127.00 [35m[ 0:08:15<88 days, 3:51:08][39m
|
| 301 |
+
[titan] 2026-01-02 12:33:08,758 - root - INFO - [31mstep: 3 [32mloss: 14.3929 [33mmemory: 69.11GiB(87.20%) [34mtps: 1,580 [36mtflops: 144.59 [35mmfu: 46.34%[39m
|
| 302 |
+
[titan] 2026-01-02 12:33:08,759 - root - INFO - [34mlr: 1.5625e-06 gnorm: 126.00 [35m[ 0:08:57<63 days, 16:28:15][39m
|
| 303 |
+
[titan] 2026-01-02 12:33:50,300 - root - INFO - [31mstep: 4 [32mloss: 14.2932 [33mmemory: 69.11GiB(87.20%) [34mtps: 1,578 [36mtflops: 144.34 [35mmfu: 46.26%[39m
|
| 304 |
+
[titan] 2026-01-02 12:33:50,300 - root - INFO - [34mlr: 1.9531e-06 gnorm: 128.00 [35m[ 0:09:38<51 days, 10:55:32][39m
|
| 305 |
+
[titan] 2026-01-02 12:34:31,898 - root - INFO - [31mstep: 5 [32mloss: 14.2689 [33mmemory: 69.11GiB(87.20%) [34mtps: 1,575 [36mtflops: 144.14 [35mmfu: 46.20%[39m
|
| 306 |
+
[titan] 2026-01-02 12:34:31,898 - root - INFO - [34mlr: 2.3438e-06 gnorm: 124.00 [35m[ 0:10:20<44 days, 2:53:21][39m
|
| 307 |
+
[titan] 2026-01-02 12:35:13,509 - root - INFO - [31mstep: 6 [32mloss: 13.9979 [33mmemory: 69.11GiB(87.20%) [34mtps: 1,575 [36mtflops: 144.10 [35mmfu: 46.19%[39m
|
| 308 |
+
[titan] 2026-01-02 12:35:13,509 - root - INFO - [34mlr: 2.7344e-06 gnorm: 117.00 [35m[ 0:11:02<39 days, 5:32:50][39m
|
| 309 |
+
[titan] 2026-01-02 12:35:55,155 - root - INFO - [31mstep: 7 [32mloss: 13.8167 [33mmemory: 69.11GiB(87.20%) [34mtps: 1,574 [36mtflops: 143.98 [35mmfu: 46.15%[39m
|
| 310 |
+
[titan] 2026-01-02 12:35:55,155 - root - INFO - [34mlr: 3.1250e-06 gnorm: 113.00 [35m[ 0:11:43<35 days, 17:46:13][39m
|
| 311 |
+
[titan] 2026-01-02 12:36:36,792 - root - INFO - [31mstep: 8 [32mloss: 13.5683 [33mmemory: 69.11GiB(87.20%) [34mtps: 1,574 [36mtflops: 144.01 [35mmfu: 46.16%[39m
|
| 312 |
+
[titan] 2026-01-02 12:36:36,792 - root - INFO - [34mlr: 3.5156e-06 gnorm: 106.50 [35m[ 0:12:25<33 days, 2:55:31][39m
|
| 313 |
+
[titan] 2026-01-02 12:37:18,455 - root - INFO - [31mstep: 9 [32mloss: 13.3760 [33mmemory: 69.11GiB(87.20%) [34mtps: 1,573 [36mtflops: 143.92 [35mmfu: 46.13%[39m
|
| 314 |
+
[titan] 2026-01-02 12:37:18,456 - root - INFO - [34mlr: 3.9063e-06 gnorm: 101.00 [35m[ 0:13:07<31 days, 2:04:05][39m
|
| 315 |
+
[titan] 2026-01-02 12:38:00,109 - root - INFO - [31mstep: 10 [32mloss: 13.1097 [33mmemory: 69.11GiB(87.20%) [34mtps: 1,573 [36mtflops: 143.95 [35mmfu: 46.14%[39m
|
| 316 |
+
[titan] 2026-01-02 12:38:00,110 - root - INFO - [34mlr: 4.2969e-06 gnorm: 94.50 [35m[ 0:13:48<29 days, 10:58:21][39m
|
| 317 |
+
[titan] 2026-01-02 12:38:41,790 - root - INFO - [31mstep: 11 [32mloss: 12.5536 [33mmemory: 69.11GiB(87.20%) [34mtps: 1,572 [36mtflops: 143.86 [35mmfu: 46.11%[39m
|
| 318 |
+
[titan] 2026-01-02 12:38:41,790 - root - INFO - [34mlr: 4.6875e-06 gnorm: 82.00 [35m[ 0:14:30<28 days, 3:00:11][39m
|
| 319 |
+
[titan] 2026-01-02 12:39:23,448 - root - INFO - [31mstep: 12 [32mloss: 12.0247 [33mmemory: 69.11GiB(87.20%) [34mtps: 1,573 [36mtflops: 143.93 [35mmfu: 46.13%[39m
|
| 320 |
+
[titan] 2026-01-02 12:39:23,449 - root - INFO - [34mlr: 5.0781e-06 gnorm: 71.50 [35m[ 0:15:12<27 days, 0:20:40][39m
|
| 321 |
+
[titan] 2026-01-02 12:40:05,132 - root - INFO - [31mstep: 13 [32mloss: 11.6076 [33mmemory: 69.11GiB(87.20%) [34mtps: 1,572 [36mtflops: 143.85 [35mmfu: 46.11%[39m
|
| 322 |
+
[titan] 2026-01-02 12:40:05,132 - root - INFO - [34mlr: 5.4687e-06 gnorm: 68.50 [35m[ 0:15:53<26 days, 1:48:07][39m
|
| 323 |
+
[titan] 2026-01-02 12:40:46,797 - root - INFO - [31mstep: 14 [32mloss: 11.2488 [33mmemory: 69.11GiB(87.20%) [34mtps: 1,573 [36mtflops: 143.91 [35mmfu: 46.13%[39m
|
| 324 |
+
[titan] 2026-01-02 12:40:46,797 - root - INFO - [34mlr: 5.8594e-06 gnorm: 63.75 [35m[ 0:16:35<25 days, 6:28:00][39m
|
| 325 |
+
[titan] 2026-01-02 12:41:28,477 - root - INFO - [31mstep: 15 [32mloss: 10.9254 [33mmemory: 69.11GiB(87.20%) [34mtps: 1,572 [36mtflops: 143.86 [35mmfu: 46.11%[39m
|
| 326 |
+
[titan] 2026-01-02 12:41:28,477 - root - INFO - [34mlr: 6.2500e-06 gnorm: 55.50 [35m[ 0:17:17<24 days, 13:43:00][39m
|
| 327 |
+
[titan] 2026-01-02 12:42:10,134 - root - INFO - [31mstep: 16 [32mloss: 10.6961 [33mmemory: 69.11GiB(87.20%) [34mtps: 1,573 [36mtflops: 143.94 [35mmfu: 46.13%[39m
|
| 328 |
+
[titan] 2026-01-02 12:42:10,134 - root - INFO - [34mlr: 6.6406e-06 gnorm: 56.50 [35m[ 0:17:58<23 days, 23:02:48][39m
|
| 329 |
+
[titan] 2026-01-02 12:42:51,815 - root - INFO - [31mstep: 17 [32mloss: 10.3915 [33mmemory: 69.11GiB(87.20%) [34mtps: 1,572 [36mtflops: 143.86 [35mmfu: 46.11%[39m
|
| 330 |
+
[titan] 2026-01-02 12:42:51,816 - root - INFO - [34mlr: 7.0313e-06 gnorm: 42.75 [35m[ 0:18:40<23 days, 10:06:47][39m
|
| 331 |
+
[titan] 2026-01-02 12:43:33,461 - root - INFO - [31mstep: 18 [32mloss: 10.1740 [33mmemory: 69.11GiB(87.20%) [34mtps: 1,574 [36mtflops: 143.98 [35mmfu: 46.15%[39m
|
| 332 |
+
[titan] 2026-01-02 12:43:33,461 - root - INFO - [34mlr: 7.4219e-06 gnorm: 32.75 [35m[ 0:19:22<22 days, 22:35:55][39m
|
logs/none_rci5peh0/attempt_0/4/stdout.log
ADDED
|
File without changes
|
logs/none_rci5peh0/attempt_0/5/stderr.log
ADDED
|
@@ -0,0 +1,332 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[titan] 2026-01-02 12:21:12,074 - root - INFO - Starting job: default job
|
| 2 |
+
[titan] 2026-01-02 12:21:12,074 - root - INFO - [32m{
|
| 3 |
+
"activation_checkpoint": {
|
| 4 |
+
"mode": "none",
|
| 5 |
+
"selective_ac_option": "2"
|
| 6 |
+
},
|
| 7 |
+
"activation_offload": {
|
| 8 |
+
"mode": "none"
|
| 9 |
+
},
|
| 10 |
+
"checkpoint": {
|
| 11 |
+
"async_mode": "disabled",
|
| 12 |
+
"create_seed_checkpoint": false,
|
| 13 |
+
"enable_checkpoint": true,
|
| 14 |
+
"exclude_from_loading": [],
|
| 15 |
+
"export_dtype": "float32",
|
| 16 |
+
"folder": "checkpoint",
|
| 17 |
+
"initial_load_model_weights_only": true,
|
| 18 |
+
"initial_load_path": null,
|
| 19 |
+
"interval": 3072,
|
| 20 |
+
"interval_type": "steps",
|
| 21 |
+
"keep_latest_k": 0,
|
| 22 |
+
"last_save_model_weights_only": false,
|
| 23 |
+
"load_step": -1,
|
| 24 |
+
"model_weights_only": false
|
| 25 |
+
},
|
| 26 |
+
"comm": {
|
| 27 |
+
"init_timeout_seconds": 300,
|
| 28 |
+
"trace_buf_size": 20000,
|
| 29 |
+
"train_timeout_seconds": 100
|
| 30 |
+
},
|
| 31 |
+
"experimental": {
|
| 32 |
+
"context_parallel_degree": 1,
|
| 33 |
+
"context_parallel_rotate_method": "allgather",
|
| 34 |
+
"custom_model_path": "",
|
| 35 |
+
"enable_async_tensor_parallel": false,
|
| 36 |
+
"enable_compiled_autograd": false,
|
| 37 |
+
"pipeline_parallel_degree": 1,
|
| 38 |
+
"pipeline_parallel_microbatches": null,
|
| 39 |
+
"pipeline_parallel_schedule": "1F1B",
|
| 40 |
+
"pipeline_parallel_schedule_csv": "",
|
| 41 |
+
"pipeline_parallel_split_points": []
|
| 42 |
+
},
|
| 43 |
+
"fault_tolerance": {
|
| 44 |
+
"enable": false,
|
| 45 |
+
"group_size": 0,
|
| 46 |
+
"min_replica_size": 1,
|
| 47 |
+
"replica_id": 0
|
| 48 |
+
},
|
| 49 |
+
"float8": {
|
| 50 |
+
"enable_fsdp_float8_all_gather": false,
|
| 51 |
+
"force_recompute_fp8_weight_in_bwd": false,
|
| 52 |
+
"precompute_float8_dynamic_scale_for_fsdp": false,
|
| 53 |
+
"recipe_name": null
|
| 54 |
+
},
|
| 55 |
+
"job": {
|
| 56 |
+
"config_file": "flame/models/fla.toml",
|
| 57 |
+
"description": "default job",
|
| 58 |
+
"dump_folder": "/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B",
|
| 59 |
+
"print_args": true,
|
| 60 |
+
"use_for_integration_test": false
|
| 61 |
+
},
|
| 62 |
+
"lr_scheduler": {
|
| 63 |
+
"decay_ratio": null,
|
| 64 |
+
"decay_type": "cosine",
|
| 65 |
+
"lr_min": 0.1,
|
| 66 |
+
"warmup_steps": 1024
|
| 67 |
+
},
|
| 68 |
+
"memory_estimation": {
|
| 69 |
+
"disable_fake_mode": false,
|
| 70 |
+
"enabled": false
|
| 71 |
+
},
|
| 72 |
+
"metrics": {
|
| 73 |
+
"disable_color_printing": false,
|
| 74 |
+
"enable_tensorboard": false,
|
| 75 |
+
"enable_wandb": true,
|
| 76 |
+
"log_freq": 1,
|
| 77 |
+
"save_for_all_ranks": false,
|
| 78 |
+
"save_tb_folder": "tb"
|
| 79 |
+
},
|
| 80 |
+
"model": {
|
| 81 |
+
"config": "/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/config.json",
|
| 82 |
+
"converters": [],
|
| 83 |
+
"name": "fla",
|
| 84 |
+
"print_after_conversion": false,
|
| 85 |
+
"tokenizer_path": "/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B"
|
| 86 |
+
},
|
| 87 |
+
"optimizer": {
|
| 88 |
+
"beta1": 0.9,
|
| 89 |
+
"beta2": 0.95,
|
| 90 |
+
"early_step_in_backward": false,
|
| 91 |
+
"eps": 1e-15,
|
| 92 |
+
"implementation": "fused",
|
| 93 |
+
"lr": 0.0004,
|
| 94 |
+
"name": "AdamW",
|
| 95 |
+
"weight_decay": 0.1
|
| 96 |
+
},
|
| 97 |
+
"profiling": {
|
| 98 |
+
"enable_memory_snapshot": false,
|
| 99 |
+
"enable_profiling": true,
|
| 100 |
+
"profile_freq": 512,
|
| 101 |
+
"save_memory_snapshot_folder": "memory_snapshot",
|
| 102 |
+
"save_traces_folder": "profile_trace"
|
| 103 |
+
},
|
| 104 |
+
"training": {
|
| 105 |
+
"batch_size": 2,
|
| 106 |
+
"compile": true,
|
| 107 |
+
"context_len": 2048,
|
| 108 |
+
"data_dir": null,
|
| 109 |
+
"data_files": null,
|
| 110 |
+
"data_parallel_replicate_degree": 1,
|
| 111 |
+
"data_parallel_shard_degree": 8,
|
| 112 |
+
"data_probs": null,
|
| 113 |
+
"dataset": "/mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu",
|
| 114 |
+
"dataset_name": "default",
|
| 115 |
+
"dataset_split": "train",
|
| 116 |
+
"deterministic": false,
|
| 117 |
+
"disable_loss_parallel": true,
|
| 118 |
+
"enable_cpu_offload": false,
|
| 119 |
+
"fsdp_reshard_after_forward": "default",
|
| 120 |
+
"gc_freq": 50,
|
| 121 |
+
"gradient_accumulation_steps": 16,
|
| 122 |
+
"max_norm": 1.0,
|
| 123 |
+
"mixed_precision_param": "bfloat16",
|
| 124 |
+
"mixed_precision_reduce": "float32",
|
| 125 |
+
"num_workers": 8,
|
| 126 |
+
"persistent_workers": false,
|
| 127 |
+
"pin_memory": false,
|
| 128 |
+
"prefetch_factor": 2,
|
| 129 |
+
"seed": 42,
|
| 130 |
+
"seq_len": 2048,
|
| 131 |
+
"skip_nan_inf": true,
|
| 132 |
+
"steps": 30720,
|
| 133 |
+
"streaming": true,
|
| 134 |
+
"tensor_parallel_degree": 1,
|
| 135 |
+
"varlen": false
|
| 136 |
+
}
|
| 137 |
+
}[39m
|
| 138 |
+
[titan] 2026-01-02 12:21:12,075 - root - INFO - [GC] Initial GC collection. 0.00 seconds.
|
| 139 |
+
[titan] 2026-01-02 12:21:13,342 - root - WARNING - ENV[TORCH_NCCL_ASYNC_ERROR_HANDLING] = 1 will be overridden to 3 based on job config
|
| 140 |
+
[titan] 2026-01-02 12:21:13,353 - root - INFO - CUDA capacity: NVIDIA A100-SXM4-80GB with 79.25GiB memory
|
| 141 |
+
[titan] 2026-01-02 12:21:13,355 - root - WARNING - Error running lspci: [Errno 2] No such file or directory: 'lspci', fallback to use device_name
|
| 142 |
+
[titan] 2026-01-02 12:21:13,355 - root - INFO - Peak FLOPS used for computing MFU: 3.120e+14
|
| 143 |
+
[titan] 2026-01-02 12:21:13,355 - root - INFO - Building 1-D device mesh with ['dp_shard'], [8]
|
| 144 |
+
[titan] 2026-01-02 12:21:13,421 - root - INFO - Loading tokenizer...
|
| 145 |
+
The tokenizer you are loading from '/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.
|
| 146 |
+
[titan] 2026-01-02 12:21:13,761 - root - INFO - Qwen2TokenizerFast(name_or_path='/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B', vocab_size=151643, model_max_length=10000000000, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'eos_token': '<|im_end|>', 'pad_token': '<|endoftext|>', 'additional_special_tokens': ['<|im_start|>', '<|im_end|>', '<|object_ref_start|>', '<|object_ref_end|>', '<|box_start|>', '<|box_end|>', '<|quad_start|>', '<|quad_end|>', '<|vision_start|>', '<|vision_end|>', '<|vision_pad|>', '<|image_pad|>', '<|video_pad|>']}, clean_up_tokenization_spaces=False, added_tokens_decoder={
|
| 147 |
+
151643: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 148 |
+
151644: AddedToken("<|im_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 149 |
+
151645: AddedToken("<|im_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 150 |
+
151646: AddedToken("<|object_ref_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 151 |
+
151647: AddedToken("<|object_ref_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 152 |
+
151648: AddedToken("<|box_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 153 |
+
151649: AddedToken("<|box_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 154 |
+
151650: AddedToken("<|quad_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 155 |
+
151651: AddedToken("<|quad_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 156 |
+
151652: AddedToken("<|vision_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 157 |
+
151653: AddedToken("<|vision_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 158 |
+
151654: AddedToken("<|vision_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 159 |
+
151655: AddedToken("<|image_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 160 |
+
151656: AddedToken("<|video_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 161 |
+
151657: AddedToken("<tool_call>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 162 |
+
151658: AddedToken("</tool_call>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 163 |
+
151659: AddedToken("<|fim_prefix|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 164 |
+
151660: AddedToken("<|fim_middle|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 165 |
+
151661: AddedToken("<|fim_suffix|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 166 |
+
151662: AddedToken("<|fim_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 167 |
+
151663: AddedToken("<|repo_name|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 168 |
+
151664: AddedToken("<|file_sep|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 169 |
+
151665: AddedToken("<tool_response>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 170 |
+
151666: AddedToken("</tool_response>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 171 |
+
151667: AddedToken("<think>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 172 |
+
151668: AddedToken("</think>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 173 |
+
}
|
| 174 |
+
)
|
| 175 |
+
[titan] 2026-01-02 12:21:13,761 - root - INFO - Loading dataset /mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu:default
|
| 176 |
+
`trust_remote_code` is not supported anymore.
|
| 177 |
+
Please check that the Hugging Face dataset '/mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu' isn't based on a loading script and remove `trust_remote_code`.
|
| 178 |
+
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
|
| 179 |
+
[titan] 2026-01-02 12:21:13,761 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
|
| 180 |
+
Please check that the Hugging Face dataset '/mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu' isn't based on a loading script and remove `trust_remote_code`.
|
| 181 |
+
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
|
| 182 |
+
[titan] 2026-01-02 12:21:14,322 - root - INFO - Shuffling the dataset with seed 42
|
| 183 |
+
[titan] 2026-01-02 12:21:14,323 - root - INFO - IterableDataset({
|
| 184 |
+
features: ['text', 'id', 'dump', 'url', 'file_path', 'language', 'language_score', 'token_count', 'score', 'int_score'],
|
| 185 |
+
num_shards: 360
|
| 186 |
+
})
|
| 187 |
+
[titan] 2026-01-02 12:21:14,323 - root - INFO - Building dataloader...
|
| 188 |
+
[titan] 2026-01-02 12:21:14,325 - root - INFO - Loading model config from /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/config.json
|
| 189 |
+
[titan] 2026-01-02 12:21:14,327 - root - INFO - Building model from the config
|
| 190 |
+
[32mGSAConfig {
|
| 191 |
+
"architectures": [
|
| 192 |
+
"GSAForCausalLM"
|
| 193 |
+
],
|
| 194 |
+
"attn": null,
|
| 195 |
+
"bos_token_id": 151643,
|
| 196 |
+
"clamp_max": null,
|
| 197 |
+
"clamp_min": null,
|
| 198 |
+
"conv_size": 4,
|
| 199 |
+
"dtype": "bfloat16",
|
| 200 |
+
"elementwise_affine": false,
|
| 201 |
+
"eos_token_id": 151645,
|
| 202 |
+
"expand_k": 1,
|
| 203 |
+
"expand_v": 1,
|
| 204 |
+
"feature_map": "swish",
|
| 205 |
+
"fuse_cross_entropy": true,
|
| 206 |
+
"fuse_linear_cross_entropy": false,
|
| 207 |
+
"fuse_norm": true,
|
| 208 |
+
"fuse_swiglu": true,
|
| 209 |
+
"gate_logit_normalizer": 8,
|
| 210 |
+
"hidden_act": "swish",
|
| 211 |
+
"hidden_ratio": 4,
|
| 212 |
+
"hidden_size": 5120,
|
| 213 |
+
"initializer_range": 0.02,
|
| 214 |
+
"intermediate_size": 17408,
|
| 215 |
+
"max_position_embeddings": 40960,
|
| 216 |
+
"model_type": "gsa",
|
| 217 |
+
"norm_eps": 1e-06,
|
| 218 |
+
"num_heads": 40,
|
| 219 |
+
"num_hidden_layers": 40,
|
| 220 |
+
"num_kv_heads": 8,
|
| 221 |
+
"num_slots": 256,
|
| 222 |
+
"rope_theta": 1000000,
|
| 223 |
+
"share_conv_kernel": true,
|
| 224 |
+
"tie_word_embeddings": true,
|
| 225 |
+
"transformers_version": "4.57.3",
|
| 226 |
+
"use_cache": true,
|
| 227 |
+
"use_l2warp": false,
|
| 228 |
+
"use_norm": true,
|
| 229 |
+
"use_output_gate": true,
|
| 230 |
+
"use_rope": false,
|
| 231 |
+
"use_short_conv": false,
|
| 232 |
+
"vocab_size": 151936
|
| 233 |
+
}
|
| 234 |
+
[39m
|
| 235 |
+
[titan] 2026-01-02 12:21:14,480 - root - INFO - [34m
|
| 236 |
+
GSAForCausalLM(
|
| 237 |
+
(model): GSAModel(
|
| 238 |
+
(embeddings): Embedding(151936, 5120)
|
| 239 |
+
(layers): ModuleList(
|
| 240 |
+
(0-39): 40 x GSABlock(
|
| 241 |
+
(attn_norm): RMSNorm(5120, eps=1e-06)
|
| 242 |
+
(attn): GatedSlotAttention(
|
| 243 |
+
(feature_map): SwishFeatureMap()
|
| 244 |
+
(q_proj): Linear(in_features=5120, out_features=5120, bias=False)
|
| 245 |
+
(k_proj): Linear(in_features=5120, out_features=1024, bias=False)
|
| 246 |
+
(v_proj): Linear(in_features=5120, out_features=1024, bias=False)
|
| 247 |
+
(f_proj): Linear(in_features=5120, out_features=2048, bias=False)
|
| 248 |
+
(g_norm): RMSNorm(5120, elementwise_affine=False, eps=1e-06)
|
| 249 |
+
(o_proj): Linear(in_features=5120, out_features=5120, bias=False)
|
| 250 |
+
)
|
| 251 |
+
(mlp_norm): RMSNorm(5120, eps=1e-06)
|
| 252 |
+
(mlp): GatedMLP(
|
| 253 |
+
(gate_proj): Linear(in_features=5120, out_features=17408, bias=False)
|
| 254 |
+
(up_proj): Linear(in_features=5120, out_features=17408, bias=False)
|
| 255 |
+
(down_proj): Linear(in_features=17408, out_features=5120, bias=False)
|
| 256 |
+
(swiglu_linear): SwiGLULinear()
|
| 257 |
+
)
|
| 258 |
+
)
|
| 259 |
+
)
|
| 260 |
+
(norm): RMSNorm(5120, eps=1e-06)
|
| 261 |
+
)
|
| 262 |
+
(lm_head): Linear(in_features=5120, out_features=151936, bias=False)
|
| 263 |
+
)[39m
|
| 264 |
+
|
| 265 |
+
[titan] 2026-01-02 12:21:14,540 - root - INFO - Compiling each block with torch.compile
|
| 266 |
+
[titan] 2026-01-02 12:21:14,540 - root - INFO - Compiling the embedding, norm, and lm_head layers with torch.compile
|
| 267 |
+
[titan] 2026-01-02 12:21:14,541 - root - INFO - Compiling the entire model with torch.compile
|
| 268 |
+
[titan] 2026-01-02 12:21:14,681 - root - INFO - Applied FSDP to the model
|
| 269 |
+
[titan] 2026-01-02 12:21:15,136 - root - INFO - CUDA memory usage for model: 3.56GiB(4.49%)
|
| 270 |
+
[titan] 2026-01-02 12:21:15,183 - root - INFO - Checkpointing active. Checkpoints will be loaded from and saved to /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/checkpoint
|
| 271 |
+
[titan] 2026-01-02 12:21:15,183 - root - INFO - Loading the checkpoint from /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/checkpoint/step-0.
|
| 272 |
+
[titan] 2026-01-02 12:24:11,330 - root - INFO - [GC] GC collection for checkpoint loading. 0.01 seconds.
|
| 273 |
+
[titan] 2026-01-02 12:24:11,331 - root - INFO - Finished loading the checkpoint in 176.15 seconds.
|
| 274 |
+
[titan] 2026-01-02 12:24:11,348 - root - INFO - CUDA capacity: NVIDIA A100-SXM4-80GB with 79.25GiB memory
|
| 275 |
+
[titan] 2026-01-02 12:24:11,351 - root - WARNING - Error running lspci: [Errno 2] No such file or directory: 'lspci', fallback to use device_name
|
| 276 |
+
[titan] 2026-01-02 12:24:11,534 - root - INFO - Mixed precision training is handled by fully_shard
|
| 277 |
+
[titan] 2026-01-02 12:24:11,535 - root - INFO - [31m***** Running training *****[39m
|
| 278 |
+
[titan] 2026-01-02 12:24:11,535 - root - INFO - [32m Training starts at step 1
|
| 279 |
+
[titan] 2026-01-02 12:24:11,535 - root - INFO - [32m Number of tokens per sequence = 2,048
|
| 280 |
+
[titan] 2026-01-02 12:24:11,535 - root - INFO - [32m Gradient Accumulation steps = 16
|
| 281 |
+
[titan] 2026-01-02 12:24:11,535 - root - INFO - [32m Instantaneous batch size (per device) = 2
|
| 282 |
+
[titan] 2026-01-02 12:24:11,535 - root - INFO - [32m Global batch size (w. parallel, distributed & accumulation) = 256 (524,288 tokens)
|
| 283 |
+
[titan] 2026-01-02 12:24:11,535 - root - INFO - [32m Total optimization steps = 30,720 (16,106,127,360 tokens)
|
| 284 |
+
[titan] 2026-01-02 12:24:11,535 - root - INFO - [32m Warmup steps = 1,024 (536,870,912 tokens)
|
| 285 |
+
[titan] 2026-01-02 12:24:11,535 - root - INFO - [32m Number of parameters = 14,409,815,040 [39m
|
| 286 |
+
[titan] 2026-01-02 12:24:11,535 - root - INFO - Profiling active. Traces will be saved at /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/profile_trace
|
| 287 |
+
/mnt/scratch/home/carlos/leman-labs/flame/.venv/lib/python3.13/site-packages/torch/_dynamo/variables/functions.py:1692: UserWarning: Dynamo detected a call to a `functools.lru_cache`-wrapped function. Dynamo ignores the cache wrapper and directly traces the wrapped function. Silent incorrectness is only a *potential* risk, not something we have observed. Enable TORCH_LOGS="+dynamo" for a DEBUG stack trace.
|
| 288 |
+
torch._dynamo.utils.warn_once(msg)
|
| 289 |
+
/mnt/scratch/home/carlos/leman-labs/flame/.venv/lib/python3.13/site-packages/torch/_dynamo/variables/functions.py:1598: UserWarning: Dynamo does not know how to trace the builtin `cuda_utils.get_device_properties.` This function is either a Python builtin (e.g. _warnings.warn) or a third-party C/C++ Python extension (perhaps created with pybind).
|
| 290 |
+
If it is a Python builtin, please file an issue on GitHub so the PyTorch team can add support for it and see the next case for a workaround.
|
| 291 |
+
If it is a third-party C/C++ Python extension, please either wrap it into a PyTorch-understood custom operator (see https://pytorch.org/tutorials/advanced/custom_ops_landing_page.html for more details) or, if it is traceable, use `torch.compiler.allow_in_graph`.
|
| 292 |
+
torch._dynamo.utils.warn_once(explanation + "\n" + "\n".join(hints))
|
| 293 |
+
[titan] 2026-01-02 12:31:16,003 - root - INFO - [31mstep: 1 [32mloss: 14.3857 [33mmemory: 65.22GiB(82.29%) [34mtps: 154 [36mtflops: 14.12 [35mmfu: 4.53%[39m
|
| 294 |
+
[titan] 2026-01-02 12:31:16,003 - root - INFO - [34mlr: 7.8125e-07 gnorm: 129.00 [35m[ 0:07:04<150 days, 23:34:27][39m
|
| 295 |
+
[titan] 2026-01-02 12:31:16,003 - root - INFO - Saving the checkpoint (or staging if async is enabled).
|
| 296 |
+
[titan] 2026-01-02 12:31:45,842 - root - INFO - [GC] GC collection invoked by checkpointer. 0.46 seconds.
|
| 297 |
+
[titan] 2026-01-02 12:31:45,842 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 29.84 seconds.
|
| 298 |
+
[titan] 2026-01-02 12:31:45,842 - root - INFO - Synchronizing and adjusting timeout for all ProcessGroups to 0:01:40
|
| 299 |
+
[titan] 2026-01-02 12:32:27,287 - root - INFO - [31mstep: 2 [32mloss: 14.3989 [33mmemory: 69.11GiB(87.20%) [34mtps: 919 [36mtflops: 84.11 [35mmfu: 26.96%[39m
|
| 300 |
+
[titan] 2026-01-02 12:32:27,288 - root - INFO - [34mlr: 1.1719e-06 gnorm: 127.00 [35m[ 0:08:15<88 days, 3:51:19][39m
|
| 301 |
+
[titan] 2026-01-02 12:33:08,758 - root - INFO - [31mstep: 3 [32mloss: 14.3929 [33mmemory: 69.11GiB(87.20%) [34mtps: 1,580 [36mtflops: 144.59 [35mmfu: 46.34%[39m
|
| 302 |
+
[titan] 2026-01-02 12:33:08,759 - root - INFO - [34mlr: 1.5625e-06 gnorm: 126.00 [35m[ 0:08:57<63 days, 16:28:22][39m
|
| 303 |
+
[titan] 2026-01-02 12:33:50,300 - root - INFO - [31mstep: 4 [32mloss: 14.2932 [33mmemory: 69.11GiB(87.20%) [34mtps: 1,578 [36mtflops: 144.34 [35mmfu: 46.26%[39m
|
| 304 |
+
[titan] 2026-01-02 12:33:50,300 - root - INFO - [34mlr: 1.9531e-06 gnorm: 128.00 [35m[ 0:09:38<51 days, 10:55:38][39m
|
| 305 |
+
[titan] 2026-01-02 12:34:31,898 - root - INFO - [31mstep: 5 [32mloss: 14.2689 [33mmemory: 69.11GiB(87.20%) [34mtps: 1,575 [36mtflops: 144.14 [35mmfu: 46.20%[39m
|
| 306 |
+
[titan] 2026-01-02 12:34:31,898 - root - INFO - [34mlr: 2.3438e-06 gnorm: 124.00 [35m[ 0:10:20<44 days, 2:53:26][39m
|
| 307 |
+
[titan] 2026-01-02 12:35:13,509 - root - INFO - [31mstep: 6 [32mloss: 13.9979 [33mmemory: 69.11GiB(87.20%) [34mtps: 1,575 [36mtflops: 144.10 [35mmfu: 46.19%[39m
|
| 308 |
+
[titan] 2026-01-02 12:35:13,509 - root - INFO - [34mlr: 2.7344e-06 gnorm: 117.00 [35m[ 0:11:02<39 days, 5:32:54][39m
|
| 309 |
+
[titan] 2026-01-02 12:35:55,155 - root - INFO - [31mstep: 7 [32mloss: 13.8167 [33mmemory: 69.11GiB(87.20%) [34mtps: 1,574 [36mtflops: 143.98 [35mmfu: 46.15%[39m
|
| 310 |
+
[titan] 2026-01-02 12:35:55,155 - root - INFO - [34mlr: 3.1250e-06 gnorm: 113.00 [35m[ 0:11:43<35 days, 17:46:18][39m
|
| 311 |
+
[titan] 2026-01-02 12:36:36,792 - root - INFO - [31mstep: 8 [32mloss: 13.5683 [33mmemory: 69.11GiB(87.20%) [34mtps: 1,574 [36mtflops: 144.01 [35mmfu: 46.16%[39m
|
| 312 |
+
[titan] 2026-01-02 12:36:36,792 - root - INFO - [34mlr: 3.5156e-06 gnorm: 106.50 [35m[ 0:12:25<33 days, 2:55:35][39m
|
| 313 |
+
[titan] 2026-01-02 12:37:18,455 - root - INFO - [31mstep: 9 [32mloss: 13.3760 [33mmemory: 69.11GiB(87.20%) [34mtps: 1,573 [36mtflops: 143.92 [35mmfu: 46.13%[39m
|
| 314 |
+
[titan] 2026-01-02 12:37:18,456 - root - INFO - [34mlr: 3.9063e-06 gnorm: 101.00 [35m[ 0:13:07<31 days, 2:04:10][39m
|
| 315 |
+
[titan] 2026-01-02 12:38:00,109 - root - INFO - [31mstep: 10 [32mloss: 13.1097 [33mmemory: 69.11GiB(87.20%) [34mtps: 1,573 [36mtflops: 143.95 [35mmfu: 46.14%[39m
|
| 316 |
+
[titan] 2026-01-02 12:38:00,110 - root - INFO - [34mlr: 4.2969e-06 gnorm: 94.50 [35m[ 0:13:48<29 days, 10:58:25][39m
|
| 317 |
+
[titan] 2026-01-02 12:38:41,790 - root - INFO - [31mstep: 11 [32mloss: 12.5536 [33mmemory: 69.11GiB(87.20%) [34mtps: 1,572 [36mtflops: 143.86 [35mmfu: 46.11%[39m
|
| 318 |
+
[titan] 2026-01-02 12:38:41,790 - root - INFO - [34mlr: 4.6875e-06 gnorm: 82.00 [35m[ 0:14:30<28 days, 3:00:15][39m
|
| 319 |
+
[titan] 2026-01-02 12:39:23,448 - root - INFO - [31mstep: 12 [32mloss: 12.0247 [33mmemory: 69.11GiB(87.20%) [34mtps: 1,573 [36mtflops: 143.93 [35mmfu: 46.13%[39m
|
| 320 |
+
[titan] 2026-01-02 12:39:23,449 - root - INFO - [34mlr: 5.0781e-06 gnorm: 71.50 [35m[ 0:15:12<27 days, 0:20:44][39m
|
| 321 |
+
[titan] 2026-01-02 12:40:05,132 - root - INFO - [31mstep: 13 [32mloss: 11.6076 [33mmemory: 69.11GiB(87.20%) [34mtps: 1,572 [36mtflops: 143.85 [35mmfu: 46.10%[39m
|
| 322 |
+
[titan] 2026-01-02 12:40:05,132 - root - INFO - [34mlr: 5.4687e-06 gnorm: 68.50 [35m[ 0:15:53<26 days, 1:48:11][39m
|
| 323 |
+
[titan] 2026-01-02 12:40:46,797 - root - INFO - [31mstep: 14 [32mloss: 11.2488 [33mmemory: 69.11GiB(87.20%) [34mtps: 1,573 [36mtflops: 143.91 [35mmfu: 46.13%[39m
|
| 324 |
+
[titan] 2026-01-02 12:40:46,797 - root - INFO - [34mlr: 5.8594e-06 gnorm: 63.75 [35m[ 0:16:35<25 days, 6:28:04][39m
|
| 325 |
+
[titan] 2026-01-02 12:41:28,477 - root - INFO - [31mstep: 15 [32mloss: 10.9254 [33mmemory: 69.11GiB(87.20%) [34mtps: 1,572 [36mtflops: 143.86 [35mmfu: 46.11%[39m
|
| 326 |
+
[titan] 2026-01-02 12:41:28,477 - root - INFO - [34mlr: 6.2500e-06 gnorm: 55.50 [35m[ 0:17:17<24 days, 13:43:04][39m
|
| 327 |
+
[titan] 2026-01-02 12:42:10,134 - root - INFO - [31mstep: 16 [32mloss: 10.6961 [33mmemory: 69.11GiB(87.20%) [34mtps: 1,573 [36mtflops: 143.94 [35mmfu: 46.13%[39m
|
| 328 |
+
[titan] 2026-01-02 12:42:10,134 - root - INFO - [34mlr: 6.6406e-06 gnorm: 56.50 [35m[ 0:17:58<23 days, 23:02:52][39m
|
| 329 |
+
[titan] 2026-01-02 12:42:51,815 - root - INFO - [31mstep: 17 [32mloss: 10.3915 [33mmemory: 69.11GiB(87.20%) [34mtps: 1,572 [36mtflops: 143.86 [35mmfu: 46.11%[39m
|
| 330 |
+
[titan] 2026-01-02 12:42:51,815 - root - INFO - [34mlr: 7.0313e-06 gnorm: 42.75 [35m[ 0:18:40<23 days, 10:06:52][39m
|
| 331 |
+
[titan] 2026-01-02 12:43:33,461 - root - INFO - [31mstep: 18 [32mloss: 10.1740 [33mmemory: 69.11GiB(87.20%) [34mtps: 1,574 [36mtflops: 143.98 [35mmfu: 46.15%[39m
|
| 332 |
+
[titan] 2026-01-02 12:43:33,461 - root - INFO - [34mlr: 7.4219e-06 gnorm: 32.75 [35m[ 0:19:22<22 days, 22:35:59][39m
|
logs/none_rci5peh0/attempt_0/5/stdout.log
ADDED
|
File without changes
|
logs/none_rci5peh0/attempt_0/6/stderr.log
ADDED
|
@@ -0,0 +1,332 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[titan] 2026-01-02 12:21:12,073 - root - INFO - Starting job: default job
|
| 2 |
+
[titan] 2026-01-02 12:21:12,073 - root - INFO - [32m{
|
| 3 |
+
"activation_checkpoint": {
|
| 4 |
+
"mode": "none",
|
| 5 |
+
"selective_ac_option": "2"
|
| 6 |
+
},
|
| 7 |
+
"activation_offload": {
|
| 8 |
+
"mode": "none"
|
| 9 |
+
},
|
| 10 |
+
"checkpoint": {
|
| 11 |
+
"async_mode": "disabled",
|
| 12 |
+
"create_seed_checkpoint": false,
|
| 13 |
+
"enable_checkpoint": true,
|
| 14 |
+
"exclude_from_loading": [],
|
| 15 |
+
"export_dtype": "float32",
|
| 16 |
+
"folder": "checkpoint",
|
| 17 |
+
"initial_load_model_weights_only": true,
|
| 18 |
+
"initial_load_path": null,
|
| 19 |
+
"interval": 3072,
|
| 20 |
+
"interval_type": "steps",
|
| 21 |
+
"keep_latest_k": 0,
|
| 22 |
+
"last_save_model_weights_only": false,
|
| 23 |
+
"load_step": -1,
|
| 24 |
+
"model_weights_only": false
|
| 25 |
+
},
|
| 26 |
+
"comm": {
|
| 27 |
+
"init_timeout_seconds": 300,
|
| 28 |
+
"trace_buf_size": 20000,
|
| 29 |
+
"train_timeout_seconds": 100
|
| 30 |
+
},
|
| 31 |
+
"experimental": {
|
| 32 |
+
"context_parallel_degree": 1,
|
| 33 |
+
"context_parallel_rotate_method": "allgather",
|
| 34 |
+
"custom_model_path": "",
|
| 35 |
+
"enable_async_tensor_parallel": false,
|
| 36 |
+
"enable_compiled_autograd": false,
|
| 37 |
+
"pipeline_parallel_degree": 1,
|
| 38 |
+
"pipeline_parallel_microbatches": null,
|
| 39 |
+
"pipeline_parallel_schedule": "1F1B",
|
| 40 |
+
"pipeline_parallel_schedule_csv": "",
|
| 41 |
+
"pipeline_parallel_split_points": []
|
| 42 |
+
},
|
| 43 |
+
"fault_tolerance": {
|
| 44 |
+
"enable": false,
|
| 45 |
+
"group_size": 0,
|
| 46 |
+
"min_replica_size": 1,
|
| 47 |
+
"replica_id": 0
|
| 48 |
+
},
|
| 49 |
+
"float8": {
|
| 50 |
+
"enable_fsdp_float8_all_gather": false,
|
| 51 |
+
"force_recompute_fp8_weight_in_bwd": false,
|
| 52 |
+
"precompute_float8_dynamic_scale_for_fsdp": false,
|
| 53 |
+
"recipe_name": null
|
| 54 |
+
},
|
| 55 |
+
"job": {
|
| 56 |
+
"config_file": "flame/models/fla.toml",
|
| 57 |
+
"description": "default job",
|
| 58 |
+
"dump_folder": "/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B",
|
| 59 |
+
"print_args": true,
|
| 60 |
+
"use_for_integration_test": false
|
| 61 |
+
},
|
| 62 |
+
"lr_scheduler": {
|
| 63 |
+
"decay_ratio": null,
|
| 64 |
+
"decay_type": "cosine",
|
| 65 |
+
"lr_min": 0.1,
|
| 66 |
+
"warmup_steps": 1024
|
| 67 |
+
},
|
| 68 |
+
"memory_estimation": {
|
| 69 |
+
"disable_fake_mode": false,
|
| 70 |
+
"enabled": false
|
| 71 |
+
},
|
| 72 |
+
"metrics": {
|
| 73 |
+
"disable_color_printing": false,
|
| 74 |
+
"enable_tensorboard": false,
|
| 75 |
+
"enable_wandb": true,
|
| 76 |
+
"log_freq": 1,
|
| 77 |
+
"save_for_all_ranks": false,
|
| 78 |
+
"save_tb_folder": "tb"
|
| 79 |
+
},
|
| 80 |
+
"model": {
|
| 81 |
+
"config": "/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/config.json",
|
| 82 |
+
"converters": [],
|
| 83 |
+
"name": "fla",
|
| 84 |
+
"print_after_conversion": false,
|
| 85 |
+
"tokenizer_path": "/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B"
|
| 86 |
+
},
|
| 87 |
+
"optimizer": {
|
| 88 |
+
"beta1": 0.9,
|
| 89 |
+
"beta2": 0.95,
|
| 90 |
+
"early_step_in_backward": false,
|
| 91 |
+
"eps": 1e-15,
|
| 92 |
+
"implementation": "fused",
|
| 93 |
+
"lr": 0.0004,
|
| 94 |
+
"name": "AdamW",
|
| 95 |
+
"weight_decay": 0.1
|
| 96 |
+
},
|
| 97 |
+
"profiling": {
|
| 98 |
+
"enable_memory_snapshot": false,
|
| 99 |
+
"enable_profiling": true,
|
| 100 |
+
"profile_freq": 512,
|
| 101 |
+
"save_memory_snapshot_folder": "memory_snapshot",
|
| 102 |
+
"save_traces_folder": "profile_trace"
|
| 103 |
+
},
|
| 104 |
+
"training": {
|
| 105 |
+
"batch_size": 2,
|
| 106 |
+
"compile": true,
|
| 107 |
+
"context_len": 2048,
|
| 108 |
+
"data_dir": null,
|
| 109 |
+
"data_files": null,
|
| 110 |
+
"data_parallel_replicate_degree": 1,
|
| 111 |
+
"data_parallel_shard_degree": 8,
|
| 112 |
+
"data_probs": null,
|
| 113 |
+
"dataset": "/mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu",
|
| 114 |
+
"dataset_name": "default",
|
| 115 |
+
"dataset_split": "train",
|
| 116 |
+
"deterministic": false,
|
| 117 |
+
"disable_loss_parallel": true,
|
| 118 |
+
"enable_cpu_offload": false,
|
| 119 |
+
"fsdp_reshard_after_forward": "default",
|
| 120 |
+
"gc_freq": 50,
|
| 121 |
+
"gradient_accumulation_steps": 16,
|
| 122 |
+
"max_norm": 1.0,
|
| 123 |
+
"mixed_precision_param": "bfloat16",
|
| 124 |
+
"mixed_precision_reduce": "float32",
|
| 125 |
+
"num_workers": 8,
|
| 126 |
+
"persistent_workers": false,
|
| 127 |
+
"pin_memory": false,
|
| 128 |
+
"prefetch_factor": 2,
|
| 129 |
+
"seed": 42,
|
| 130 |
+
"seq_len": 2048,
|
| 131 |
+
"skip_nan_inf": true,
|
| 132 |
+
"steps": 30720,
|
| 133 |
+
"streaming": true,
|
| 134 |
+
"tensor_parallel_degree": 1,
|
| 135 |
+
"varlen": false
|
| 136 |
+
}
|
| 137 |
+
}[39m
|
| 138 |
+
[titan] 2026-01-02 12:21:12,074 - root - INFO - [GC] Initial GC collection. 0.00 seconds.
|
| 139 |
+
[titan] 2026-01-02 12:21:13,360 - root - WARNING - ENV[TORCH_NCCL_ASYNC_ERROR_HANDLING] = 1 will be overridden to 3 based on job config
|
| 140 |
+
[titan] 2026-01-02 12:21:13,363 - root - INFO - CUDA capacity: NVIDIA A100-SXM4-80GB with 79.25GiB memory
|
| 141 |
+
[titan] 2026-01-02 12:21:13,365 - root - WARNING - Error running lspci: [Errno 2] No such file or directory: 'lspci', fallback to use device_name
|
| 142 |
+
[titan] 2026-01-02 12:21:13,365 - root - INFO - Peak FLOPS used for computing MFU: 3.120e+14
|
| 143 |
+
[titan] 2026-01-02 12:21:13,366 - root - INFO - Building 1-D device mesh with ['dp_shard'], [8]
|
| 144 |
+
[titan] 2026-01-02 12:21:13,423 - root - INFO - Loading tokenizer...
|
| 145 |
+
The tokenizer you are loading from '/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.
|
| 146 |
+
[titan] 2026-01-02 12:21:13,777 - root - INFO - Qwen2TokenizerFast(name_or_path='/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B', vocab_size=151643, model_max_length=10000000000, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'eos_token': '<|im_end|>', 'pad_token': '<|endoftext|>', 'additional_special_tokens': ['<|im_start|>', '<|im_end|>', '<|object_ref_start|>', '<|object_ref_end|>', '<|box_start|>', '<|box_end|>', '<|quad_start|>', '<|quad_end|>', '<|vision_start|>', '<|vision_end|>', '<|vision_pad|>', '<|image_pad|>', '<|video_pad|>']}, clean_up_tokenization_spaces=False, added_tokens_decoder={
|
| 147 |
+
151643: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 148 |
+
151644: AddedToken("<|im_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 149 |
+
151645: AddedToken("<|im_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 150 |
+
151646: AddedToken("<|object_ref_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 151 |
+
151647: AddedToken("<|object_ref_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 152 |
+
151648: AddedToken("<|box_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 153 |
+
151649: AddedToken("<|box_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 154 |
+
151650: AddedToken("<|quad_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 155 |
+
151651: AddedToken("<|quad_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 156 |
+
151652: AddedToken("<|vision_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 157 |
+
151653: AddedToken("<|vision_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 158 |
+
151654: AddedToken("<|vision_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 159 |
+
151655: AddedToken("<|image_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 160 |
+
151656: AddedToken("<|video_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 161 |
+
151657: AddedToken("<tool_call>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 162 |
+
151658: AddedToken("</tool_call>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 163 |
+
151659: AddedToken("<|fim_prefix|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 164 |
+
151660: AddedToken("<|fim_middle|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 165 |
+
151661: AddedToken("<|fim_suffix|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 166 |
+
151662: AddedToken("<|fim_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 167 |
+
151663: AddedToken("<|repo_name|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 168 |
+
151664: AddedToken("<|file_sep|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 169 |
+
151665: AddedToken("<tool_response>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 170 |
+
151666: AddedToken("</tool_response>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 171 |
+
151667: AddedToken("<think>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 172 |
+
151668: AddedToken("</think>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 173 |
+
}
|
| 174 |
+
)
|
| 175 |
+
[titan] 2026-01-02 12:21:13,777 - root - INFO - Loading dataset /mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu:default
|
| 176 |
+
`trust_remote_code` is not supported anymore.
|
| 177 |
+
Please check that the Hugging Face dataset '/mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu' isn't based on a loading script and remove `trust_remote_code`.
|
| 178 |
+
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
|
| 179 |
+
[titan] 2026-01-02 12:21:13,777 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
|
| 180 |
+
Please check that the Hugging Face dataset '/mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu' isn't based on a loading script and remove `trust_remote_code`.
|
| 181 |
+
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
|
| 182 |
+
[titan] 2026-01-02 12:21:14,322 - root - INFO - Shuffling the dataset with seed 42
|
| 183 |
+
[titan] 2026-01-02 12:21:14,323 - root - INFO - IterableDataset({
|
| 184 |
+
features: ['text', 'id', 'dump', 'url', 'file_path', 'language', 'language_score', 'token_count', 'score', 'int_score'],
|
| 185 |
+
num_shards: 360
|
| 186 |
+
})
|
| 187 |
+
[titan] 2026-01-02 12:21:14,323 - root - INFO - Building dataloader...
|
| 188 |
+
[titan] 2026-01-02 12:21:14,325 - root - INFO - Loading model config from /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/config.json
|
| 189 |
+
[titan] 2026-01-02 12:21:14,326 - root - INFO - Building model from the config
|
| 190 |
+
[32mGSAConfig {
|
| 191 |
+
"architectures": [
|
| 192 |
+
"GSAForCausalLM"
|
| 193 |
+
],
|
| 194 |
+
"attn": null,
|
| 195 |
+
"bos_token_id": 151643,
|
| 196 |
+
"clamp_max": null,
|
| 197 |
+
"clamp_min": null,
|
| 198 |
+
"conv_size": 4,
|
| 199 |
+
"dtype": "bfloat16",
|
| 200 |
+
"elementwise_affine": false,
|
| 201 |
+
"eos_token_id": 151645,
|
| 202 |
+
"expand_k": 1,
|
| 203 |
+
"expand_v": 1,
|
| 204 |
+
"feature_map": "swish",
|
| 205 |
+
"fuse_cross_entropy": true,
|
| 206 |
+
"fuse_linear_cross_entropy": false,
|
| 207 |
+
"fuse_norm": true,
|
| 208 |
+
"fuse_swiglu": true,
|
| 209 |
+
"gate_logit_normalizer": 8,
|
| 210 |
+
"hidden_act": "swish",
|
| 211 |
+
"hidden_ratio": 4,
|
| 212 |
+
"hidden_size": 5120,
|
| 213 |
+
"initializer_range": 0.02,
|
| 214 |
+
"intermediate_size": 17408,
|
| 215 |
+
"max_position_embeddings": 40960,
|
| 216 |
+
"model_type": "gsa",
|
| 217 |
+
"norm_eps": 1e-06,
|
| 218 |
+
"num_heads": 40,
|
| 219 |
+
"num_hidden_layers": 40,
|
| 220 |
+
"num_kv_heads": 8,
|
| 221 |
+
"num_slots": 256,
|
| 222 |
+
"rope_theta": 1000000,
|
| 223 |
+
"share_conv_kernel": true,
|
| 224 |
+
"tie_word_embeddings": true,
|
| 225 |
+
"transformers_version": "4.57.3",
|
| 226 |
+
"use_cache": true,
|
| 227 |
+
"use_l2warp": false,
|
| 228 |
+
"use_norm": true,
|
| 229 |
+
"use_output_gate": true,
|
| 230 |
+
"use_rope": false,
|
| 231 |
+
"use_short_conv": false,
|
| 232 |
+
"vocab_size": 151936
|
| 233 |
+
}
|
| 234 |
+
[39m
|
| 235 |
+
[titan] 2026-01-02 12:21:14,481 - root - INFO - [34m
|
| 236 |
+
GSAForCausalLM(
|
| 237 |
+
(model): GSAModel(
|
| 238 |
+
(embeddings): Embedding(151936, 5120)
|
| 239 |
+
(layers): ModuleList(
|
| 240 |
+
(0-39): 40 x GSABlock(
|
| 241 |
+
(attn_norm): RMSNorm(5120, eps=1e-06)
|
| 242 |
+
(attn): GatedSlotAttention(
|
| 243 |
+
(feature_map): SwishFeatureMap()
|
| 244 |
+
(q_proj): Linear(in_features=5120, out_features=5120, bias=False)
|
| 245 |
+
(k_proj): Linear(in_features=5120, out_features=1024, bias=False)
|
| 246 |
+
(v_proj): Linear(in_features=5120, out_features=1024, bias=False)
|
| 247 |
+
(f_proj): Linear(in_features=5120, out_features=2048, bias=False)
|
| 248 |
+
(g_norm): RMSNorm(5120, elementwise_affine=False, eps=1e-06)
|
| 249 |
+
(o_proj): Linear(in_features=5120, out_features=5120, bias=False)
|
| 250 |
+
)
|
| 251 |
+
(mlp_norm): RMSNorm(5120, eps=1e-06)
|
| 252 |
+
(mlp): GatedMLP(
|
| 253 |
+
(gate_proj): Linear(in_features=5120, out_features=17408, bias=False)
|
| 254 |
+
(up_proj): Linear(in_features=5120, out_features=17408, bias=False)
|
| 255 |
+
(down_proj): Linear(in_features=17408, out_features=5120, bias=False)
|
| 256 |
+
(swiglu_linear): SwiGLULinear()
|
| 257 |
+
)
|
| 258 |
+
)
|
| 259 |
+
)
|
| 260 |
+
(norm): RMSNorm(5120, eps=1e-06)
|
| 261 |
+
)
|
| 262 |
+
(lm_head): Linear(in_features=5120, out_features=151936, bias=False)
|
| 263 |
+
)[39m
|
| 264 |
+
|
| 265 |
+
[titan] 2026-01-02 12:21:14,538 - root - INFO - Compiling each block with torch.compile
|
| 266 |
+
[titan] 2026-01-02 12:21:14,538 - root - INFO - Compiling the embedding, norm, and lm_head layers with torch.compile
|
| 267 |
+
[titan] 2026-01-02 12:21:14,539 - root - INFO - Compiling the entire model with torch.compile
|
| 268 |
+
[titan] 2026-01-02 12:21:14,678 - root - INFO - Applied FSDP to the model
|
| 269 |
+
[titan] 2026-01-02 12:21:15,136 - root - INFO - CUDA memory usage for model: 3.56GiB(4.49%)
|
| 270 |
+
[titan] 2026-01-02 12:21:15,182 - root - INFO - Checkpointing active. Checkpoints will be loaded from and saved to /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/checkpoint
|
| 271 |
+
[titan] 2026-01-02 12:21:15,183 - root - INFO - Loading the checkpoint from /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/checkpoint/step-0.
|
| 272 |
+
[titan] 2026-01-02 12:24:11,330 - root - INFO - [GC] GC collection for checkpoint loading. 0.01 seconds.
|
| 273 |
+
[titan] 2026-01-02 12:24:11,331 - root - INFO - Finished loading the checkpoint in 176.15 seconds.
|
| 274 |
+
[titan] 2026-01-02 12:24:11,349 - root - INFO - CUDA capacity: NVIDIA A100-SXM4-80GB with 79.25GiB memory
|
| 275 |
+
[titan] 2026-01-02 12:24:11,351 - root - WARNING - Error running lspci: [Errno 2] No such file or directory: 'lspci', fallback to use device_name
|
| 276 |
+
[titan] 2026-01-02 12:24:11,619 - root - INFO - Mixed precision training is handled by fully_shard
|
| 277 |
+
[titan] 2026-01-02 12:24:11,620 - root - INFO - [31m***** Running training *****[39m
|
| 278 |
+
[titan] 2026-01-02 12:24:11,620 - root - INFO - [32m Training starts at step 1
|
| 279 |
+
[titan] 2026-01-02 12:24:11,620 - root - INFO - [32m Number of tokens per sequence = 2,048
|
| 280 |
+
[titan] 2026-01-02 12:24:11,620 - root - INFO - [32m Gradient Accumulation steps = 16
|
| 281 |
+
[titan] 2026-01-02 12:24:11,620 - root - INFO - [32m Instantaneous batch size (per device) = 2
|
| 282 |
+
[titan] 2026-01-02 12:24:11,620 - root - INFO - [32m Global batch size (w. parallel, distributed & accumulation) = 256 (524,288 tokens)
|
| 283 |
+
[titan] 2026-01-02 12:24:11,620 - root - INFO - [32m Total optimization steps = 30,720 (16,106,127,360 tokens)
|
| 284 |
+
[titan] 2026-01-02 12:24:11,620 - root - INFO - [32m Warmup steps = 1,024 (536,870,912 tokens)
|
| 285 |
+
[titan] 2026-01-02 12:24:11,620 - root - INFO - [32m Number of parameters = 14,409,815,040 [39m
|
| 286 |
+
[titan] 2026-01-02 12:24:11,620 - root - INFO - Profiling active. Traces will be saved at /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/profile_trace
|
| 287 |
+
/mnt/scratch/home/carlos/leman-labs/flame/.venv/lib/python3.13/site-packages/torch/_dynamo/variables/functions.py:1692: UserWarning: Dynamo detected a call to a `functools.lru_cache`-wrapped function. Dynamo ignores the cache wrapper and directly traces the wrapped function. Silent incorrectness is only a *potential* risk, not something we have observed. Enable TORCH_LOGS="+dynamo" for a DEBUG stack trace.
|
| 288 |
+
torch._dynamo.utils.warn_once(msg)
|
| 289 |
+
/mnt/scratch/home/carlos/leman-labs/flame/.venv/lib/python3.13/site-packages/torch/_dynamo/variables/functions.py:1598: UserWarning: Dynamo does not know how to trace the builtin `cuda_utils.get_device_properties.` This function is either a Python builtin (e.g. _warnings.warn) or a third-party C/C++ Python extension (perhaps created with pybind).
|
| 290 |
+
If it is a Python builtin, please file an issue on GitHub so the PyTorch team can add support for it and see the next case for a workaround.
|
| 291 |
+
If it is a third-party C/C++ Python extension, please either wrap it into a PyTorch-understood custom operator (see https://pytorch.org/tutorials/advanced/custom_ops_landing_page.html for more details) or, if it is traceable, use `torch.compiler.allow_in_graph`.
|
| 292 |
+
torch._dynamo.utils.warn_once(explanation + "\n" + "\n".join(hints))
|
| 293 |
+
[titan] 2026-01-02 12:31:16,002 - root - INFO - [31mstep: 1 [32mloss: 14.3857 [33mmemory: 65.22GiB(82.29%) [34mtps: 154 [36mtflops: 14.12 [35mmfu: 4.53%[39m
|
| 294 |
+
[titan] 2026-01-02 12:31:16,003 - root - INFO - [34mlr: 7.8125e-07 gnorm: 129.00 [35m[ 0:07:04<150 days, 23:34:08][39m
|
| 295 |
+
[titan] 2026-01-02 12:31:16,003 - root - INFO - Saving the checkpoint (or staging if async is enabled).
|
| 296 |
+
[titan] 2026-01-02 12:31:46,019 - root - INFO - [GC] GC collection invoked by checkpointer. 0.63 seconds.
|
| 297 |
+
[titan] 2026-01-02 12:31:46,019 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 30.02 seconds.
|
| 298 |
+
[titan] 2026-01-02 12:31:46,020 - root - INFO - Synchronizing and adjusting timeout for all ProcessGroups to 0:01:40
|
| 299 |
+
[titan] 2026-01-02 12:32:27,288 - root - INFO - [31mstep: 2 [32mloss: 14.3989 [33mmemory: 69.11GiB(87.20%) [34mtps: 919 [36mtflops: 84.11 [35mmfu: 26.96%[39m
|
| 300 |
+
[titan] 2026-01-02 12:32:27,288 - root - INFO - [34mlr: 1.1719e-06 gnorm: 127.00 [35m[ 0:08:15<88 days, 3:51:12][39m
|
| 301 |
+
[titan] 2026-01-02 12:33:08,759 - root - INFO - [31mstep: 3 [32mloss: 14.3929 [33mmemory: 69.11GiB(87.20%) [34mtps: 1,580 [36mtflops: 144.59 [35mmfu: 46.34%[39m
|
| 302 |
+
[titan] 2026-01-02 12:33:08,759 - root - INFO - [34mlr: 1.5625e-06 gnorm: 126.00 [35m[ 0:08:57<63 days, 16:28:18][39m
|
| 303 |
+
[titan] 2026-01-02 12:33:50,300 - root - INFO - [31mstep: 4 [32mloss: 14.2932 [33mmemory: 69.11GiB(87.20%) [34mtps: 1,578 [36mtflops: 144.34 [35mmfu: 46.26%[39m
|
| 304 |
+
[titan] 2026-01-02 12:33:50,300 - root - INFO - [34mlr: 1.9531e-06 gnorm: 128.00 [35m[ 0:09:38<51 days, 10:55:33][39m
|
| 305 |
+
[titan] 2026-01-02 12:34:31,898 - root - INFO - [31mstep: 5 [32mloss: 14.2689 [33mmemory: 69.11GiB(87.20%) [34mtps: 1,575 [36mtflops: 144.14 [35mmfu: 46.20%[39m
|
| 306 |
+
[titan] 2026-01-02 12:34:31,898 - root - INFO - [34mlr: 2.3438e-06 gnorm: 124.00 [35m[ 0:10:20<44 days, 2:53:22][39m
|
| 307 |
+
[titan] 2026-01-02 12:35:13,509 - root - INFO - [31mstep: 6 [32mloss: 13.9979 [33mmemory: 69.11GiB(87.20%) [34mtps: 1,575 [36mtflops: 144.10 [35mmfu: 46.19%[39m
|
| 308 |
+
[titan] 2026-01-02 12:35:13,509 - root - INFO - [34mlr: 2.7344e-06 gnorm: 117.00 [35m[ 0:11:02<39 days, 5:32:51][39m
|
| 309 |
+
[titan] 2026-01-02 12:35:55,155 - root - INFO - [31mstep: 7 [32mloss: 13.8167 [33mmemory: 69.11GiB(87.20%) [34mtps: 1,574 [36mtflops: 143.98 [35mmfu: 46.15%[39m
|
| 310 |
+
[titan] 2026-01-02 12:35:55,155 - root - INFO - [34mlr: 3.1250e-06 gnorm: 113.00 [35m[ 0:11:43<35 days, 17:46:15][39m
|
| 311 |
+
[titan] 2026-01-02 12:36:36,792 - root - INFO - [31mstep: 8 [32mloss: 13.5683 [33mmemory: 69.11GiB(87.20%) [34mtps: 1,574 [36mtflops: 144.01 [35mmfu: 46.16%[39m
|
| 312 |
+
[titan] 2026-01-02 12:36:36,792 - root - INFO - [34mlr: 3.5156e-06 gnorm: 106.50 [35m[ 0:12:25<33 days, 2:55:33][39m
|
| 313 |
+
[titan] 2026-01-02 12:37:18,455 - root - INFO - [31mstep: 9 [32mloss: 13.3760 [33mmemory: 69.11GiB(87.20%) [34mtps: 1,573 [36mtflops: 143.92 [35mmfu: 46.13%[39m
|
| 314 |
+
[titan] 2026-01-02 12:37:18,455 - root - INFO - [34mlr: 3.9063e-06 gnorm: 101.00 [35m[ 0:13:07<31 days, 2:04:08][39m
|
| 315 |
+
[titan] 2026-01-02 12:38:00,109 - root - INFO - [31mstep: 10 [32mloss: 13.1097 [33mmemory: 69.11GiB(87.20%) [34mtps: 1,573 [36mtflops: 143.95 [35mmfu: 46.14%[39m
|
| 316 |
+
[titan] 2026-01-02 12:38:00,110 - root - INFO - [34mlr: 4.2969e-06 gnorm: 94.50 [35m[ 0:13:48<29 days, 10:58:23][39m
|
| 317 |
+
[titan] 2026-01-02 12:38:41,790 - root - INFO - [31mstep: 11 [32mloss: 12.5536 [33mmemory: 69.11GiB(87.20%) [34mtps: 1,572 [36mtflops: 143.86 [35mmfu: 46.11%[39m
|
| 318 |
+
[titan] 2026-01-02 12:38:41,790 - root - INFO - [34mlr: 4.6875e-06 gnorm: 82.00 [35m[ 0:14:30<28 days, 3:00:14][39m
|
| 319 |
+
[titan] 2026-01-02 12:39:23,448 - root - INFO - [31mstep: 12 [32mloss: 12.0247 [33mmemory: 69.11GiB(87.20%) [34mtps: 1,573 [36mtflops: 143.93 [35mmfu: 46.13%[39m
|
| 320 |
+
[titan] 2026-01-02 12:39:23,449 - root - INFO - [34mlr: 5.0781e-06 gnorm: 71.50 [35m[ 0:15:12<27 days, 0:20:44][39m
|
| 321 |
+
[titan] 2026-01-02 12:40:05,132 - root - INFO - [31mstep: 13 [32mloss: 11.6076 [33mmemory: 69.11GiB(87.20%) [34mtps: 1,572 [36mtflops: 143.85 [35mmfu: 46.11%[39m
|
| 322 |
+
[titan] 2026-01-02 12:40:05,132 - root - INFO - [34mlr: 5.4687e-06 gnorm: 68.50 [35m[ 0:15:53<26 days, 1:48:10][39m
|
| 323 |
+
[titan] 2026-01-02 12:40:46,797 - root - INFO - [31mstep: 14 [32mloss: 11.2488 [33mmemory: 69.11GiB(87.20%) [34mtps: 1,573 [36mtflops: 143.91 [35mmfu: 46.13%[39m
|
| 324 |
+
[titan] 2026-01-02 12:40:46,797 - root - INFO - [34mlr: 5.8594e-06 gnorm: 63.75 [35m[ 0:16:35<25 days, 6:28:04][39m
|
| 325 |
+
[titan] 2026-01-02 12:41:28,477 - root - INFO - [31mstep: 15 [32mloss: 10.9254 [33mmemory: 69.11GiB(87.20%) [34mtps: 1,572 [36mtflops: 143.86 [35mmfu: 46.11%[39m
|
| 326 |
+
[titan] 2026-01-02 12:41:28,477 - root - INFO - [34mlr: 6.2500e-06 gnorm: 55.50 [35m[ 0:17:17<24 days, 13:43:03][39m
|
| 327 |
+
[titan] 2026-01-02 12:42:10,134 - root - INFO - [31mstep: 16 [32mloss: 10.6961 [33mmemory: 69.11GiB(87.20%) [34mtps: 1,573 [36mtflops: 143.94 [35mmfu: 46.13%[39m
|
| 328 |
+
[titan] 2026-01-02 12:42:10,134 - root - INFO - [34mlr: 6.6406e-06 gnorm: 56.50 [35m[ 0:17:58<23 days, 23:02:51][39m
|
| 329 |
+
[titan] 2026-01-02 12:42:51,815 - root - INFO - [31mstep: 17 [32mloss: 10.3915 [33mmemory: 69.11GiB(87.20%) [34mtps: 1,572 [36mtflops: 143.86 [35mmfu: 46.11%[39m
|
| 330 |
+
[titan] 2026-01-02 12:42:51,815 - root - INFO - [34mlr: 7.0313e-06 gnorm: 42.75 [35m[ 0:18:40<23 days, 10:06:51][39m
|
| 331 |
+
[titan] 2026-01-02 12:43:33,461 - root - INFO - [31mstep: 18 [32mloss: 10.1740 [33mmemory: 69.11GiB(87.20%) [34mtps: 1,574 [36mtflops: 143.98 [35mmfu: 46.15%[39m
|
| 332 |
+
[titan] 2026-01-02 12:43:33,461 - root - INFO - [34mlr: 7.4219e-06 gnorm: 32.75 [35m[ 0:19:22<22 days, 22:35:58][39m
|
logs/none_rci5peh0/attempt_0/6/stdout.log
ADDED
|
File without changes
|
logs/none_rci5peh0/attempt_0/7/stderr.log
ADDED
|
@@ -0,0 +1,332 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[titan] 2026-01-02 12:21:12,073 - root - INFO - Starting job: default job
|
| 2 |
+
[titan] 2026-01-02 12:21:12,073 - root - INFO - [32m{
|
| 3 |
+
"activation_checkpoint": {
|
| 4 |
+
"mode": "none",
|
| 5 |
+
"selective_ac_option": "2"
|
| 6 |
+
},
|
| 7 |
+
"activation_offload": {
|
| 8 |
+
"mode": "none"
|
| 9 |
+
},
|
| 10 |
+
"checkpoint": {
|
| 11 |
+
"async_mode": "disabled",
|
| 12 |
+
"create_seed_checkpoint": false,
|
| 13 |
+
"enable_checkpoint": true,
|
| 14 |
+
"exclude_from_loading": [],
|
| 15 |
+
"export_dtype": "float32",
|
| 16 |
+
"folder": "checkpoint",
|
| 17 |
+
"initial_load_model_weights_only": true,
|
| 18 |
+
"initial_load_path": null,
|
| 19 |
+
"interval": 3072,
|
| 20 |
+
"interval_type": "steps",
|
| 21 |
+
"keep_latest_k": 0,
|
| 22 |
+
"last_save_model_weights_only": false,
|
| 23 |
+
"load_step": -1,
|
| 24 |
+
"model_weights_only": false
|
| 25 |
+
},
|
| 26 |
+
"comm": {
|
| 27 |
+
"init_timeout_seconds": 300,
|
| 28 |
+
"trace_buf_size": 20000,
|
| 29 |
+
"train_timeout_seconds": 100
|
| 30 |
+
},
|
| 31 |
+
"experimental": {
|
| 32 |
+
"context_parallel_degree": 1,
|
| 33 |
+
"context_parallel_rotate_method": "allgather",
|
| 34 |
+
"custom_model_path": "",
|
| 35 |
+
"enable_async_tensor_parallel": false,
|
| 36 |
+
"enable_compiled_autograd": false,
|
| 37 |
+
"pipeline_parallel_degree": 1,
|
| 38 |
+
"pipeline_parallel_microbatches": null,
|
| 39 |
+
"pipeline_parallel_schedule": "1F1B",
|
| 40 |
+
"pipeline_parallel_schedule_csv": "",
|
| 41 |
+
"pipeline_parallel_split_points": []
|
| 42 |
+
},
|
| 43 |
+
"fault_tolerance": {
|
| 44 |
+
"enable": false,
|
| 45 |
+
"group_size": 0,
|
| 46 |
+
"min_replica_size": 1,
|
| 47 |
+
"replica_id": 0
|
| 48 |
+
},
|
| 49 |
+
"float8": {
|
| 50 |
+
"enable_fsdp_float8_all_gather": false,
|
| 51 |
+
"force_recompute_fp8_weight_in_bwd": false,
|
| 52 |
+
"precompute_float8_dynamic_scale_for_fsdp": false,
|
| 53 |
+
"recipe_name": null
|
| 54 |
+
},
|
| 55 |
+
"job": {
|
| 56 |
+
"config_file": "flame/models/fla.toml",
|
| 57 |
+
"description": "default job",
|
| 58 |
+
"dump_folder": "/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B",
|
| 59 |
+
"print_args": true,
|
| 60 |
+
"use_for_integration_test": false
|
| 61 |
+
},
|
| 62 |
+
"lr_scheduler": {
|
| 63 |
+
"decay_ratio": null,
|
| 64 |
+
"decay_type": "cosine",
|
| 65 |
+
"lr_min": 0.1,
|
| 66 |
+
"warmup_steps": 1024
|
| 67 |
+
},
|
| 68 |
+
"memory_estimation": {
|
| 69 |
+
"disable_fake_mode": false,
|
| 70 |
+
"enabled": false
|
| 71 |
+
},
|
| 72 |
+
"metrics": {
|
| 73 |
+
"disable_color_printing": false,
|
| 74 |
+
"enable_tensorboard": false,
|
| 75 |
+
"enable_wandb": true,
|
| 76 |
+
"log_freq": 1,
|
| 77 |
+
"save_for_all_ranks": false,
|
| 78 |
+
"save_tb_folder": "tb"
|
| 79 |
+
},
|
| 80 |
+
"model": {
|
| 81 |
+
"config": "/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/config.json",
|
| 82 |
+
"converters": [],
|
| 83 |
+
"name": "fla",
|
| 84 |
+
"print_after_conversion": false,
|
| 85 |
+
"tokenizer_path": "/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B"
|
| 86 |
+
},
|
| 87 |
+
"optimizer": {
|
| 88 |
+
"beta1": 0.9,
|
| 89 |
+
"beta2": 0.95,
|
| 90 |
+
"early_step_in_backward": false,
|
| 91 |
+
"eps": 1e-15,
|
| 92 |
+
"implementation": "fused",
|
| 93 |
+
"lr": 0.0004,
|
| 94 |
+
"name": "AdamW",
|
| 95 |
+
"weight_decay": 0.1
|
| 96 |
+
},
|
| 97 |
+
"profiling": {
|
| 98 |
+
"enable_memory_snapshot": false,
|
| 99 |
+
"enable_profiling": true,
|
| 100 |
+
"profile_freq": 512,
|
| 101 |
+
"save_memory_snapshot_folder": "memory_snapshot",
|
| 102 |
+
"save_traces_folder": "profile_trace"
|
| 103 |
+
},
|
| 104 |
+
"training": {
|
| 105 |
+
"batch_size": 2,
|
| 106 |
+
"compile": true,
|
| 107 |
+
"context_len": 2048,
|
| 108 |
+
"data_dir": null,
|
| 109 |
+
"data_files": null,
|
| 110 |
+
"data_parallel_replicate_degree": 1,
|
| 111 |
+
"data_parallel_shard_degree": 8,
|
| 112 |
+
"data_probs": null,
|
| 113 |
+
"dataset": "/mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu",
|
| 114 |
+
"dataset_name": "default",
|
| 115 |
+
"dataset_split": "train",
|
| 116 |
+
"deterministic": false,
|
| 117 |
+
"disable_loss_parallel": true,
|
| 118 |
+
"enable_cpu_offload": false,
|
| 119 |
+
"fsdp_reshard_after_forward": "default",
|
| 120 |
+
"gc_freq": 50,
|
| 121 |
+
"gradient_accumulation_steps": 16,
|
| 122 |
+
"max_norm": 1.0,
|
| 123 |
+
"mixed_precision_param": "bfloat16",
|
| 124 |
+
"mixed_precision_reduce": "float32",
|
| 125 |
+
"num_workers": 8,
|
| 126 |
+
"persistent_workers": false,
|
| 127 |
+
"pin_memory": false,
|
| 128 |
+
"prefetch_factor": 2,
|
| 129 |
+
"seed": 42,
|
| 130 |
+
"seq_len": 2048,
|
| 131 |
+
"skip_nan_inf": true,
|
| 132 |
+
"steps": 30720,
|
| 133 |
+
"streaming": true,
|
| 134 |
+
"tensor_parallel_degree": 1,
|
| 135 |
+
"varlen": false
|
| 136 |
+
}
|
| 137 |
+
}[39m
|
| 138 |
+
[titan] 2026-01-02 12:21:12,074 - root - INFO - [GC] Initial GC collection. 0.00 seconds.
|
| 139 |
+
[titan] 2026-01-02 12:21:13,346 - root - WARNING - ENV[TORCH_NCCL_ASYNC_ERROR_HANDLING] = 1 will be overridden to 3 based on job config
|
| 140 |
+
[titan] 2026-01-02 12:21:13,353 - root - INFO - CUDA capacity: NVIDIA A100-SXM4-80GB with 79.25GiB memory
|
| 141 |
+
[titan] 2026-01-02 12:21:13,355 - root - WARNING - Error running lspci: [Errno 2] No such file or directory: 'lspci', fallback to use device_name
|
| 142 |
+
[titan] 2026-01-02 12:21:13,355 - root - INFO - Peak FLOPS used for computing MFU: 3.120e+14
|
| 143 |
+
[titan] 2026-01-02 12:21:13,355 - root - INFO - Building 1-D device mesh with ['dp_shard'], [8]
|
| 144 |
+
[titan] 2026-01-02 12:21:13,423 - root - INFO - Loading tokenizer...
|
| 145 |
+
The tokenizer you are loading from '/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.
|
| 146 |
+
[titan] 2026-01-02 12:21:13,766 - root - INFO - Qwen2TokenizerFast(name_or_path='/mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B', vocab_size=151643, model_max_length=10000000000, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'eos_token': '<|im_end|>', 'pad_token': '<|endoftext|>', 'additional_special_tokens': ['<|im_start|>', '<|im_end|>', '<|object_ref_start|>', '<|object_ref_end|>', '<|box_start|>', '<|box_end|>', '<|quad_start|>', '<|quad_end|>', '<|vision_start|>', '<|vision_end|>', '<|vision_pad|>', '<|image_pad|>', '<|video_pad|>']}, clean_up_tokenization_spaces=False, added_tokens_decoder={
|
| 147 |
+
151643: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 148 |
+
151644: AddedToken("<|im_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 149 |
+
151645: AddedToken("<|im_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 150 |
+
151646: AddedToken("<|object_ref_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 151 |
+
151647: AddedToken("<|object_ref_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 152 |
+
151648: AddedToken("<|box_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 153 |
+
151649: AddedToken("<|box_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 154 |
+
151650: AddedToken("<|quad_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 155 |
+
151651: AddedToken("<|quad_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 156 |
+
151652: AddedToken("<|vision_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 157 |
+
151653: AddedToken("<|vision_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 158 |
+
151654: AddedToken("<|vision_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 159 |
+
151655: AddedToken("<|image_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 160 |
+
151656: AddedToken("<|video_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 161 |
+
151657: AddedToken("<tool_call>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 162 |
+
151658: AddedToken("</tool_call>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 163 |
+
151659: AddedToken("<|fim_prefix|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 164 |
+
151660: AddedToken("<|fim_middle|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 165 |
+
151661: AddedToken("<|fim_suffix|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 166 |
+
151662: AddedToken("<|fim_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 167 |
+
151663: AddedToken("<|repo_name|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 168 |
+
151664: AddedToken("<|file_sep|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 169 |
+
151665: AddedToken("<tool_response>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 170 |
+
151666: AddedToken("</tool_response>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 171 |
+
151667: AddedToken("<think>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 172 |
+
151668: AddedToken("</think>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
|
| 173 |
+
}
|
| 174 |
+
)
|
| 175 |
+
[titan] 2026-01-02 12:21:13,766 - root - INFO - Loading dataset /mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu:default
|
| 176 |
+
`trust_remote_code` is not supported anymore.
|
| 177 |
+
Please check that the Hugging Face dataset '/mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu' isn't based on a loading script and remove `trust_remote_code`.
|
| 178 |
+
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
|
| 179 |
+
[titan] 2026-01-02 12:21:13,766 - datasets.load - ERROR - `trust_remote_code` is not supported anymore.
|
| 180 |
+
Please check that the Hugging Face dataset '/mnt/scratch/share/datasets/HuggingFaceFW___fineweb-edu' isn't based on a loading script and remove `trust_remote_code`.
|
| 181 |
+
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
|
| 182 |
+
[titan] 2026-01-02 12:21:14,324 - root - INFO - Shuffling the dataset with seed 42
|
| 183 |
+
[titan] 2026-01-02 12:21:14,326 - root - INFO - IterableDataset({
|
| 184 |
+
features: ['text', 'id', 'dump', 'url', 'file_path', 'language', 'language_score', 'token_count', 'score', 'int_score'],
|
| 185 |
+
num_shards: 360
|
| 186 |
+
})
|
| 187 |
+
[titan] 2026-01-02 12:21:14,326 - root - INFO - Building dataloader...
|
| 188 |
+
[titan] 2026-01-02 12:21:14,328 - root - INFO - Loading model config from /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/config.json
|
| 189 |
+
[titan] 2026-01-02 12:21:14,329 - root - INFO - Building model from the config
|
| 190 |
+
[32mGSAConfig {
|
| 191 |
+
"architectures": [
|
| 192 |
+
"GSAForCausalLM"
|
| 193 |
+
],
|
| 194 |
+
"attn": null,
|
| 195 |
+
"bos_token_id": 151643,
|
| 196 |
+
"clamp_max": null,
|
| 197 |
+
"clamp_min": null,
|
| 198 |
+
"conv_size": 4,
|
| 199 |
+
"dtype": "bfloat16",
|
| 200 |
+
"elementwise_affine": false,
|
| 201 |
+
"eos_token_id": 151645,
|
| 202 |
+
"expand_k": 1,
|
| 203 |
+
"expand_v": 1,
|
| 204 |
+
"feature_map": "swish",
|
| 205 |
+
"fuse_cross_entropy": true,
|
| 206 |
+
"fuse_linear_cross_entropy": false,
|
| 207 |
+
"fuse_norm": true,
|
| 208 |
+
"fuse_swiglu": true,
|
| 209 |
+
"gate_logit_normalizer": 8,
|
| 210 |
+
"hidden_act": "swish",
|
| 211 |
+
"hidden_ratio": 4,
|
| 212 |
+
"hidden_size": 5120,
|
| 213 |
+
"initializer_range": 0.02,
|
| 214 |
+
"intermediate_size": 17408,
|
| 215 |
+
"max_position_embeddings": 40960,
|
| 216 |
+
"model_type": "gsa",
|
| 217 |
+
"norm_eps": 1e-06,
|
| 218 |
+
"num_heads": 40,
|
| 219 |
+
"num_hidden_layers": 40,
|
| 220 |
+
"num_kv_heads": 8,
|
| 221 |
+
"num_slots": 256,
|
| 222 |
+
"rope_theta": 1000000,
|
| 223 |
+
"share_conv_kernel": true,
|
| 224 |
+
"tie_word_embeddings": true,
|
| 225 |
+
"transformers_version": "4.57.3",
|
| 226 |
+
"use_cache": true,
|
| 227 |
+
"use_l2warp": false,
|
| 228 |
+
"use_norm": true,
|
| 229 |
+
"use_output_gate": true,
|
| 230 |
+
"use_rope": false,
|
| 231 |
+
"use_short_conv": false,
|
| 232 |
+
"vocab_size": 151936
|
| 233 |
+
}
|
| 234 |
+
[39m
|
| 235 |
+
[titan] 2026-01-02 12:21:14,481 - root - INFO - [34m
|
| 236 |
+
GSAForCausalLM(
|
| 237 |
+
(model): GSAModel(
|
| 238 |
+
(embeddings): Embedding(151936, 5120)
|
| 239 |
+
(layers): ModuleList(
|
| 240 |
+
(0-39): 40 x GSABlock(
|
| 241 |
+
(attn_norm): RMSNorm(5120, eps=1e-06)
|
| 242 |
+
(attn): GatedSlotAttention(
|
| 243 |
+
(feature_map): SwishFeatureMap()
|
| 244 |
+
(q_proj): Linear(in_features=5120, out_features=5120, bias=False)
|
| 245 |
+
(k_proj): Linear(in_features=5120, out_features=1024, bias=False)
|
| 246 |
+
(v_proj): Linear(in_features=5120, out_features=1024, bias=False)
|
| 247 |
+
(f_proj): Linear(in_features=5120, out_features=2048, bias=False)
|
| 248 |
+
(g_norm): RMSNorm(5120, elementwise_affine=False, eps=1e-06)
|
| 249 |
+
(o_proj): Linear(in_features=5120, out_features=5120, bias=False)
|
| 250 |
+
)
|
| 251 |
+
(mlp_norm): RMSNorm(5120, eps=1e-06)
|
| 252 |
+
(mlp): GatedMLP(
|
| 253 |
+
(gate_proj): Linear(in_features=5120, out_features=17408, bias=False)
|
| 254 |
+
(up_proj): Linear(in_features=5120, out_features=17408, bias=False)
|
| 255 |
+
(down_proj): Linear(in_features=17408, out_features=5120, bias=False)
|
| 256 |
+
(swiglu_linear): SwiGLULinear()
|
| 257 |
+
)
|
| 258 |
+
)
|
| 259 |
+
)
|
| 260 |
+
(norm): RMSNorm(5120, eps=1e-06)
|
| 261 |
+
)
|
| 262 |
+
(lm_head): Linear(in_features=5120, out_features=151936, bias=False)
|
| 263 |
+
)[39m
|
| 264 |
+
|
| 265 |
+
[titan] 2026-01-02 12:21:14,539 - root - INFO - Compiling each block with torch.compile
|
| 266 |
+
[titan] 2026-01-02 12:21:14,539 - root - INFO - Compiling the embedding, norm, and lm_head layers with torch.compile
|
| 267 |
+
[titan] 2026-01-02 12:21:14,540 - root - INFO - Compiling the entire model with torch.compile
|
| 268 |
+
[titan] 2026-01-02 12:21:14,677 - root - INFO - Applied FSDP to the model
|
| 269 |
+
[titan] 2026-01-02 12:21:15,135 - root - INFO - CUDA memory usage for model: 3.56GiB(4.49%)
|
| 270 |
+
[titan] 2026-01-02 12:21:15,182 - root - INFO - Checkpointing active. Checkpoints will be loaded from and saved to /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/checkpoint
|
| 271 |
+
[titan] 2026-01-02 12:21:15,182 - root - INFO - Loading the checkpoint from /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/checkpoint/step-0.
|
| 272 |
+
[titan] 2026-01-02 12:24:11,329 - root - INFO - [GC] GC collection for checkpoint loading. 0.01 seconds.
|
| 273 |
+
[titan] 2026-01-02 12:24:11,330 - root - INFO - Finished loading the checkpoint in 176.15 seconds.
|
| 274 |
+
[titan] 2026-01-02 12:24:11,346 - root - INFO - CUDA capacity: NVIDIA A100-SXM4-80GB with 79.25GiB memory
|
| 275 |
+
[titan] 2026-01-02 12:24:11,348 - root - WARNING - Error running lspci: [Errno 2] No such file or directory: 'lspci', fallback to use device_name
|
| 276 |
+
[titan] 2026-01-02 12:24:11,536 - root - INFO - Mixed precision training is handled by fully_shard
|
| 277 |
+
[titan] 2026-01-02 12:24:11,537 - root - INFO - [31m***** Running training *****[39m
|
| 278 |
+
[titan] 2026-01-02 12:24:11,537 - root - INFO - [32m Training starts at step 1
|
| 279 |
+
[titan] 2026-01-02 12:24:11,537 - root - INFO - [32m Number of tokens per sequence = 2,048
|
| 280 |
+
[titan] 2026-01-02 12:24:11,537 - root - INFO - [32m Gradient Accumulation steps = 16
|
| 281 |
+
[titan] 2026-01-02 12:24:11,537 - root - INFO - [32m Instantaneous batch size (per device) = 2
|
| 282 |
+
[titan] 2026-01-02 12:24:11,537 - root - INFO - [32m Global batch size (w. parallel, distributed & accumulation) = 256 (524,288 tokens)
|
| 283 |
+
[titan] 2026-01-02 12:24:11,537 - root - INFO - [32m Total optimization steps = 30,720 (16,106,127,360 tokens)
|
| 284 |
+
[titan] 2026-01-02 12:24:11,537 - root - INFO - [32m Warmup steps = 1,024 (536,870,912 tokens)
|
| 285 |
+
[titan] 2026-01-02 12:24:11,537 - root - INFO - [32m Number of parameters = 14,409,815,040 [39m
|
| 286 |
+
[titan] 2026-01-02 12:24:11,537 - root - INFO - Profiling active. Traces will be saved at /mnt/scratch/home/carlos/leman-labs/checkpoints/GayQwen3-14B/profile_trace
|
| 287 |
+
/mnt/scratch/home/carlos/leman-labs/flame/.venv/lib/python3.13/site-packages/torch/_dynamo/variables/functions.py:1692: UserWarning: Dynamo detected a call to a `functools.lru_cache`-wrapped function. Dynamo ignores the cache wrapper and directly traces the wrapped function. Silent incorrectness is only a *potential* risk, not something we have observed. Enable TORCH_LOGS="+dynamo" for a DEBUG stack trace.
|
| 288 |
+
torch._dynamo.utils.warn_once(msg)
|
| 289 |
+
/mnt/scratch/home/carlos/leman-labs/flame/.venv/lib/python3.13/site-packages/torch/_dynamo/variables/functions.py:1598: UserWarning: Dynamo does not know how to trace the builtin `cuda_utils.get_device_properties.` This function is either a Python builtin (e.g. _warnings.warn) or a third-party C/C++ Python extension (perhaps created with pybind).
|
| 290 |
+
If it is a Python builtin, please file an issue on GitHub so the PyTorch team can add support for it and see the next case for a workaround.
|
| 291 |
+
If it is a third-party C/C++ Python extension, please either wrap it into a PyTorch-understood custom operator (see https://pytorch.org/tutorials/advanced/custom_ops_landing_page.html for more details) or, if it is traceable, use `torch.compiler.allow_in_graph`.
|
| 292 |
+
torch._dynamo.utils.warn_once(explanation + "\n" + "\n".join(hints))
|
| 293 |
+
[titan] 2026-01-02 12:31:16,002 - root - INFO - [31mstep: 1 [32mloss: 14.3857 [33mmemory: 65.22GiB(82.29%) [34mtps: 154 [36mtflops: 14.12 [35mmfu: 4.53%[39m
|
| 294 |
+
[titan] 2026-01-02 12:31:16,003 - root - INFO - [34mlr: 7.8125e-07 gnorm: 129.00 [35m[ 0:07:04<150 days, 23:35:39][39m
|
| 295 |
+
[titan] 2026-01-02 12:31:16,003 - root - INFO - Saving the checkpoint (or staging if async is enabled).
|
| 296 |
+
[titan] 2026-01-02 12:31:45,855 - root - INFO - [GC] GC collection invoked by checkpointer. 0.48 seconds.
|
| 297 |
+
[titan] 2026-01-02 12:31:45,855 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 29.85 seconds.
|
| 298 |
+
[titan] 2026-01-02 12:31:45,855 - root - INFO - Synchronizing and adjusting timeout for all ProcessGroups to 0:01:40
|
| 299 |
+
[titan] 2026-01-02 12:32:27,287 - root - INFO - [31mstep: 2 [32mloss: 14.3989 [33mmemory: 69.11GiB(87.20%) [34mtps: 919 [36mtflops: 84.11 [35mmfu: 26.96%[39m
|
| 300 |
+
[titan] 2026-01-02 12:32:27,288 - root - INFO - [34mlr: 1.1719e-06 gnorm: 127.00 [35m[ 0:08:15<88 days, 3:51:56][39m
|
| 301 |
+
[titan] 2026-01-02 12:33:08,758 - root - INFO - [31mstep: 3 [32mloss: 14.3929 [33mmemory: 69.11GiB(87.20%) [34mtps: 1,580 [36mtflops: 144.59 [35mmfu: 46.34%[39m
|
| 302 |
+
[titan] 2026-01-02 12:33:08,759 - root - INFO - [34mlr: 1.5625e-06 gnorm: 126.00 [35m[ 0:08:57<63 days, 16:28:48][39m
|
| 303 |
+
[titan] 2026-01-02 12:33:50,300 - root - INFO - [31mstep: 4 [32mloss: 14.2932 [33mmemory: 69.11GiB(87.20%) [34mtps: 1,578 [36mtflops: 144.34 [35mmfu: 46.26%[39m
|
| 304 |
+
[titan] 2026-01-02 12:33:50,300 - root - INFO - [34mlr: 1.9531e-06 gnorm: 128.00 [35m[ 0:09:38<51 days, 10:55:57][39m
|
| 305 |
+
[titan] 2026-01-02 12:34:31,898 - root - INFO - [31mstep: 5 [32mloss: 14.2689 [33mmemory: 69.11GiB(87.20%) [34mtps: 1,575 [36mtflops: 144.14 [35mmfu: 46.20%[39m
|
| 306 |
+
[titan] 2026-01-02 12:34:31,898 - root - INFO - [34mlr: 2.3438e-06 gnorm: 124.00 [35m[ 0:10:20<44 days, 2:53:42][39m
|
| 307 |
+
[titan] 2026-01-02 12:35:13,509 - root - INFO - [31mstep: 6 [32mloss: 13.9979 [33mmemory: 69.11GiB(87.20%) [34mtps: 1,575 [36mtflops: 144.10 [35mmfu: 46.19%[39m
|
| 308 |
+
[titan] 2026-01-02 12:35:13,509 - root - INFO - [34mlr: 2.7344e-06 gnorm: 117.00 [35m[ 0:11:02<39 days, 5:33:08][39m
|
| 309 |
+
[titan] 2026-01-02 12:35:55,155 - root - INFO - [31mstep: 7 [32mloss: 13.8167 [33mmemory: 69.11GiB(87.20%) [34mtps: 1,574 [36mtflops: 143.98 [35mmfu: 46.15%[39m
|
| 310 |
+
[titan] 2026-01-02 12:35:55,155 - root - INFO - [34mlr: 3.1250e-06 gnorm: 113.00 [35m[ 0:11:43<35 days, 17:46:29][39m
|
| 311 |
+
[titan] 2026-01-02 12:36:36,792 - root - INFO - [31mstep: 8 [32mloss: 13.5683 [33mmemory: 69.11GiB(87.20%) [34mtps: 1,574 [36mtflops: 144.01 [35mmfu: 46.16%[39m
|
| 312 |
+
[titan] 2026-01-02 12:36:36,792 - root - INFO - [34mlr: 3.5156e-06 gnorm: 106.50 [35m[ 0:12:25<33 days, 2:55:45][39m
|
| 313 |
+
[titan] 2026-01-02 12:37:18,455 - root - INFO - [31mstep: 9 [32mloss: 13.3760 [33mmemory: 69.11GiB(87.20%) [34mtps: 1,573 [36mtflops: 143.92 [35mmfu: 46.13%[39m
|
| 314 |
+
[titan] 2026-01-02 12:37:18,455 - root - INFO - [34mlr: 3.9063e-06 gnorm: 101.00 [35m[ 0:13:07<31 days, 2:04:18][39m
|
| 315 |
+
[titan] 2026-01-02 12:38:00,109 - root - INFO - [31mstep: 10 [32mloss: 13.1097 [33mmemory: 69.11GiB(87.20%) [34mtps: 1,573 [36mtflops: 143.95 [35mmfu: 46.14%[39m
|
| 316 |
+
[titan] 2026-01-02 12:38:00,109 - root - INFO - [34mlr: 4.2969e-06 gnorm: 94.50 [35m[ 0:13:48<29 days, 10:58:33][39m
|
| 317 |
+
[titan] 2026-01-02 12:38:41,790 - root - INFO - [31mstep: 11 [32mloss: 12.5536 [33mmemory: 69.11GiB(87.20%) [34mtps: 1,572 [36mtflops: 143.86 [35mmfu: 46.11%[39m
|
| 318 |
+
[titan] 2026-01-02 12:38:41,790 - root - INFO - [34mlr: 4.6875e-06 gnorm: 82.00 [35m[ 0:14:30<28 days, 3:00:22][39m
|
| 319 |
+
[titan] 2026-01-02 12:39:23,448 - root - INFO - [31mstep: 12 [32mloss: 12.0247 [33mmemory: 69.11GiB(87.20%) [34mtps: 1,573 [36mtflops: 143.93 [35mmfu: 46.13%[39m
|
| 320 |
+
[titan] 2026-01-02 12:39:23,448 - root - INFO - [34mlr: 5.0781e-06 gnorm: 71.50 [35m[ 0:15:12<27 days, 0:20:52][39m
|
| 321 |
+
[titan] 2026-01-02 12:40:05,132 - root - INFO - [31mstep: 13 [32mloss: 11.6076 [33mmemory: 69.11GiB(87.20%) [34mtps: 1,572 [36mtflops: 143.85 [35mmfu: 46.10%[39m
|
| 322 |
+
[titan] 2026-01-02 12:40:05,132 - root - INFO - [34mlr: 5.4687e-06 gnorm: 68.50 [35m[ 0:15:53<26 days, 1:48:18][39m
|
| 323 |
+
[titan] 2026-01-02 12:40:46,797 - root - INFO - [31mstep: 14 [32mloss: 11.2488 [33mmemory: 69.11GiB(87.20%) [34mtps: 1,573 [36mtflops: 143.91 [35mmfu: 46.13%[39m
|
| 324 |
+
[titan] 2026-01-02 12:40:46,797 - root - INFO - [34mlr: 5.8594e-06 gnorm: 63.75 [35m[ 0:16:35<25 days, 6:28:11][39m
|
| 325 |
+
[titan] 2026-01-02 12:41:28,477 - root - INFO - [31mstep: 15 [32mloss: 10.9254 [33mmemory: 69.11GiB(87.20%) [34mtps: 1,572 [36mtflops: 143.86 [35mmfu: 46.11%[39m
|
| 326 |
+
[titan] 2026-01-02 12:41:28,477 - root - INFO - [34mlr: 6.2500e-06 gnorm: 55.50 [35m[ 0:17:17<24 days, 13:43:10][39m
|
| 327 |
+
[titan] 2026-01-02 12:42:10,134 - root - INFO - [31mstep: 16 [32mloss: 10.6961 [33mmemory: 69.11GiB(87.20%) [34mtps: 1,573 [36mtflops: 143.94 [35mmfu: 46.13%[39m
|
| 328 |
+
[titan] 2026-01-02 12:42:10,134 - root - INFO - [34mlr: 6.6406e-06 gnorm: 56.50 [35m[ 0:17:58<23 days, 23:02:58][39m
|
| 329 |
+
[titan] 2026-01-02 12:42:51,815 - root - INFO - [31mstep: 17 [32mloss: 10.3915 [33mmemory: 69.11GiB(87.20%) [34mtps: 1,572 [36mtflops: 143.86 [35mmfu: 46.11%[39m
|
| 330 |
+
[titan] 2026-01-02 12:42:51,815 - root - INFO - [34mlr: 7.0313e-06 gnorm: 42.75 [35m[ 0:18:40<23 days, 10:06:57][39m
|
| 331 |
+
[titan] 2026-01-02 12:43:33,461 - root - INFO - [31mstep: 18 [32mloss: 10.1740 [33mmemory: 69.11GiB(87.20%) [34mtps: 1,574 [36mtflops: 143.98 [35mmfu: 46.15%[39m
|
| 332 |
+
[titan] 2026-01-02 12:43:33,461 - root - INFO - [34mlr: 7.4219e-06 gnorm: 32.75 [35m[ 0:19:22<22 days, 22:36:05][39m
|
logs/none_rci5peh0/attempt_0/7/stdout.log
ADDED
|
File without changes
|