WIP
Browse files- first_run_output/manual_launch.sh +11 -0
- first_run_output/mlperf.log +217 -0
- first_run_output/nemo_experiments/default/2025-08-17_22-50-13/nemo_log_globalrank-0_localrank-0.txt +0 -0
- first_run_output/nemo_experiments/default/nemo_log_globalrank-1_localrank-1.txt +0 -0
- first_run_output/nemo_experiments/default/nemo_log_globalrank-2_localrank-2.txt +0 -0
- first_run_output/nemo_experiments/default/nemo_log_globalrank-3_localrank-3.txt +0 -0
- first_run_output/nemo_experiments/default/nemo_log_globalrank-4_localrank-4.txt +0 -0
- first_run_output/nemo_experiments/default/nemo_log_globalrank-5_localrank-5.txt +0 -0
- first_run_output/nemo_experiments/default/nemo_log_globalrank-6_localrank-6.txt +0 -0
- first_run_output/nemo_experiments/default/nemo_log_globalrank-7_localrank-7.txt +0 -0
- first_run_output/outputs/2025-08-17/22-50-12/.hydra/config.yaml +67 -0
- first_run_output/outputs/2025-08-17/22-50-12/.hydra/hydra.yaml +155 -0
- first_run_output/outputs/2025-08-17/22-50-12/.hydra/overrides.yaml +1 -0
- first_run_output/outputs/2025-08-17/22-50-12/train.log +0 -0
- first_run_output/run_docker.sh +7 -0
first_run_output/manual_launch.sh
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
set -x
|
| 2 |
+
export LOGDIR=/mlperf_logdir # set the place where the output logs will be saved
|
| 3 |
+
|
| 4 |
+
export SLURM_JOBID=$(date +"%y-%m-%d__%H-%M-%S") # unique local ID for each run
|
| 5 |
+
export LOCAL_WORLD_SIZE=0 # non-slurm mode, set to 0 to use torchrun
|
| 6 |
+
|
| 7 |
+
# DO NOT SET DATA_ROOT & CKPT_ROOT
|
| 8 |
+
|
| 9 |
+
source config_DGXB200_1x8x1xtp1pp1cp1.sh
|
| 10 |
+
source run_and_time.sh
|
| 11 |
+
set +x
|
first_run_output/mlperf.log
ADDED
|
@@ -0,0 +1,217 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
W0817 22:49:50.668000 5653 torch/distributed/run.py:766]
|
| 2 |
+
W0817 22:49:50.668000 5653 torch/distributed/run.py:766] *****************************************
|
| 3 |
+
W0817 22:49:50.668000 5653 torch/distributed/run.py:766] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
|
| 4 |
+
W0817 22:49:50.668000 5653 torch/distributed/run.py:766] *****************************************
|
| 5 |
+
Could not find the bitsandbytes CUDA binary at PosixPath('/usr/local/lib/python3.12/dist-packages/bitsandbytes/libbitsandbytes_cuda129.so')
|
| 6 |
+
Could not find the bitsandbytes CUDA binary at PosixPath('/usr/local/lib/python3.12/dist-packages/bitsandbytes/libbitsandbytes_cuda129.so')
|
| 7 |
+
The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.
|
| 8 |
+
The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.
|
| 9 |
+
Could not find the bitsandbytes CUDA binary at PosixPath('/usr/local/lib/python3.12/dist-packages/bitsandbytes/libbitsandbytes_cuda129.so')
|
| 10 |
+
The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.
|
| 11 |
+
Could not find the bitsandbytes CUDA binary at PosixPath('/usr/local/lib/python3.12/dist-packages/bitsandbytes/libbitsandbytes_cuda129.so')
|
| 12 |
+
The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.
|
| 13 |
+
Could not find the bitsandbytes CUDA binary at PosixPath('/usr/local/lib/python3.12/dist-packages/bitsandbytes/libbitsandbytes_cuda129.so')
|
| 14 |
+
The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.
|
| 15 |
+
Could not find the bitsandbytes CUDA binary at PosixPath('/usr/local/lib/python3.12/dist-packages/bitsandbytes/libbitsandbytes_cuda129.so')
|
| 16 |
+
The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.
|
| 17 |
+
Could not find the bitsandbytes CUDA binary at PosixPath('/usr/local/lib/python3.12/dist-packages/bitsandbytes/libbitsandbytes_cuda129.so')
|
| 18 |
+
The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.
|
| 19 |
+
Could not find the bitsandbytes CUDA binary at PosixPath('/usr/local/lib/python3.12/dist-packages/bitsandbytes/libbitsandbytes_cuda129.so')
|
| 20 |
+
The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.
|
| 21 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471012103, "event_type": "INTERVAL_START", "key": "init_start", "value": null, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 327}}
|
| 22 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471012103, "event_type": "POINT_IN_TIME", "key": "submission_benchmark", "value": "llama2_70b_lora", "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 328}}
|
| 23 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471012103, "event_type": "POINT_IN_TIME", "key": "submission_org", "value": "SUBMISSION_ORG_PLACEHOLDER", "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 328}}
|
| 24 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471012103, "event_type": "POINT_IN_TIME", "key": "submission_division", "value": "closed", "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 328}}
|
| 25 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471012103, "event_type": "POINT_IN_TIME", "key": "submission_status", "value": "onprem", "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 328}}
|
| 26 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471012103, "event_type": "POINT_IN_TIME", "key": "submission_platform", "value": "1xSUBMISSION_PLATFORM_PLACEHOLDER", "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 328}}
|
| 27 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471012105, "event_type": "INTERVAL_START", "key": "init_start", "value": null, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 327}}
|
| 28 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471012105, "event_type": "INTERVAL_START", "key": "init_start", "value": null, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 327}}
|
| 29 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471012106, "event_type": "POINT_IN_TIME", "key": "submission_benchmark", "value": "llama2_70b_lora", "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 328}}
|
| 30 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471012106, "event_type": "POINT_IN_TIME", "key": "submission_org", "value": "SUBMISSION_ORG_PLACEHOLDER", "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 328}}
|
| 31 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471012106, "event_type": "POINT_IN_TIME", "key": "submission_benchmark", "value": "llama2_70b_lora", "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 328}}
|
| 32 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471012106, "event_type": "POINT_IN_TIME", "key": "submission_division", "value": "closed", "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 328}}
|
| 33 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471012106, "event_type": "POINT_IN_TIME", "key": "submission_org", "value": "SUBMISSION_ORG_PLACEHOLDER", "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 328}}
|
| 34 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471012106, "event_type": "POINT_IN_TIME", "key": "submission_status", "value": "onprem", "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 328}}
|
| 35 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471012106, "event_type": "POINT_IN_TIME", "key": "submission_division", "value": "closed", "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 328}}
|
| 36 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471012106, "event_type": "POINT_IN_TIME", "key": "submission_platform", "value": "1xSUBMISSION_PLATFORM_PLACEHOLDER", "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 328}}
|
| 37 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471012106, "event_type": "POINT_IN_TIME", "key": "submission_status", "value": "onprem", "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 328}}
|
| 38 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471012106, "event_type": "POINT_IN_TIME", "key": "submission_platform", "value": "1xSUBMISSION_PLATFORM_PLACEHOLDER", "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 328}}
|
| 39 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471012107, "event_type": "INTERVAL_START", "key": "init_start", "value": null, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 327}}
|
| 40 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471012108, "event_type": "POINT_IN_TIME", "key": "submission_benchmark", "value": "llama2_70b_lora", "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 328}}
|
| 41 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471012108, "event_type": "POINT_IN_TIME", "key": "submission_org", "value": "SUBMISSION_ORG_PLACEHOLDER", "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 328}}
|
| 42 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471012108, "event_type": "POINT_IN_TIME", "key": "submission_division", "value": "closed", "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 328}}
|
| 43 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471012108, "event_type": "POINT_IN_TIME", "key": "submission_status", "value": "onprem", "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 328}}
|
| 44 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471012108, "event_type": "POINT_IN_TIME", "key": "submission_platform", "value": "1xSUBMISSION_PLATFORM_PLACEHOLDER", "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 328}}
|
| 45 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471012109, "event_type": "INTERVAL_START", "key": "init_start", "value": null, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 327}}
|
| 46 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471012109, "event_type": "POINT_IN_TIME", "key": "submission_benchmark", "value": "llama2_70b_lora", "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 328}}
|
| 47 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471012109, "event_type": "POINT_IN_TIME", "key": "submission_org", "value": "SUBMISSION_ORG_PLACEHOLDER", "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 328}}
|
| 48 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471012109, "event_type": "POINT_IN_TIME", "key": "submission_division", "value": "closed", "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 328}}
|
| 49 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471012110, "event_type": "POINT_IN_TIME", "key": "submission_status", "value": "onprem", "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 328}}
|
| 50 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471012110, "event_type": "POINT_IN_TIME", "key": "submission_platform", "value": "1xSUBMISSION_PLATFORM_PLACEHOLDER", "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 328}}
|
| 51 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471012112, "event_type": "INTERVAL_START", "key": "init_start", "value": null, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 327}}
|
| 52 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471012112, "event_type": "INTERVAL_START", "key": "init_start", "value": null, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 327}}
|
| 53 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471012112, "event_type": "POINT_IN_TIME", "key": "submission_benchmark", "value": "llama2_70b_lora", "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 328}}
|
| 54 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471012112, "event_type": "POINT_IN_TIME", "key": "submission_benchmark", "value": "llama2_70b_lora", "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 328}}
|
| 55 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471012112, "event_type": "POINT_IN_TIME", "key": "submission_org", "value": "SUBMISSION_ORG_PLACEHOLDER", "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 328}}
|
| 56 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471012112, "event_type": "INTERVAL_START", "key": "init_start", "value": null, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 327}}
|
| 57 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471012112, "event_type": "POINT_IN_TIME", "key": "submission_org", "value": "SUBMISSION_ORG_PLACEHOLDER", "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 328}}
|
| 58 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471012112, "event_type": "POINT_IN_TIME", "key": "submission_division", "value": "closed", "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 328}}
|
| 59 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471012112, "event_type": "POINT_IN_TIME", "key": "submission_division", "value": "closed", "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 328}}
|
| 60 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471012112, "event_type": "POINT_IN_TIME", "key": "submission_status", "value": "onprem", "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 328}}
|
| 61 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471012112, "event_type": "POINT_IN_TIME", "key": "submission_benchmark", "value": "llama2_70b_lora", "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 328}}
|
| 62 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471012112, "event_type": "POINT_IN_TIME", "key": "submission_status", "value": "onprem", "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 328}}
|
| 63 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471012112, "event_type": "POINT_IN_TIME", "key": "submission_platform", "value": "1xSUBMISSION_PLATFORM_PLACEHOLDER", "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 328}}
|
| 64 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471012112, "event_type": "POINT_IN_TIME", "key": "submission_platform", "value": "1xSUBMISSION_PLATFORM_PLACEHOLDER", "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 328}}
|
| 65 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471012112, "event_type": "POINT_IN_TIME", "key": "submission_org", "value": "SUBMISSION_ORG_PLACEHOLDER", "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 328}}
|
| 66 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471012112, "event_type": "POINT_IN_TIME", "key": "submission_division", "value": "closed", "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 328}}
|
| 67 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471012112, "event_type": "POINT_IN_TIME", "key": "submission_status", "value": "onprem", "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 328}}
|
| 68 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471012112, "event_type": "POINT_IN_TIME", "key": "submission_platform", "value": "1xSUBMISSION_PLATFORM_PLACEHOLDER", "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 328}}
|
| 69 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471012644, "event_type": "POINT_IN_TIME", "key": "opt_base_learning_rate", "value": 0.00055, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 162}}
|
| 70 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471012644, "event_type": "POINT_IN_TIME", "key": "opt_adamw_weight_decay", "value": 0.0001, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 163}}
|
| 71 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471012644, "event_type": "POINT_IN_TIME", "key": "opt_gradient_clip_norm", "value": 0.3, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 164}}
|
| 72 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471012644, "event_type": "POINT_IN_TIME", "key": "opt_learning_rate_warmup_factor", "value": 0.0, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 189}}
|
| 73 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471012645, "event_type": "POINT_IN_TIME", "key": "lora_rank", "value": 16, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 198}}
|
| 74 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471012645, "event_type": "POINT_IN_TIME", "key": "lora_alpha", "value": 32, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 199}}
|
| 75 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471012649, "event_type": "POINT_IN_TIME", "key": "opt_learning_rate_training_steps", "value": 800, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 350}}
|
| 76 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471012666, "event_type": "POINT_IN_TIME", "key": "seed", "value": 1, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 393}}
|
| 77 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471012684, "event_type": "POINT_IN_TIME", "key": "opt_base_learning_rate", "value": 0.00055, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 162}}
|
| 78 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471012684, "event_type": "POINT_IN_TIME", "key": "opt_adamw_weight_decay", "value": 0.0001, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 163}}
|
| 79 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471012684, "event_type": "POINT_IN_TIME", "key": "opt_gradient_clip_norm", "value": 0.3, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 164}}
|
| 80 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471012685, "event_type": "POINT_IN_TIME", "key": "opt_learning_rate_warmup_factor", "value": 0.0, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 189}}
|
| 81 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471012686, "event_type": "POINT_IN_TIME", "key": "lora_rank", "value": 16, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 198}}
|
| 82 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471012686, "event_type": "POINT_IN_TIME", "key": "lora_alpha", "value": 32, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 199}}
|
| 83 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471012689, "event_type": "POINT_IN_TIME", "key": "opt_learning_rate_training_steps", "value": 800, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 350}}
|
| 84 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471012690, "event_type": "POINT_IN_TIME", "key": "opt_base_learning_rate", "value": 0.00055, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 162}}
|
| 85 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471012690, "event_type": "POINT_IN_TIME", "key": "opt_adamw_weight_decay", "value": 0.0001, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 163}}
|
| 86 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471012690, "event_type": "POINT_IN_TIME", "key": "opt_gradient_clip_norm", "value": 0.3, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 164}}
|
| 87 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471012691, "event_type": "POINT_IN_TIME", "key": "opt_learning_rate_warmup_factor", "value": 0.0, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 189}}
|
| 88 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471012692, "event_type": "POINT_IN_TIME", "key": "lora_rank", "value": 16, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 198}}
|
| 89 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471012692, "event_type": "POINT_IN_TIME", "key": "lora_alpha", "value": 32, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 199}}
|
| 90 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471012692, "event_type": "POINT_IN_TIME", "key": "opt_base_learning_rate", "value": 0.00055, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 162}}
|
| 91 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471012692, "event_type": "POINT_IN_TIME", "key": "opt_adamw_weight_decay", "value": 0.0001, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 163}}
|
| 92 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471012692, "event_type": "POINT_IN_TIME", "key": "opt_gradient_clip_norm", "value": 0.3, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 164}}
|
| 93 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471012692, "event_type": "POINT_IN_TIME", "key": "opt_base_learning_rate", "value": 0.00055, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 162}}
|
| 94 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471012692, "event_type": "POINT_IN_TIME", "key": "opt_adamw_weight_decay", "value": 0.0001, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 163}}
|
| 95 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471012692, "event_type": "POINT_IN_TIME", "key": "opt_learning_rate_warmup_factor", "value": 0.0, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 189}}
|
| 96 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471012693, "event_type": "POINT_IN_TIME", "key": "opt_gradient_clip_norm", "value": 0.3, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 164}}
|
| 97 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471012693, "event_type": "POINT_IN_TIME", "key": "opt_learning_rate_warmup_factor", "value": 0.0, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 189}}
|
| 98 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471012693, "event_type": "POINT_IN_TIME", "key": "lora_rank", "value": 16, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 198}}
|
| 99 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471012694, "event_type": "POINT_IN_TIME", "key": "lora_alpha", "value": 32, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 199}}
|
| 100 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471012694, "event_type": "POINT_IN_TIME", "key": "lora_rank", "value": 16, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 198}}
|
| 101 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471012694, "event_type": "POINT_IN_TIME", "key": "lora_alpha", "value": 32, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 199}}
|
| 102 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471012695, "event_type": "POINT_IN_TIME", "key": "opt_learning_rate_training_steps", "value": 800, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 350}}
|
| 103 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471012697, "event_type": "POINT_IN_TIME", "key": "opt_learning_rate_training_steps", "value": 800, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 350}}
|
| 104 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471012697, "event_type": "POINT_IN_TIME", "key": "opt_learning_rate_training_steps", "value": 800, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 350}}
|
| 105 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471012699, "event_type": "POINT_IN_TIME", "key": "opt_base_learning_rate", "value": 0.00055, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 162}}
|
| 106 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471012699, "event_type": "POINT_IN_TIME", "key": "opt_adamw_weight_decay", "value": 0.0001, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 163}}
|
| 107 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471012699, "event_type": "POINT_IN_TIME", "key": "opt_gradient_clip_norm", "value": 0.3, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 164}}
|
| 108 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471012700, "event_type": "POINT_IN_TIME", "key": "opt_learning_rate_warmup_factor", "value": 0.0, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 189}}
|
| 109 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471012701, "event_type": "POINT_IN_TIME", "key": "lora_rank", "value": 16, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 198}}
|
| 110 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471012701, "event_type": "POINT_IN_TIME", "key": "lora_alpha", "value": 32, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 199}}
|
| 111 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471012703, "event_type": "POINT_IN_TIME", "key": "seed", "value": 1, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 393}}
|
| 112 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471012704, "event_type": "POINT_IN_TIME", "key": "opt_learning_rate_training_steps", "value": 800, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 350}}
|
| 113 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471012712, "event_type": "POINT_IN_TIME", "key": "seed", "value": 1, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 393}}
|
| 114 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471012712, "event_type": "POINT_IN_TIME", "key": "opt_base_learning_rate", "value": 0.00055, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 162}}
|
| 115 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471012712, "event_type": "POINT_IN_TIME", "key": "opt_adamw_weight_decay", "value": 0.0001, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 163}}
|
| 116 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471012712, "event_type": "POINT_IN_TIME", "key": "opt_gradient_clip_norm", "value": 0.3, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 164}}
|
| 117 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471012712, "event_type": "POINT_IN_TIME", "key": "seed", "value": 1, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 393}}
|
| 118 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471012712, "event_type": "POINT_IN_TIME", "key": "opt_learning_rate_warmup_factor", "value": 0.0, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 189}}
|
| 119 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471012713, "event_type": "POINT_IN_TIME", "key": "lora_rank", "value": 16, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 198}}
|
| 120 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471012714, "event_type": "POINT_IN_TIME", "key": "lora_alpha", "value": 32, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 199}}
|
| 121 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471012718, "event_type": "POINT_IN_TIME", "key": "seed", "value": 1, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 393}}
|
| 122 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471012719, "event_type": "POINT_IN_TIME", "key": "opt_learning_rate_training_steps", "value": 800, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 350}}
|
| 123 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471012726, "event_type": "POINT_IN_TIME", "key": "opt_base_learning_rate", "value": 0.00055, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 162}}
|
| 124 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471012726, "event_type": "POINT_IN_TIME", "key": "opt_adamw_weight_decay", "value": 0.0001, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 163}}
|
| 125 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471012726, "event_type": "POINT_IN_TIME", "key": "opt_gradient_clip_norm", "value": 0.3, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 164}}
|
| 126 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471012727, "event_type": "POINT_IN_TIME", "key": "opt_learning_rate_warmup_factor", "value": 0.0, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 189}}
|
| 127 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471012728, "event_type": "POINT_IN_TIME", "key": "lora_rank", "value": 16, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 198}}
|
| 128 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471012728, "event_type": "POINT_IN_TIME", "key": "lora_alpha", "value": 32, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 199}}
|
| 129 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471012731, "event_type": "POINT_IN_TIME", "key": "opt_learning_rate_training_steps", "value": 800, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 350}}
|
| 130 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471012734, "event_type": "POINT_IN_TIME", "key": "seed", "value": 1, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 393}}
|
| 131 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471012745, "event_type": "POINT_IN_TIME", "key": "seed", "value": 1, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 393}}
|
| 132 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471013232, "event_type": "POINT_IN_TIME", "key": "global_batch_size", "value": 8, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 328}}
|
| 133 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471013270, "event_type": "POINT_IN_TIME", "key": "train_samples", "value": 3901, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 328}}
|
| 134 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471013270, "event_type": "POINT_IN_TIME", "key": "eval_samples", "value": 173, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 328}}
|
| 135 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471013271, "event_type": "POINT_IN_TIME", "key": "gradient_accumulation_steps", "value": 1, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 328}}
|
| 136 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471013272, "event_type": "POINT_IN_TIME", "key": "seed", "value": 1, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 393}}
|
| 137 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471019434, "event_type": "POINT_IN_TIME", "key": "tracked_stats", "value": {"before_model_init": 6.738934484000083}, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 168, "step": 0}}
|
| 138 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471019889, "event_type": "POINT_IN_TIME", "key": "tracked_stats", "value": {"after_model_init": 0.4552682889998323}, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 168, "step": 0}}
|
| 139 |
+
Loading distributed checkpoint with TensorStoreLoadShardedStrategy
|
| 140 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471283851, "event_type": "POINT_IN_TIME", "key": "tracked_stats", "value": {"warmup_time": 263.961781042}, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 168, "step": 0}}
|
| 141 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471283851, "event_type": "POINT_IN_TIME", "key": "tracked_stats", "value": {"init_finished": 0.00043348400004106225}, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 168, "step": 0}}
|
| 142 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471283852, "event_type": "INTERVAL_END", "key": "init_stop", "value": null, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 83}}
|
| 143 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471283853, "event_type": "INTERVAL_START", "key": "run_start", "value": null, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 83}}
|
| 144 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471283854, "event_type": "INTERVAL_START", "key": "block_start", "value": null, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 199, "samples_count": 0}}
|
| 145 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471298842, "event_type": "POINT_IN_TIME", "key": "train_loss", "value": 2.231356143951416, "metadata": {"file": "/workspace/ft-llm/custom_callbacks.py", "lineno": 71, "samples_count": 80, "lr": 0.0005497879849661988}}
|
| 146 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471314118, "event_type": "POINT_IN_TIME", "key": "train_loss", "value": 1.523491382598877, "metadata": {"file": "/workspace/ft-llm/custom_callbacks.py", "lineno": 71, "samples_count": 160, "lr": 0.0005491522667766103}}
|
| 147 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471329437, "event_type": "POINT_IN_TIME", "key": "train_loss", "value": 1.3142896890640259, "metadata": {"file": "/workspace/ft-llm/custom_callbacks.py", "lineno": 71, "samples_count": 240, "lr": 0.0005480938256626048}}
|
| 148 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471344748, "event_type": "POINT_IN_TIME", "key": "train_loss", "value": 1.3286864757537842, "metadata": {"file": "/workspace/ft-llm/custom_callbacks.py", "lineno": 71, "samples_count": 320, "lr": 0.0005466142936636629}}
|
| 149 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471360056, "event_type": "POINT_IN_TIME", "key": "train_loss", "value": 1.3191031217575073, "metadata": {"file": "/workspace/ft-llm/custom_callbacks.py", "lineno": 71, "samples_count": 400, "lr": 0.0005447159521108884}}
|
| 150 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471375386, "event_type": "POINT_IN_TIME", "key": "train_loss", "value": 1.3337409496307373, "metadata": {"file": "/workspace/ft-llm/custom_callbacks.py", "lineno": 71, "samples_count": 480, "lr": 0.0005424017281093611}}
|
| 151 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471390691, "event_type": "POINT_IN_TIME", "key": "train_loss", "value": 1.3527849912643433, "metadata": {"file": "/workspace/ft-llm/custom_callbacks.py", "lineno": 71, "samples_count": 560, "lr": 0.000539675190024753}}
|
| 152 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471405987, "event_type": "POINT_IN_TIME", "key": "train_loss", "value": 1.3663201332092285, "metadata": {"file": "/workspace/ft-llm/custom_callbacks.py", "lineno": 71, "samples_count": 640, "lr": 0.0005365405419811673}}
|
| 153 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471421321, "event_type": "POINT_IN_TIME", "key": "train_loss", "value": 1.436583161354065, "metadata": {"file": "/workspace/ft-llm/custom_callbacks.py", "lineno": 71, "samples_count": 720, "lr": 0.0005330026173786832}}
|
| 154 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471436648, "event_type": "POINT_IN_TIME", "key": "train_loss", "value": 1.2180163860321045, "metadata": {"file": "/workspace/ft-llm/custom_callbacks.py", "lineno": 71, "samples_count": 800, "lr": 0.0005290668714406038}}
|
| 155 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471451982, "event_type": "POINT_IN_TIME", "key": "train_loss", "value": 1.3267669677734375, "metadata": {"file": "/workspace/ft-llm/custom_callbacks.py", "lineno": 71, "samples_count": 880, "lr": 0.0005247393728018974}}
|
| 156 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471467318, "event_type": "POINT_IN_TIME", "key": "train_loss", "value": 1.3912932872772217, "metadata": {"file": "/workspace/ft-llm/custom_callbacks.py", "lineno": 71, "samples_count": 960, "lr": 0.0005200267941518012}}
|
| 157 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471482641, "event_type": "POINT_IN_TIME", "key": "train_loss", "value": 1.320623517036438, "metadata": {"file": "/workspace/ft-llm/custom_callbacks.py", "lineno": 71, "samples_count": 1040, "lr": 0.0005149364019450193}}
|
| 158 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471497959, "event_type": "POINT_IN_TIME", "key": "train_loss", "value": 1.2857550382614136, "metadata": {"file": "/workspace/ft-llm/custom_callbacks.py", "lineno": 71, "samples_count": 1120, "lr": 0.0005094760451973754}}
|
| 159 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471513310, "event_type": "POINT_IN_TIME", "key": "train_loss", "value": 1.356960415840149, "metadata": {"file": "/workspace/ft-llm/custom_callbacks.py", "lineno": 71, "samples_count": 1200, "lr": 0.0005036541433832}}
|
| 160 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471528663, "event_type": "POINT_IN_TIME", "key": "train_loss", "value": 1.325950026512146, "metadata": {"file": "/workspace/ft-llm/custom_callbacks.py", "lineno": 71, "samples_count": 1280, "lr": 0.0004974796734531106}}
|
| 161 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471544018, "event_type": "POINT_IN_TIME", "key": "train_loss", "value": 1.3832132816314697, "metadata": {"file": "/workspace/ft-llm/custom_callbacks.py", "lineno": 71, "samples_count": 1360, "lr": 0.0004909621559922049}}
|
| 162 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471559347, "event_type": "POINT_IN_TIME", "key": "train_loss", "value": 1.2853686809539795, "metadata": {"file": "/workspace/ft-llm/custom_callbacks.py", "lineno": 71, "samples_count": 1440, "lr": 0.0004841116405400086}}
|
| 163 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471574682, "event_type": "POINT_IN_TIME", "key": "train_loss", "value": 1.3469711542129517, "metadata": {"file": "/workspace/ft-llm/custom_callbacks.py", "lineno": 71, "samples_count": 1520, "lr": 0.00047693869009481353}}
|
| 164 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471578084, "event_type": "POINT_IN_TIME", "key": "tracked_stats", "value": {"throughput": 5.220374784959472, "train_step_time": 1.5324570226354173, "max_memory_usage": 168.827}, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 225, "step": 0}}
|
| 165 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471578084, "event_type": "INTERVAL_END", "key": "block_stop", "value": null, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 208, "samples_count": 0}}
|
| 166 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471578084, "event_type": "INTERVAL_START", "key": "eval_start", "value": null, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 130, "samples_count": 1536}}
|
| 167 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471592941, "event_type": "POINT_IN_TIME", "key": "eval_accuracy", "value": 0.9386916849654534, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 303, "samples_count": 1536}}
|
| 168 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471592942, "event_type": "POINT_IN_TIME", "key": "tracked_stats", "value": {"validation_throughput": 11.84517688090803}, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 245, "step": 1536}}
|
| 169 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471592942, "event_type": "INTERVAL_END", "key": "eval_stop", "value": null, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 153, "samples_count": 1536}}
|
| 170 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471592942, "event_type": "INTERVAL_START", "key": "block_start", "value": null, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 199, "samples_count": 1536}}
|
| 171 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471605223, "event_type": "POINT_IN_TIME", "key": "train_loss", "value": 1.2639729976654053, "metadata": {"file": "/workspace/ft-llm/custom_callbacks.py", "lineno": 71, "samples_count": 1600, "lr": 0.0004694543648263006}}
|
| 172 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471620533, "event_type": "POINT_IN_TIME", "key": "train_loss", "value": 1.359060525894165, "metadata": {"file": "/workspace/ft-llm/custom_callbacks.py", "lineno": 71, "samples_count": 1680, "lr": 0.00046167020502155905}}
|
| 173 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471635863, "event_type": "POINT_IN_TIME", "key": "train_loss", "value": 1.3872255086898804, "metadata": {"file": "/workspace/ft-llm/custom_callbacks.py", "lineno": 71, "samples_count": 1760, "lr": 0.00045359821329080054}}
|
| 174 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471651197, "event_type": "POINT_IN_TIME", "key": "train_loss", "value": 1.2579522132873535, "metadata": {"file": "/workspace/ft-llm/custom_callbacks.py", "lineno": 71, "samples_count": 1840, "lr": 0.00044525083606020437}}
|
| 175 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471666501, "event_type": "POINT_IN_TIME", "key": "train_loss", "value": 1.2951648235321045, "metadata": {"file": "/workspace/ft-llm/custom_callbacks.py", "lineno": 71, "samples_count": 1920, "lr": 0.0004366409443804302}}
|
| 176 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471666520, "event_type": "POINT_IN_TIME", "key": "tracked_stats", "value": {"throughput": 5.218929434605367, "train_step_time": 1.5328814271666669, "max_memory_usage": 168.827}, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 225, "step": 1536}}
|
| 177 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471666520, "event_type": "INTERVAL_END", "key": "block_stop", "value": null, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 208, "samples_count": 1536}}
|
| 178 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471666520, "event_type": "INTERVAL_START", "key": "eval_start", "value": null, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 130, "samples_count": 1920}}
|
| 179 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471681322, "event_type": "POINT_IN_TIME", "key": "eval_accuracy", "value": 0.9341272784106304, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 303, "samples_count": 1920}}
|
| 180 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471681322, "event_type": "POINT_IN_TIME", "key": "tracked_stats", "value": {"validation_throughput": 11.889916819247087}, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 245, "step": 1920}}
|
| 181 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471681322, "event_type": "INTERVAL_END", "key": "eval_stop", "value": null, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 153, "samples_count": 1920}}
|
| 182 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471681323, "event_type": "INTERVAL_START", "key": "block_start", "value": null, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 199, "samples_count": 1920}}
|
| 183 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471696647, "event_type": "POINT_IN_TIME", "key": "train_loss", "value": 1.311585783958435, "metadata": {"file": "/workspace/ft-llm/custom_callbacks.py", "lineno": 71, "samples_count": 2000, "lr": 0.0004277818140803907}}
|
| 184 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471711976, "event_type": "POINT_IN_TIME", "key": "train_loss", "value": 1.3409117460250854, "metadata": {"file": "/workspace/ft-llm/custom_callbacks.py", "lineno": 71, "samples_count": 2080, "lr": 0.00041868710529688595}}
|
| 185 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471727298, "event_type": "POINT_IN_TIME", "key": "train_loss", "value": 1.3523324728012085, "metadata": {"file": "/workspace/ft-llm/custom_callbacks.py", "lineno": 71, "samples_count": 2160, "lr": 0.00040937084141166267}}
|
| 186 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471742649, "event_type": "POINT_IN_TIME", "key": "train_loss", "value": 1.2437388896942139, "metadata": {"file": "/workspace/ft-llm/custom_callbacks.py", "lineno": 71, "samples_count": 2240, "lr": 0.0003998473874283754}}
|
| 187 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471754935, "event_type": "POINT_IN_TIME", "key": "tracked_stats", "value": {"throughput": 5.21650721148526, "train_step_time": 1.5335932024375012, "max_memory_usage": 168.827}, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 225, "step": 1920}}
|
| 188 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471754935, "event_type": "INTERVAL_END", "key": "block_stop", "value": null, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 208, "samples_count": 1920}}
|
| 189 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471754935, "event_type": "INTERVAL_START", "key": "eval_start", "value": null, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 130, "samples_count": 2304}}
|
| 190 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471769543, "event_type": "POINT_IN_TIME", "key": "eval_accuracy", "value": 0.9300852869287392, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 303, "samples_count": 2304}}
|
| 191 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471769543, "event_type": "POINT_IN_TIME", "key": "tracked_stats", "value": {"validation_throughput": 12.04804375654437}, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 245, "step": 2304}}
|
| 192 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471769543, "event_type": "INTERVAL_END", "key": "eval_stop", "value": null, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 153, "samples_count": 2304}}
|
| 193 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471769543, "event_type": "INTERVAL_START", "key": "block_start", "value": null, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 199, "samples_count": 2304}}
|
| 194 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471772623, "event_type": "POINT_IN_TIME", "key": "train_loss", "value": 1.3073469400405884, "metadata": {"file": "/workspace/ft-llm/custom_callbacks.py", "lineno": 71, "samples_count": 2320, "lr": 0.00039013142782279276}}
|
| 195 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471787980, "event_type": "POINT_IN_TIME", "key": "train_loss", "value": 1.260759711265564, "metadata": {"file": "/workspace/ft-llm/custom_callbacks.py", "lineno": 71, "samples_count": 2400, "lr": 0.00038023794390039975}}
|
| 196 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471803328, "event_type": "POINT_IN_TIME", "key": "train_loss", "value": 1.2937121391296387, "metadata": {"file": "/workspace/ft-llm/custom_callbacks.py", "lineno": 71, "samples_count": 2480, "lr": 0.00037018219069631056}}
|
| 197 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471818672, "event_type": "POINT_IN_TIME", "key": "train_loss", "value": 1.3011728525161743, "metadata": {"file": "/workspace/ft-llm/custom_callbacks.py", "lineno": 71, "samples_count": 2560, "lr": 0.00035997967345311057}}
|
| 198 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471834026, "event_type": "POINT_IN_TIME", "key": "train_loss", "value": 1.3377065658569336, "metadata": {"file": "/workspace/ft-llm/custom_callbacks.py", "lineno": 71, "samples_count": 2640, "lr": 0.00034964612371289557}}
|
| 199 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471843261, "event_type": "POINT_IN_TIME", "key": "tracked_stats", "value": {"throughput": 5.2090510322980235, "train_step_time": 1.5357883711250035, "max_memory_usage": 168.827}, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 225, "step": 2304}}
|
| 200 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471843262, "event_type": "INTERVAL_END", "key": "block_stop", "value": null, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 208, "samples_count": 2304}}
|
| 201 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471843262, "event_type": "INTERVAL_START", "key": "eval_start", "value": null, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 130, "samples_count": 2688}}
|
| 202 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471857902, "event_type": "POINT_IN_TIME", "key": "eval_accuracy", "value": 0.9281238710260116, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 303, "samples_count": 2688}}
|
| 203 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471857903, "event_type": "POINT_IN_TIME", "key": "tracked_stats", "value": {"validation_throughput": 12.02044647758139}, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 245, "step": 2688}}
|
| 204 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471857903, "event_type": "INTERVAL_END", "key": "eval_stop", "value": null, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 153, "samples_count": 2688}}
|
| 205 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471857903, "event_type": "INTERVAL_START", "key": "block_start", "value": null, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 199, "samples_count": 2688}}
|
| 206 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471864050, "event_type": "POINT_IN_TIME", "key": "train_loss", "value": 1.3667250871658325, "metadata": {"file": "/workspace/ft-llm/custom_callbacks.py", "lineno": 71, "samples_count": 2720, "lr": 0.000339197475060374}}
|
| 207 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471879394, "event_type": "POINT_IN_TIME", "key": "train_loss", "value": 1.3398393392562866, "metadata": {"file": "/workspace/ft-llm/custom_callbacks.py", "lineno": 71, "samples_count": 2800, "lr": 0.00032864983855443534}}
|
| 208 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471894731, "event_type": "POINT_IN_TIME", "key": "train_loss", "value": 1.2728362083435059, "metadata": {"file": "/workspace/ft-llm/custom_callbacks.py", "lineno": 71, "samples_count": 2880, "lr": 0.0003180194778860635}}
|
| 209 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471910087, "event_type": "POINT_IN_TIME", "key": "train_loss", "value": 1.2875945568084717, "metadata": {"file": "/workspace/ft-llm/custom_callbacks.py", "lineno": 71, "samples_count": 2960, "lr": 0.0003073227843009054}}
|
| 210 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471925418, "event_type": "POINT_IN_TIME", "key": "train_loss", "value": 1.3862403631210327, "metadata": {"file": "/workspace/ft-llm/custom_callbacks.py", "lineno": 71, "samples_count": 3040, "lr": 0.0002965762513251574}}
|
| 211 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471931575, "event_type": "POINT_IN_TIME", "key": "tracked_stats", "value": {"throughput": 5.212281417349078, "train_step_time": 1.5348365445833376, "max_memory_usage": 168.827}, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 225, "step": 2688}}
|
| 212 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471931575, "event_type": "INTERVAL_END", "key": "block_stop", "value": null, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 208, "samples_count": 2688}}
|
| 213 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471931575, "event_type": "INTERVAL_START", "key": "eval_start", "value": null, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 130, "samples_count": 3072}}
|
| 214 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471946223, "event_type": "POINT_IN_TIME", "key": "eval_accuracy", "value": 0.9221087328960441, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 303, "samples_count": 3072}}
|
| 215 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471946223, "event_type": "POINT_IN_TIME", "key": "tracked_stats", "value": {"validation_throughput": 12.014723364925022}, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 245, "step": 3072}}
|
| 216 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471946224, "event_type": "INTERVAL_END", "key": "eval_stop", "value": null, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 153, "samples_count": 3072}}
|
| 217 |
+
:::MLLOG {"namespace": "", "time_ms": 1755471946230, "event_type": "INTERVAL_END", "key": "run_stop", "value": null, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 106, "samples_count": 3072, "status": "success"}}
|
first_run_output/nemo_experiments/default/2025-08-17_22-50-13/nemo_log_globalrank-0_localrank-0.txt
ADDED
|
File without changes
|
first_run_output/nemo_experiments/default/nemo_log_globalrank-1_localrank-1.txt
ADDED
|
File without changes
|
first_run_output/nemo_experiments/default/nemo_log_globalrank-2_localrank-2.txt
ADDED
|
File without changes
|
first_run_output/nemo_experiments/default/nemo_log_globalrank-3_localrank-3.txt
ADDED
|
File without changes
|
first_run_output/nemo_experiments/default/nemo_log_globalrank-4_localrank-4.txt
ADDED
|
File without changes
|
first_run_output/nemo_experiments/default/nemo_log_globalrank-5_localrank-5.txt
ADDED
|
File without changes
|
first_run_output/nemo_experiments/default/nemo_log_globalrank-6_localrank-6.txt
ADDED
|
File without changes
|
first_run_output/nemo_experiments/default/nemo_log_globalrank-7_localrank-7.txt
ADDED
|
File without changes
|
first_run_output/outputs/2025-08-17/22-50-12/.hydra/config.yaml
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
skip_evals: ${oc.decode:${oc.env:SKIP_EVAL,${floor:${add:${multiply:0.125,${model.global_batch_size}},2}}}}
|
| 2 |
+
load_ckpt: ${oc.decode:${oc.env:LOAD_CKPT,False}}
|
| 3 |
+
data_root: ${oc.decode:${oc.env:DATA_ROOT,/data}}
|
| 4 |
+
ckpt_root: ${oc.decode:${oc.env:CKPT_ROOT,/ckpt}}
|
| 5 |
+
trainer:
|
| 6 |
+
devices: ${oc.decode:${oc.env:DGXNGPU,8}}
|
| 7 |
+
num_nodes: ${oc.decode:${oc.env:DGXNNODES,1}}
|
| 8 |
+
max_steps: ${oc.decode:${oc.env:MAX_STEPS,null}}
|
| 9 |
+
val_check_interval: ${floor_div:${multiply:${add:${skip_evals},1},${oc.decode:${oc.env:VAL_CHECK_INTERVAL,384}}},${model.global_batch_size}}
|
| 10 |
+
limit_val_batches: ${oc.decode:${oc.env:LIMIT_VAL_BATCHES,1.0}}
|
| 11 |
+
model:
|
| 12 |
+
num_layers: ${oc.decode:${oc.env:OVERWRITTEN_NUM_LAYERS,80}}
|
| 13 |
+
seed: ${oc.decode:${oc.env:SEED,1}}
|
| 14 |
+
tensor_model_parallel_size: ${oc.decode:${oc.env:TP,1}}
|
| 15 |
+
pipeline_model_parallel_size: ${oc.decode:${oc.env:PP,1}}
|
| 16 |
+
context_parallel_size: ${oc.decode:${oc.env:CP,1}}
|
| 17 |
+
eval_cp: ${oc.decode:${oc.env:CP_EVAL,null}}
|
| 18 |
+
global_batch_size: ${floor_div:${multiply:${oc.decode:${oc.env:MINIBS,1}},${floor_div:${multiply:${trainer.devices},${trainer.num_nodes}},${multiply:${model.tensor_model_parallel_size},${model.pipeline_model_parallel_size}}}},${oc.decode:${oc.env:CP,1}}}
|
| 19 |
+
micro_batch_size: ${oc.decode:${oc.env:MBS,1}}
|
| 20 |
+
val_micro_batch_size: ${oc.decode:${oc.env:VAL_MBS,null}}
|
| 21 |
+
val_global_batch_size: ${floor_div:${multiply:${oc.decode:${oc.env:VAL_MBS,1}},${floor_div:${multiply:${trainer.devices},${trainer.num_nodes}},${multiply:${model.tensor_model_parallel_size},${model.pipeline_model_parallel_size}}}},${oc.decode:${oc.env:CP_EVAL,${oc.env:CP,1}}}}
|
| 22 |
+
max_position_embeddings: ${oc.decode:${oc.env:MAX_SEQLEN,8192}}
|
| 23 |
+
encoder_seq_length: ${oc.decode:${oc.env:MAX_SEQLEN,8192}}
|
| 24 |
+
sequence_parallel: ${oc.decode:${oc.env:SP,False}}
|
| 25 |
+
ub_tp_comm_overlap: ${oc.decode:${oc.env:TP_COMM_OVERLAP,False}}
|
| 26 |
+
fp8: ${oc.decode:${oc.env:FP8,True}}
|
| 27 |
+
fp8_params: ${oc.decode:${oc.env:FP8,True}}
|
| 28 |
+
fp8_hybrid: ${oc.decode:${oc.env:FP8_HYBRID,True}}
|
| 29 |
+
fp8_amax_history_len: ${oc.decode:${oc.env:FP8_AMAX_HISTORY,128}}
|
| 30 |
+
fp8_amax_compute_algo: ${oc.env:FP8_AMAX_ALGO,most_recent}
|
| 31 |
+
reduce_amax: ${oc.decode:${oc.env:FP8_REDUCE_AMAX,False}}
|
| 32 |
+
fp8_e4m3: ${oc.decode:${oc.env:FP8_E4M3,False}}
|
| 33 |
+
fp8_interval: ${oc.decode:${oc.env:FP8_INTERVAL,1}}
|
| 34 |
+
fp8_margin: ${oc.decode:${oc.env:FP8_MARGIN,0}}
|
| 35 |
+
fp8_dot_product_attention: ${oc.decode:${oc.env:FP8_DPA,0}}
|
| 36 |
+
cp_comm_type: ${oc.decode:${oc.env:CP_COMM_TYPE,'a2a'}}
|
| 37 |
+
activation_func_fp8_input_store: ${oc.decode:${oc.env:FP8_ACT,0}}
|
| 38 |
+
external_cuda_graph: ${oc.decode:${oc.env:LAYER_CUDA_GRAPH,False}}
|
| 39 |
+
enable_cuda_graph: ${oc.decode:${oc.env:MCORE_CUDA_GRAPH,False}}
|
| 40 |
+
use_te_rng_tracker: ${oc.decode:${oc.env:USE_TE_RNG_TRACKER,True}}
|
| 41 |
+
enable_cg_fp8_weight_caching: ${oc.decode:${oc.env:CG_WEIGHT_CACHING,True}}
|
| 42 |
+
cpu_offloading: ${oc.decode:${oc.env:CPU_OFFLOADING,False}}
|
| 43 |
+
cpu_offloading_num_layers: ${oc.decode:${oc.env:CPU_OFFLOADING_NUM_LAYERS,20}}
|
| 44 |
+
cpu_offloading_activations: true
|
| 45 |
+
cpu_offloading_weights: false
|
| 46 |
+
memory_profile:
|
| 47 |
+
enabled: ${oc.decode:${oc.env:MEMORY_PROFILE,False}}
|
| 48 |
+
start_step: 1
|
| 49 |
+
end_step: 4
|
| 50 |
+
rank: 0
|
| 51 |
+
output_path: /results/
|
| 52 |
+
custom:
|
| 53 |
+
warmup: ${oc.decode:${oc.env:WARMUP,False}}
|
| 54 |
+
warmup_train_steps: ${oc.decode:${oc.env:WARMUP_TRAIN_STEPS,5}}
|
| 55 |
+
warmup_validation_steps: ${oc.decode:${oc.env:WARMUP_VALIDATION_STEPS,5}}
|
| 56 |
+
reset_fp8_stats_after_warmup: ${oc.decode:${oc.env:RESET_FP8_STATS_AFTER_WARMUP,1}}
|
| 57 |
+
optim:
|
| 58 |
+
lr: ${oc.decode:${oc.env:LR,0.0004}}
|
| 59 |
+
use_distributed_optimizer: ${oc.decode:${oc.env:USE_DISTRIBUTED_OPTIMIZER,True}}
|
| 60 |
+
overlap_param_gather_with_optimizer_step: ${oc.decode:${oc.env:OVERLAP_PARAM_GATHER_WITH_OPTIMIZER_STEP,False}}
|
| 61 |
+
sched:
|
| 62 |
+
warmup_steps: ${oc.decode:${oc.env:WARMUP_STEPS,0}}
|
| 63 |
+
ddp:
|
| 64 |
+
overlap_grad_reduce: ${oc.decode:${oc.env:DDP_OVERLAP_GRAD_REDUCE,False}}
|
| 65 |
+
overlap_param_gather: ${oc.decode:${oc.env:DDP_OVERLAP_PARAM_GATHER,False}}
|
| 66 |
+
fp8_param_gather: ${oc.decode:${oc.env:DDP_FP8_PARAM_GATHER,False}}
|
| 67 |
+
average_in_collective: ${oc.decode:${oc.env:DDP_AVERAGE_IN_COLLECTIVE,False}}
|
first_run_output/outputs/2025-08-17/22-50-12/.hydra/hydra.yaml
ADDED
|
@@ -0,0 +1,155 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
hydra:
|
| 2 |
+
run:
|
| 3 |
+
dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S}
|
| 4 |
+
sweep:
|
| 5 |
+
dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S}
|
| 6 |
+
subdir: ${hydra.job.num}
|
| 7 |
+
launcher:
|
| 8 |
+
_target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
|
| 9 |
+
sweeper:
|
| 10 |
+
_target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
|
| 11 |
+
max_batch_size: null
|
| 12 |
+
params: null
|
| 13 |
+
help:
|
| 14 |
+
app_name: ${hydra.job.name}
|
| 15 |
+
header: '${hydra.help.app_name} is powered by Hydra.
|
| 16 |
+
|
| 17 |
+
'
|
| 18 |
+
footer: 'Powered by Hydra (https://hydra.cc)
|
| 19 |
+
|
| 20 |
+
Use --hydra-help to view Hydra specific help
|
| 21 |
+
|
| 22 |
+
'
|
| 23 |
+
template: '${hydra.help.header}
|
| 24 |
+
|
| 25 |
+
== Configuration groups ==
|
| 26 |
+
|
| 27 |
+
Compose your configuration from those groups (group=option)
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
$APP_CONFIG_GROUPS
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
== Config ==
|
| 34 |
+
|
| 35 |
+
Override anything in the config (foo.bar=value)
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
$CONFIG
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
${hydra.help.footer}
|
| 42 |
+
|
| 43 |
+
'
|
| 44 |
+
hydra_help:
|
| 45 |
+
template: 'Hydra (${hydra.runtime.version})
|
| 46 |
+
|
| 47 |
+
See https://hydra.cc for more info.
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
== Flags ==
|
| 51 |
+
|
| 52 |
+
$FLAGS_HELP
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
== Configuration groups ==
|
| 56 |
+
|
| 57 |
+
Compose your configuration from those groups (For example, append hydra/job_logging=disabled
|
| 58 |
+
to command line)
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
$HYDRA_CONFIG_GROUPS
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
Use ''--cfg hydra'' to Show the Hydra config.
|
| 65 |
+
|
| 66 |
+
'
|
| 67 |
+
hydra_help: ???
|
| 68 |
+
hydra_logging:
|
| 69 |
+
version: 1
|
| 70 |
+
formatters:
|
| 71 |
+
simple:
|
| 72 |
+
format: '[%(asctime)s][HYDRA] %(message)s'
|
| 73 |
+
handlers:
|
| 74 |
+
console:
|
| 75 |
+
class: logging.StreamHandler
|
| 76 |
+
formatter: simple
|
| 77 |
+
stream: ext://sys.stdout
|
| 78 |
+
root:
|
| 79 |
+
level: INFO
|
| 80 |
+
handlers:
|
| 81 |
+
- console
|
| 82 |
+
loggers:
|
| 83 |
+
logging_example:
|
| 84 |
+
level: DEBUG
|
| 85 |
+
disable_existing_loggers: false
|
| 86 |
+
job_logging:
|
| 87 |
+
version: 1
|
| 88 |
+
formatters:
|
| 89 |
+
simple:
|
| 90 |
+
format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
|
| 91 |
+
handlers:
|
| 92 |
+
console:
|
| 93 |
+
class: logging.StreamHandler
|
| 94 |
+
formatter: simple
|
| 95 |
+
stream: ext://sys.stdout
|
| 96 |
+
file:
|
| 97 |
+
class: logging.FileHandler
|
| 98 |
+
formatter: simple
|
| 99 |
+
filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log
|
| 100 |
+
root:
|
| 101 |
+
level: INFO
|
| 102 |
+
handlers:
|
| 103 |
+
- console
|
| 104 |
+
- file
|
| 105 |
+
disable_existing_loggers: false
|
| 106 |
+
env: {}
|
| 107 |
+
mode: RUN
|
| 108 |
+
searchpath: []
|
| 109 |
+
callbacks: {}
|
| 110 |
+
output_subdir: .hydra
|
| 111 |
+
overrides:
|
| 112 |
+
hydra:
|
| 113 |
+
- hydra.mode=RUN
|
| 114 |
+
task: []
|
| 115 |
+
job:
|
| 116 |
+
name: train
|
| 117 |
+
chdir: null
|
| 118 |
+
override_dirname: ''
|
| 119 |
+
id: ???
|
| 120 |
+
num: ???
|
| 121 |
+
config_name: megatron_gpt_peft_tuning_config
|
| 122 |
+
env_set: {}
|
| 123 |
+
env_copy: []
|
| 124 |
+
config:
|
| 125 |
+
override_dirname:
|
| 126 |
+
kv_sep: '='
|
| 127 |
+
item_sep: ','
|
| 128 |
+
exclude_keys: []
|
| 129 |
+
runtime:
|
| 130 |
+
version: 1.3.2
|
| 131 |
+
version_base: '1.3'
|
| 132 |
+
cwd: /workspace/ft-llm
|
| 133 |
+
config_sources:
|
| 134 |
+
- path: hydra.conf
|
| 135 |
+
schema: pkg
|
| 136 |
+
provider: hydra
|
| 137 |
+
- path: /workspace/ft-llm/conf
|
| 138 |
+
schema: file
|
| 139 |
+
provider: main
|
| 140 |
+
- path: ''
|
| 141 |
+
schema: structured
|
| 142 |
+
provider: schema
|
| 143 |
+
output_dir: /workspace/ft-llm/outputs/2025-08-17/22-50-12
|
| 144 |
+
choices:
|
| 145 |
+
tp_overlap@model.ub_tp_comm_overlap_cfg: b100tp1mbs1
|
| 146 |
+
hydra/env: default
|
| 147 |
+
hydra/callbacks: null
|
| 148 |
+
hydra/job_logging: default
|
| 149 |
+
hydra/hydra_logging: default
|
| 150 |
+
hydra/hydra_help: default
|
| 151 |
+
hydra/help: default
|
| 152 |
+
hydra/sweeper: basic
|
| 153 |
+
hydra/launcher: basic
|
| 154 |
+
hydra/output: default
|
| 155 |
+
verbose: false
|
first_run_output/outputs/2025-08-17/22-50-12/.hydra/overrides.yaml
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
[]
|
first_run_output/outputs/2025-08-17/22-50-12/train.log
ADDED
|
File without changes
|
first_run_output/run_docker.sh
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
docker run -d --gpus all -it --rm --network=host --ipc=host \
|
| 2 |
+
-v /home/shadeform/work/dev/mlperf/data:/data \
|
| 3 |
+
-v /home/shadeform/work/dev/mlperf/model:/ckpt \
|
| 4 |
+
-v /home/shadeform/work/dev/mlperf:/myworkspace \
|
| 5 |
+
--shm-size=16g \
|
| 6 |
+
--ulimit memlock=-1 --ulimit stack=67108864 \
|
| 7 |
+
vuiseng9/mlperf-nvidia:llama2_70b_lora-pyt_v5.0
|