WIP

Browse files

Files changed (15) hide show

first_run_output/manual_launch.sh +11 -0
first_run_output/mlperf.log +217 -0
first_run_output/nemo_experiments/default/2025-08-17_22-50-13/nemo_log_globalrank-0_localrank-0.txt +0 -0
first_run_output/nemo_experiments/default/nemo_log_globalrank-1_localrank-1.txt +0 -0
first_run_output/nemo_experiments/default/nemo_log_globalrank-2_localrank-2.txt +0 -0
first_run_output/nemo_experiments/default/nemo_log_globalrank-3_localrank-3.txt +0 -0
first_run_output/nemo_experiments/default/nemo_log_globalrank-4_localrank-4.txt +0 -0
first_run_output/nemo_experiments/default/nemo_log_globalrank-5_localrank-5.txt +0 -0
first_run_output/nemo_experiments/default/nemo_log_globalrank-6_localrank-6.txt +0 -0
first_run_output/nemo_experiments/default/nemo_log_globalrank-7_localrank-7.txt +0 -0
first_run_output/outputs/2025-08-17/22-50-12/.hydra/config.yaml +67 -0
first_run_output/outputs/2025-08-17/22-50-12/.hydra/hydra.yaml +155 -0
first_run_output/outputs/2025-08-17/22-50-12/.hydra/overrides.yaml +1 -0
first_run_output/outputs/2025-08-17/22-50-12/train.log +0 -0
first_run_output/run_docker.sh +7 -0

first_run_output/manual_launch.sh ADDED Viewed

	@@ -0,0 +1,11 @@

+set -x  # echo each command as it is executed
+export LOGDIR=/mlperf_logdir  # directory where the output logs will be saved
+export SLURM_JOBID=$(date +"%y-%m-%d__%H-%M-%S")  # timestamp used as a unique local ID for each run (non-Slurm launch)
+export LOCAL_WORLD_SIZE=0 # non-Slurm mode: set to 0 to use torchrun
+# DO NOT SET DATA_ROOT & CKPT_ROOT
+source config_DGXB200_1x8x1xtp1pp1cp1.sh  # presumably sets the run configuration; confirm against that file
+source run_and_time.sh  # presumably starts the run itself; confirm against that file
+set +x  # stop echoing commands

first_run_output/mlperf.log ADDED Viewed

	@@ -0,0 +1,217 @@

+W0817 22:49:50.668000 5653 torch/distributed/run.py:766]
+W0817 22:49:50.668000 5653 torch/distributed/run.py:766] *****************************************
+W0817 22:49:50.668000 5653 torch/distributed/run.py:766] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
+W0817 22:49:50.668000 5653 torch/distributed/run.py:766] *****************************************
+Could not find the bitsandbytes CUDA binary at PosixPath('/usr/local/lib/python3.12/dist-packages/bitsandbytes/libbitsandbytes_cuda129.so')
+Could not find the bitsandbytes CUDA binary at PosixPath('/usr/local/lib/python3.12/dist-packages/bitsandbytes/libbitsandbytes_cuda129.so')
+The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.
+The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.
+Could not find the bitsandbytes CUDA binary at PosixPath('/usr/local/lib/python3.12/dist-packages/bitsandbytes/libbitsandbytes_cuda129.so')
+The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.
+Could not find the bitsandbytes CUDA binary at PosixPath('/usr/local/lib/python3.12/dist-packages/bitsandbytes/libbitsandbytes_cuda129.so')
+The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.
+Could not find the bitsandbytes CUDA binary at PosixPath('/usr/local/lib/python3.12/dist-packages/bitsandbytes/libbitsandbytes_cuda129.so')
+The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.
+Could not find the bitsandbytes CUDA binary at PosixPath('/usr/local/lib/python3.12/dist-packages/bitsandbytes/libbitsandbytes_cuda129.so')
+The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.
+Could not find the bitsandbytes CUDA binary at PosixPath('/usr/local/lib/python3.12/dist-packages/bitsandbytes/libbitsandbytes_cuda129.so')
+The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.
+Could not find the bitsandbytes CUDA binary at PosixPath('/usr/local/lib/python3.12/dist-packages/bitsandbytes/libbitsandbytes_cuda129.so')
+The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.
+:::MLLOG {"namespace": "", "time_ms": 1755471012103, "event_type": "INTERVAL_START", "key": "init_start", "value": null, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 327}}
+:::MLLOG {"namespace": "", "time_ms": 1755471012103, "event_type": "POINT_IN_TIME", "key": "submission_benchmark", "value": "llama2_70b_lora", "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 328}}
+:::MLLOG {"namespace": "", "time_ms": 1755471012103, "event_type": "POINT_IN_TIME", "key": "submission_org", "value": "SUBMISSION_ORG_PLACEHOLDER", "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 328}}
+:::MLLOG {"namespace": "", "time_ms": 1755471012103, "event_type": "POINT_IN_TIME", "key": "submission_division", "value": "closed", "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 328}}
+:::MLLOG {"namespace": "", "time_ms": 1755471012103, "event_type": "POINT_IN_TIME", "key": "submission_status", "value": "onprem", "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 328}}
+:::MLLOG {"namespace": "", "time_ms": 1755471012103, "event_type": "POINT_IN_TIME", "key": "submission_platform", "value": "1xSUBMISSION_PLATFORM_PLACEHOLDER", "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 328}}
+:::MLLOG {"namespace": "", "time_ms": 1755471012105, "event_type": "INTERVAL_START", "key": "init_start", "value": null, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 327}}
+:::MLLOG {"namespace": "", "time_ms": 1755471012105, "event_type": "INTERVAL_START", "key": "init_start", "value": null, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 327}}
+:::MLLOG {"namespace": "", "time_ms": 1755471012106, "event_type": "POINT_IN_TIME", "key": "submission_benchmark", "value": "llama2_70b_lora", "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 328}}
+:::MLLOG {"namespace": "", "time_ms": 1755471012106, "event_type": "POINT_IN_TIME", "key": "submission_org", "value": "SUBMISSION_ORG_PLACEHOLDER", "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 328}}
+:::MLLOG {"namespace": "", "time_ms": 1755471012106, "event_type": "POINT_IN_TIME", "key": "submission_benchmark", "value": "llama2_70b_lora", "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 328}}
+:::MLLOG {"namespace": "", "time_ms": 1755471012106, "event_type": "POINT_IN_TIME", "key": "submission_division", "value": "closed", "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 328}}
+:::MLLOG {"namespace": "", "time_ms": 1755471012106, "event_type": "POINT_IN_TIME", "key": "submission_org", "value": "SUBMISSION_ORG_PLACEHOLDER", "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 328}}
+:::MLLOG {"namespace": "", "time_ms": 1755471012106, "event_type": "POINT_IN_TIME", "key": "submission_status", "value": "onprem", "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 328}}
+:::MLLOG {"namespace": "", "time_ms": 1755471012106, "event_type": "POINT_IN_TIME", "key": "submission_division", "value": "closed", "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 328}}
+:::MLLOG {"namespace": "", "time_ms": 1755471012106, "event_type": "POINT_IN_TIME", "key": "submission_platform", "value": "1xSUBMISSION_PLATFORM_PLACEHOLDER", "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 328}}
+:::MLLOG {"namespace": "", "time_ms": 1755471012106, "event_type": "POINT_IN_TIME", "key": "submission_status", "value": "onprem", "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 328}}
+:::MLLOG {"namespace": "", "time_ms": 1755471012106, "event_type": "POINT_IN_TIME", "key": "submission_platform", "value": "1xSUBMISSION_PLATFORM_PLACEHOLDER", "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 328}}
+:::MLLOG {"namespace": "", "time_ms": 1755471012107, "event_type": "INTERVAL_START", "key": "init_start", "value": null, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 327}}
+:::MLLOG {"namespace": "", "time_ms": 1755471012108, "event_type": "POINT_IN_TIME", "key": "submission_benchmark", "value": "llama2_70b_lora", "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 328}}
+:::MLLOG {"namespace": "", "time_ms": 1755471012108, "event_type": "POINT_IN_TIME", "key": "submission_org", "value": "SUBMISSION_ORG_PLACEHOLDER", "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 328}}
+:::MLLOG {"namespace": "", "time_ms": 1755471012108, "event_type": "POINT_IN_TIME", "key": "submission_division", "value": "closed", "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 328}}
+:::MLLOG {"namespace": "", "time_ms": 1755471012108, "event_type": "POINT_IN_TIME", "key": "submission_status", "value": "onprem", "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 328}}
+:::MLLOG {"namespace": "", "time_ms": 1755471012108, "event_type": "POINT_IN_TIME", "key": "submission_platform", "value": "1xSUBMISSION_PLATFORM_PLACEHOLDER", "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 328}}
+:::MLLOG {"namespace": "", "time_ms": 1755471012109, "event_type": "INTERVAL_START", "key": "init_start", "value": null, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 327}}
+:::MLLOG {"namespace": "", "time_ms": 1755471012109, "event_type": "POINT_IN_TIME", "key": "submission_benchmark", "value": "llama2_70b_lora", "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 328}}
+:::MLLOG {"namespace": "", "time_ms": 1755471012109, "event_type": "POINT_IN_TIME", "key": "submission_org", "value": "SUBMISSION_ORG_PLACEHOLDER", "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 328}}
+:::MLLOG {"namespace": "", "time_ms": 1755471012109, "event_type": "POINT_IN_TIME", "key": "submission_division", "value": "closed", "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 328}}
+:::MLLOG {"namespace": "", "time_ms": 1755471012110, "event_type": "POINT_IN_TIME", "key": "submission_status", "value": "onprem", "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 328}}
+:::MLLOG {"namespace": "", "time_ms": 1755471012110, "event_type": "POINT_IN_TIME", "key": "submission_platform", "value": "1xSUBMISSION_PLATFORM_PLACEHOLDER", "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 328}}
+:::MLLOG {"namespace": "", "time_ms": 1755471012112, "event_type": "INTERVAL_START", "key": "init_start", "value": null, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 327}}
+:::MLLOG {"namespace": "", "time_ms": 1755471012112, "event_type": "INTERVAL_START", "key": "init_start", "value": null, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 327}}
+:::MLLOG {"namespace": "", "time_ms": 1755471012112, "event_type": "POINT_IN_TIME", "key": "submission_benchmark", "value": "llama2_70b_lora", "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 328}}
+:::MLLOG {"namespace": "", "time_ms": 1755471012112, "event_type": "POINT_IN_TIME", "key": "submission_benchmark", "value": "llama2_70b_lora", "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 328}}
+:::MLLOG {"namespace": "", "time_ms": 1755471012112, "event_type": "POINT_IN_TIME", "key": "submission_org", "value": "SUBMISSION_ORG_PLACEHOLDER", "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 328}}
+:::MLLOG {"namespace": "", "time_ms": 1755471012112, "event_type": "INTERVAL_START", "key": "init_start", "value": null, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 327}}
+:::MLLOG {"namespace": "", "time_ms": 1755471012112, "event_type": "POINT_IN_TIME", "key": "submission_org", "value": "SUBMISSION_ORG_PLACEHOLDER", "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 328}}
+:::MLLOG {"namespace": "", "time_ms": 1755471012112, "event_type": "POINT_IN_TIME", "key": "submission_division", "value": "closed", "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 328}}
+:::MLLOG {"namespace": "", "time_ms": 1755471012112, "event_type": "POINT_IN_TIME", "key": "submission_division", "value": "closed", "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 328}}
+:::MLLOG {"namespace": "", "time_ms": 1755471012112, "event_type": "POINT_IN_TIME", "key": "submission_status", "value": "onprem", "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 328}}
+:::MLLOG {"namespace": "", "time_ms": 1755471012112, "event_type": "POINT_IN_TIME", "key": "submission_benchmark", "value": "llama2_70b_lora", "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 328}}
+:::MLLOG {"namespace": "", "time_ms": 1755471012112, "event_type": "POINT_IN_TIME", "key": "submission_status", "value": "onprem", "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 328}}
+:::MLLOG {"namespace": "", "time_ms": 1755471012112, "event_type": "POINT_IN_TIME", "key": "submission_platform", "value": "1xSUBMISSION_PLATFORM_PLACEHOLDER", "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 328}}
+:::MLLOG {"namespace": "", "time_ms": 1755471012112, "event_type": "POINT_IN_TIME", "key": "submission_platform", "value": "1xSUBMISSION_PLATFORM_PLACEHOLDER", "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 328}}
+:::MLLOG {"namespace": "", "time_ms": 1755471012112, "event_type": "POINT_IN_TIME", "key": "submission_org", "value": "SUBMISSION_ORG_PLACEHOLDER", "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 328}}
+:::MLLOG {"namespace": "", "time_ms": 1755471012112, "event_type": "POINT_IN_TIME", "key": "submission_division", "value": "closed", "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 328}}
+:::MLLOG {"namespace": "", "time_ms": 1755471012112, "event_type": "POINT_IN_TIME", "key": "submission_status", "value": "onprem", "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 328}}
+:::MLLOG {"namespace": "", "time_ms": 1755471012112, "event_type": "POINT_IN_TIME", "key": "submission_platform", "value": "1xSUBMISSION_PLATFORM_PLACEHOLDER", "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 328}}
+:::MLLOG {"namespace": "", "time_ms": 1755471012644, "event_type": "POINT_IN_TIME", "key": "opt_base_learning_rate", "value": 0.00055, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 162}}
+:::MLLOG {"namespace": "", "time_ms": 1755471012644, "event_type": "POINT_IN_TIME", "key": "opt_adamw_weight_decay", "value": 0.0001, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 163}}
+:::MLLOG {"namespace": "", "time_ms": 1755471012644, "event_type": "POINT_IN_TIME", "key": "opt_gradient_clip_norm", "value": 0.3, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 164}}
+:::MLLOG {"namespace": "", "time_ms": 1755471012644, "event_type": "POINT_IN_TIME", "key": "opt_learning_rate_warmup_factor", "value": 0.0, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 189}}
+:::MLLOG {"namespace": "", "time_ms": 1755471012645, "event_type": "POINT_IN_TIME", "key": "lora_rank", "value": 16, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 198}}
+:::MLLOG {"namespace": "", "time_ms": 1755471012645, "event_type": "POINT_IN_TIME", "key": "lora_alpha", "value": 32, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 199}}
+:::MLLOG {"namespace": "", "time_ms": 1755471012649, "event_type": "POINT_IN_TIME", "key": "opt_learning_rate_training_steps", "value": 800, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 350}}
+:::MLLOG {"namespace": "", "time_ms": 1755471012666, "event_type": "POINT_IN_TIME", "key": "seed", "value": 1, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 393}}
+:::MLLOG {"namespace": "", "time_ms": 1755471012684, "event_type": "POINT_IN_TIME", "key": "opt_base_learning_rate", "value": 0.00055, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 162}}
+:::MLLOG {"namespace": "", "time_ms": 1755471012684, "event_type": "POINT_IN_TIME", "key": "opt_adamw_weight_decay", "value": 0.0001, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 163}}
+:::MLLOG {"namespace": "", "time_ms": 1755471012684, "event_type": "POINT_IN_TIME", "key": "opt_gradient_clip_norm", "value": 0.3, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 164}}
+:::MLLOG {"namespace": "", "time_ms": 1755471012685, "event_type": "POINT_IN_TIME", "key": "opt_learning_rate_warmup_factor", "value": 0.0, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 189}}
+:::MLLOG {"namespace": "", "time_ms": 1755471012686, "event_type": "POINT_IN_TIME", "key": "lora_rank", "value": 16, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 198}}
+:::MLLOG {"namespace": "", "time_ms": 1755471012686, "event_type": "POINT_IN_TIME", "key": "lora_alpha", "value": 32, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 199}}
+:::MLLOG {"namespace": "", "time_ms": 1755471012689, "event_type": "POINT_IN_TIME", "key": "opt_learning_rate_training_steps", "value": 800, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 350}}
+:::MLLOG {"namespace": "", "time_ms": 1755471012690, "event_type": "POINT_IN_TIME", "key": "opt_base_learning_rate", "value": 0.00055, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 162}}
+:::MLLOG {"namespace": "", "time_ms": 1755471012690, "event_type": "POINT_IN_TIME", "key": "opt_adamw_weight_decay", "value": 0.0001, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 163}}
+:::MLLOG {"namespace": "", "time_ms": 1755471012690, "event_type": "POINT_IN_TIME", "key": "opt_gradient_clip_norm", "value": 0.3, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 164}}
+:::MLLOG {"namespace": "", "time_ms": 1755471012691, "event_type": "POINT_IN_TIME", "key": "opt_learning_rate_warmup_factor", "value": 0.0, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 189}}
+:::MLLOG {"namespace": "", "time_ms": 1755471012692, "event_type": "POINT_IN_TIME", "key": "lora_rank", "value": 16, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 198}}
+:::MLLOG {"namespace": "", "time_ms": 1755471012692, "event_type": "POINT_IN_TIME", "key": "lora_alpha", "value": 32, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 199}}
+:::MLLOG {"namespace": "", "time_ms": 1755471012692, "event_type": "POINT_IN_TIME", "key": "opt_base_learning_rate", "value": 0.00055, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 162}}
+:::MLLOG {"namespace": "", "time_ms": 1755471012692, "event_type": "POINT_IN_TIME", "key": "opt_adamw_weight_decay", "value": 0.0001, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 163}}
+:::MLLOG {"namespace": "", "time_ms": 1755471012692, "event_type": "POINT_IN_TIME", "key": "opt_gradient_clip_norm", "value": 0.3, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 164}}
+:::MLLOG {"namespace": "", "time_ms": 1755471012692, "event_type": "POINT_IN_TIME", "key": "opt_base_learning_rate", "value": 0.00055, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 162}}
+:::MLLOG {"namespace": "", "time_ms": 1755471012692, "event_type": "POINT_IN_TIME", "key": "opt_adamw_weight_decay", "value": 0.0001, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 163}}
+:::MLLOG {"namespace": "", "time_ms": 1755471012692, "event_type": "POINT_IN_TIME", "key": "opt_learning_rate_warmup_factor", "value": 0.0, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 189}}
+:::MLLOG {"namespace": "", "time_ms": 1755471012693, "event_type": "POINT_IN_TIME", "key": "opt_gradient_clip_norm", "value": 0.3, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 164}}
+:::MLLOG {"namespace": "", "time_ms": 1755471012693, "event_type": "POINT_IN_TIME", "key": "opt_learning_rate_warmup_factor", "value": 0.0, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 189}}
+:::MLLOG {"namespace": "", "time_ms": 1755471012693, "event_type": "POINT_IN_TIME", "key": "lora_rank", "value": 16, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 198}}
+:::MLLOG {"namespace": "", "time_ms": 1755471012694, "event_type": "POINT_IN_TIME", "key": "lora_alpha", "value": 32, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 199}}
+:::MLLOG {"namespace": "", "time_ms": 1755471012694, "event_type": "POINT_IN_TIME", "key": "lora_rank", "value": 16, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 198}}
+:::MLLOG {"namespace": "", "time_ms": 1755471012694, "event_type": "POINT_IN_TIME", "key": "lora_alpha", "value": 32, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 199}}
+:::MLLOG {"namespace": "", "time_ms": 1755471012695, "event_type": "POINT_IN_TIME", "key": "opt_learning_rate_training_steps", "value": 800, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 350}}
+:::MLLOG {"namespace": "", "time_ms": 1755471012697, "event_type": "POINT_IN_TIME", "key": "opt_learning_rate_training_steps", "value": 800, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 350}}
+:::MLLOG {"namespace": "", "time_ms": 1755471012697, "event_type": "POINT_IN_TIME", "key": "opt_learning_rate_training_steps", "value": 800, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 350}}
+:::MLLOG {"namespace": "", "time_ms": 1755471012699, "event_type": "POINT_IN_TIME", "key": "opt_base_learning_rate", "value": 0.00055, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 162}}
+:::MLLOG {"namespace": "", "time_ms": 1755471012699, "event_type": "POINT_IN_TIME", "key": "opt_adamw_weight_decay", "value": 0.0001, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 163}}
+:::MLLOG {"namespace": "", "time_ms": 1755471012699, "event_type": "POINT_IN_TIME", "key": "opt_gradient_clip_norm", "value": 0.3, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 164}}
+:::MLLOG {"namespace": "", "time_ms": 1755471012700, "event_type": "POINT_IN_TIME", "key": "opt_learning_rate_warmup_factor", "value": 0.0, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 189}}
+:::MLLOG {"namespace": "", "time_ms": 1755471012701, "event_type": "POINT_IN_TIME", "key": "lora_rank", "value": 16, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 198}}
+:::MLLOG {"namespace": "", "time_ms": 1755471012701, "event_type": "POINT_IN_TIME", "key": "lora_alpha", "value": 32, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 199}}
+:::MLLOG {"namespace": "", "time_ms": 1755471012703, "event_type": "POINT_IN_TIME", "key": "seed", "value": 1, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 393}}
+:::MLLOG {"namespace": "", "time_ms": 1755471012704, "event_type": "POINT_IN_TIME", "key": "opt_learning_rate_training_steps", "value": 800, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 350}}
+:::MLLOG {"namespace": "", "time_ms": 1755471012712, "event_type": "POINT_IN_TIME", "key": "seed", "value": 1, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 393}}
+:::MLLOG {"namespace": "", "time_ms": 1755471012712, "event_type": "POINT_IN_TIME", "key": "opt_base_learning_rate", "value": 0.00055, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 162}}
+:::MLLOG {"namespace": "", "time_ms": 1755471012712, "event_type": "POINT_IN_TIME", "key": "opt_adamw_weight_decay", "value": 0.0001, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 163}}
+:::MLLOG {"namespace": "", "time_ms": 1755471012712, "event_type": "POINT_IN_TIME", "key": "opt_gradient_clip_norm", "value": 0.3, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 164}}
+:::MLLOG {"namespace": "", "time_ms": 1755471012712, "event_type": "POINT_IN_TIME", "key": "seed", "value": 1, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 393}}
+:::MLLOG {"namespace": "", "time_ms": 1755471012712, "event_type": "POINT_IN_TIME", "key": "opt_learning_rate_warmup_factor", "value": 0.0, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 189}}
+:::MLLOG {"namespace": "", "time_ms": 1755471012713, "event_type": "POINT_IN_TIME", "key": "lora_rank", "value": 16, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 198}}
+:::MLLOG {"namespace": "", "time_ms": 1755471012714, "event_type": "POINT_IN_TIME", "key": "lora_alpha", "value": 32, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 199}}
+:::MLLOG {"namespace": "", "time_ms": 1755471012718, "event_type": "POINT_IN_TIME", "key": "seed", "value": 1, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 393}}
+:::MLLOG {"namespace": "", "time_ms": 1755471012719, "event_type": "POINT_IN_TIME", "key": "opt_learning_rate_training_steps", "value": 800, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 350}}
+:::MLLOG {"namespace": "", "time_ms": 1755471012726, "event_type": "POINT_IN_TIME", "key": "opt_base_learning_rate", "value": 0.00055, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 162}}
+:::MLLOG {"namespace": "", "time_ms": 1755471012726, "event_type": "POINT_IN_TIME", "key": "opt_adamw_weight_decay", "value": 0.0001, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 163}}
+:::MLLOG {"namespace": "", "time_ms": 1755471012726, "event_type": "POINT_IN_TIME", "key": "opt_gradient_clip_norm", "value": 0.3, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 164}}
+:::MLLOG {"namespace": "", "time_ms": 1755471012727, "event_type": "POINT_IN_TIME", "key": "opt_learning_rate_warmup_factor", "value": 0.0, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 189}}
+:::MLLOG {"namespace": "", "time_ms": 1755471012728, "event_type": "POINT_IN_TIME", "key": "lora_rank", "value": 16, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 198}}
+:::MLLOG {"namespace": "", "time_ms": 1755471012728, "event_type": "POINT_IN_TIME", "key": "lora_alpha", "value": 32, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 199}}
+:::MLLOG {"namespace": "", "time_ms": 1755471012731, "event_type": "POINT_IN_TIME", "key": "opt_learning_rate_training_steps", "value": 800, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 350}}
+:::MLLOG {"namespace": "", "time_ms": 1755471012734, "event_type": "POINT_IN_TIME", "key": "seed", "value": 1, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 393}}
+:::MLLOG {"namespace": "", "time_ms": 1755471012745, "event_type": "POINT_IN_TIME", "key": "seed", "value": 1, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 393}}
+:::MLLOG {"namespace": "", "time_ms": 1755471013232, "event_type": "POINT_IN_TIME", "key": "global_batch_size", "value": 8, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 328}}
+:::MLLOG {"namespace": "", "time_ms": 1755471013270, "event_type": "POINT_IN_TIME", "key": "train_samples", "value": 3901, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 328}}
+:::MLLOG {"namespace": "", "time_ms": 1755471013270, "event_type": "POINT_IN_TIME", "key": "eval_samples", "value": 173, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 328}}
+:::MLLOG {"namespace": "", "time_ms": 1755471013271, "event_type": "POINT_IN_TIME", "key": "gradient_accumulation_steps", "value": 1, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 328}}
+:::MLLOG {"namespace": "", "time_ms": 1755471013272, "event_type": "POINT_IN_TIME", "key": "seed", "value": 1, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 393}}
+:::MLLOG {"namespace": "", "time_ms": 1755471019434, "event_type": "POINT_IN_TIME", "key": "tracked_stats", "value": {"before_model_init": 6.738934484000083}, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 168, "step": 0}}
+:::MLLOG {"namespace": "", "time_ms": 1755471019889, "event_type": "POINT_IN_TIME", "key": "tracked_stats", "value": {"after_model_init": 0.4552682889998323}, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 168, "step": 0}}
+Loading distributed checkpoint with TensorStoreLoadShardedStrategy
+:::MLLOG {"namespace": "", "time_ms": 1755471283851, "event_type": "POINT_IN_TIME", "key": "tracked_stats", "value": {"warmup_time": 263.961781042}, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 168, "step": 0}}
+:::MLLOG {"namespace": "", "time_ms": 1755471283851, "event_type": "POINT_IN_TIME", "key": "tracked_stats", "value": {"init_finished": 0.00043348400004106225}, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 168, "step": 0}}
+:::MLLOG {"namespace": "", "time_ms": 1755471283852, "event_type": "INTERVAL_END", "key": "init_stop", "value": null, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 83}}
+:::MLLOG {"namespace": "", "time_ms": 1755471283853, "event_type": "INTERVAL_START", "key": "run_start", "value": null, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 83}}
+:::MLLOG {"namespace": "", "time_ms": 1755471283854, "event_type": "INTERVAL_START", "key": "block_start", "value": null, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 199, "samples_count": 0}}
+:::MLLOG {"namespace": "", "time_ms": 1755471298842, "event_type": "POINT_IN_TIME", "key": "train_loss", "value": 2.231356143951416, "metadata": {"file": "/workspace/ft-llm/custom_callbacks.py", "lineno": 71, "samples_count": 80, "lr": 0.0005497879849661988}}
+:::MLLOG {"namespace": "", "time_ms": 1755471314118, "event_type": "POINT_IN_TIME", "key": "train_loss", "value": 1.523491382598877, "metadata": {"file": "/workspace/ft-llm/custom_callbacks.py", "lineno": 71, "samples_count": 160, "lr": 0.0005491522667766103}}
+:::MLLOG {"namespace": "", "time_ms": 1755471329437, "event_type": "POINT_IN_TIME", "key": "train_loss", "value": 1.3142896890640259, "metadata": {"file": "/workspace/ft-llm/custom_callbacks.py", "lineno": 71, "samples_count": 240, "lr": 0.0005480938256626048}}
+:::MLLOG {"namespace": "", "time_ms": 1755471344748, "event_type": "POINT_IN_TIME", "key": "train_loss", "value": 1.3286864757537842, "metadata": {"file": "/workspace/ft-llm/custom_callbacks.py", "lineno": 71, "samples_count": 320, "lr": 0.0005466142936636629}}
+:::MLLOG {"namespace": "", "time_ms": 1755471360056, "event_type": "POINT_IN_TIME", "key": "train_loss", "value": 1.3191031217575073, "metadata": {"file": "/workspace/ft-llm/custom_callbacks.py", "lineno": 71, "samples_count": 400, "lr": 0.0005447159521108884}}
+:::MLLOG {"namespace": "", "time_ms": 1755471375386, "event_type": "POINT_IN_TIME", "key": "train_loss", "value": 1.3337409496307373, "metadata": {"file": "/workspace/ft-llm/custom_callbacks.py", "lineno": 71, "samples_count": 480, "lr": 0.0005424017281093611}}
+:::MLLOG {"namespace": "", "time_ms": 1755471390691, "event_type": "POINT_IN_TIME", "key": "train_loss", "value": 1.3527849912643433, "metadata": {"file": "/workspace/ft-llm/custom_callbacks.py", "lineno": 71, "samples_count": 560, "lr": 0.000539675190024753}}
+:::MLLOG {"namespace": "", "time_ms": 1755471405987, "event_type": "POINT_IN_TIME", "key": "train_loss", "value": 1.3663201332092285, "metadata": {"file": "/workspace/ft-llm/custom_callbacks.py", "lineno": 71, "samples_count": 640, "lr": 0.0005365405419811673}}
+:::MLLOG {"namespace": "", "time_ms": 1755471421321, "event_type": "POINT_IN_TIME", "key": "train_loss", "value": 1.436583161354065, "metadata": {"file": "/workspace/ft-llm/custom_callbacks.py", "lineno": 71, "samples_count": 720, "lr": 0.0005330026173786832}}
+:::MLLOG {"namespace": "", "time_ms": 1755471436648, "event_type": "POINT_IN_TIME", "key": "train_loss", "value": 1.2180163860321045, "metadata": {"file": "/workspace/ft-llm/custom_callbacks.py", "lineno": 71, "samples_count": 800, "lr": 0.0005290668714406038}}
+:::MLLOG {"namespace": "", "time_ms": 1755471451982, "event_type": "POINT_IN_TIME", "key": "train_loss", "value": 1.3267669677734375, "metadata": {"file": "/workspace/ft-llm/custom_callbacks.py", "lineno": 71, "samples_count": 880, "lr": 0.0005247393728018974}}
+:::MLLOG {"namespace": "", "time_ms": 1755471467318, "event_type": "POINT_IN_TIME", "key": "train_loss", "value": 1.3912932872772217, "metadata": {"file": "/workspace/ft-llm/custom_callbacks.py", "lineno": 71, "samples_count": 960, "lr": 0.0005200267941518012}}
+:::MLLOG {"namespace": "", "time_ms": 1755471482641, "event_type": "POINT_IN_TIME", "key": "train_loss", "value": 1.320623517036438, "metadata": {"file": "/workspace/ft-llm/custom_callbacks.py", "lineno": 71, "samples_count": 1040, "lr": 0.0005149364019450193}}
+:::MLLOG {"namespace": "", "time_ms": 1755471497959, "event_type": "POINT_IN_TIME", "key": "train_loss", "value": 1.2857550382614136, "metadata": {"file": "/workspace/ft-llm/custom_callbacks.py", "lineno": 71, "samples_count": 1120, "lr": 0.0005094760451973754}}
+:::MLLOG {"namespace": "", "time_ms": 1755471513310, "event_type": "POINT_IN_TIME", "key": "train_loss", "value": 1.356960415840149, "metadata": {"file": "/workspace/ft-llm/custom_callbacks.py", "lineno": 71, "samples_count": 1200, "lr": 0.0005036541433832}}
+:::MLLOG {"namespace": "", "time_ms": 1755471528663, "event_type": "POINT_IN_TIME", "key": "train_loss", "value": 1.325950026512146, "metadata": {"file": "/workspace/ft-llm/custom_callbacks.py", "lineno": 71, "samples_count": 1280, "lr": 0.0004974796734531106}}
+:::MLLOG {"namespace": "", "time_ms": 1755471544018, "event_type": "POINT_IN_TIME", "key": "train_loss", "value": 1.3832132816314697, "metadata": {"file": "/workspace/ft-llm/custom_callbacks.py", "lineno": 71, "samples_count": 1360, "lr": 0.0004909621559922049}}
+:::MLLOG {"namespace": "", "time_ms": 1755471559347, "event_type": "POINT_IN_TIME", "key": "train_loss", "value": 1.2853686809539795, "metadata": {"file": "/workspace/ft-llm/custom_callbacks.py", "lineno": 71, "samples_count": 1440, "lr": 0.0004841116405400086}}
+:::MLLOG {"namespace": "", "time_ms": 1755471574682, "event_type": "POINT_IN_TIME", "key": "train_loss", "value": 1.3469711542129517, "metadata": {"file": "/workspace/ft-llm/custom_callbacks.py", "lineno": 71, "samples_count": 1520, "lr": 0.00047693869009481353}}
+:::MLLOG {"namespace": "", "time_ms": 1755471578084, "event_type": "POINT_IN_TIME", "key": "tracked_stats", "value": {"throughput": 5.220374784959472, "train_step_time": 1.5324570226354173, "max_memory_usage": 168.827}, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 225, "step": 0}}
+:::MLLOG {"namespace": "", "time_ms": 1755471578084, "event_type": "INTERVAL_END", "key": "block_stop", "value": null, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 208, "samples_count": 0}}
+:::MLLOG {"namespace": "", "time_ms": 1755471578084, "event_type": "INTERVAL_START", "key": "eval_start", "value": null, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 130, "samples_count": 1536}}
+:::MLLOG {"namespace": "", "time_ms": 1755471592941, "event_type": "POINT_IN_TIME", "key": "eval_accuracy", "value": 0.9386916849654534, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 303, "samples_count": 1536}}
+:::MLLOG {"namespace": "", "time_ms": 1755471592942, "event_type": "POINT_IN_TIME", "key": "tracked_stats", "value": {"validation_throughput": 11.84517688090803}, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 245, "step": 1536}}
+:::MLLOG {"namespace": "", "time_ms": 1755471592942, "event_type": "INTERVAL_END", "key": "eval_stop", "value": null, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 153, "samples_count": 1536}}
+:::MLLOG {"namespace": "", "time_ms": 1755471592942, "event_type": "INTERVAL_START", "key": "block_start", "value": null, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 199, "samples_count": 1536}}
+:::MLLOG {"namespace": "", "time_ms": 1755471605223, "event_type": "POINT_IN_TIME", "key": "train_loss", "value": 1.2639729976654053, "metadata": {"file": "/workspace/ft-llm/custom_callbacks.py", "lineno": 71, "samples_count": 1600, "lr": 0.0004694543648263006}}
+:::MLLOG {"namespace": "", "time_ms": 1755471620533, "event_type": "POINT_IN_TIME", "key": "train_loss", "value": 1.359060525894165, "metadata": {"file": "/workspace/ft-llm/custom_callbacks.py", "lineno": 71, "samples_count": 1680, "lr": 0.00046167020502155905}}
+:::MLLOG {"namespace": "", "time_ms": 1755471635863, "event_type": "POINT_IN_TIME", "key": "train_loss", "value": 1.3872255086898804, "metadata": {"file": "/workspace/ft-llm/custom_callbacks.py", "lineno": 71, "samples_count": 1760, "lr": 0.00045359821329080054}}
+:::MLLOG {"namespace": "", "time_ms": 1755471651197, "event_type": "POINT_IN_TIME", "key": "train_loss", "value": 1.2579522132873535, "metadata": {"file": "/workspace/ft-llm/custom_callbacks.py", "lineno": 71, "samples_count": 1840, "lr": 0.00044525083606020437}}
+:::MLLOG {"namespace": "", "time_ms": 1755471666501, "event_type": "POINT_IN_TIME", "key": "train_loss", "value": 1.2951648235321045, "metadata": {"file": "/workspace/ft-llm/custom_callbacks.py", "lineno": 71, "samples_count": 1920, "lr": 0.0004366409443804302}}
+:::MLLOG {"namespace": "", "time_ms": 1755471666520, "event_type": "POINT_IN_TIME", "key": "tracked_stats", "value": {"throughput": 5.218929434605367, "train_step_time": 1.5328814271666669, "max_memory_usage": 168.827}, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 225, "step": 1536}}
+:::MLLOG {"namespace": "", "time_ms": 1755471666520, "event_type": "INTERVAL_END", "key": "block_stop", "value": null, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 208, "samples_count": 1536}}
+:::MLLOG {"namespace": "", "time_ms": 1755471666520, "event_type": "INTERVAL_START", "key": "eval_start", "value": null, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 130, "samples_count": 1920}}
+:::MLLOG {"namespace": "", "time_ms": 1755471681322, "event_type": "POINT_IN_TIME", "key": "eval_accuracy", "value": 0.9341272784106304, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 303, "samples_count": 1920}}
+:::MLLOG {"namespace": "", "time_ms": 1755471681322, "event_type": "POINT_IN_TIME", "key": "tracked_stats", "value": {"validation_throughput": 11.889916819247087}, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 245, "step": 1920}}
+:::MLLOG {"namespace": "", "time_ms": 1755471681322, "event_type": "INTERVAL_END", "key": "eval_stop", "value": null, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 153, "samples_count": 1920}}
+:::MLLOG {"namespace": "", "time_ms": 1755471681323, "event_type": "INTERVAL_START", "key": "block_start", "value": null, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 199, "samples_count": 1920}}
+:::MLLOG {"namespace": "", "time_ms": 1755471696647, "event_type": "POINT_IN_TIME", "key": "train_loss", "value": 1.311585783958435, "metadata": {"file": "/workspace/ft-llm/custom_callbacks.py", "lineno": 71, "samples_count": 2000, "lr": 0.0004277818140803907}}
+:::MLLOG {"namespace": "", "time_ms": 1755471711976, "event_type": "POINT_IN_TIME", "key": "train_loss", "value": 1.3409117460250854, "metadata": {"file": "/workspace/ft-llm/custom_callbacks.py", "lineno": 71, "samples_count": 2080, "lr": 0.00041868710529688595}}
+:::MLLOG {"namespace": "", "time_ms": 1755471727298, "event_type": "POINT_IN_TIME", "key": "train_loss", "value": 1.3523324728012085, "metadata": {"file": "/workspace/ft-llm/custom_callbacks.py", "lineno": 71, "samples_count": 2160, "lr": 0.00040937084141166267}}
+:::MLLOG {"namespace": "", "time_ms": 1755471742649, "event_type": "POINT_IN_TIME", "key": "train_loss", "value": 1.2437388896942139, "metadata": {"file": "/workspace/ft-llm/custom_callbacks.py", "lineno": 71, "samples_count": 2240, "lr": 0.0003998473874283754}}
+:::MLLOG {"namespace": "", "time_ms": 1755471754935, "event_type": "POINT_IN_TIME", "key": "tracked_stats", "value": {"throughput": 5.21650721148526, "train_step_time": 1.5335932024375012, "max_memory_usage": 168.827}, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 225, "step": 1920}}
+:::MLLOG {"namespace": "", "time_ms": 1755471754935, "event_type": "INTERVAL_END", "key": "block_stop", "value": null, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 208, "samples_count": 1920}}
+:::MLLOG {"namespace": "", "time_ms": 1755471754935, "event_type": "INTERVAL_START", "key": "eval_start", "value": null, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 130, "samples_count": 2304}}
+:::MLLOG {"namespace": "", "time_ms": 1755471769543, "event_type": "POINT_IN_TIME", "key": "eval_accuracy", "value": 0.9300852869287392, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 303, "samples_count": 2304}}
+:::MLLOG {"namespace": "", "time_ms": 1755471769543, "event_type": "POINT_IN_TIME", "key": "tracked_stats", "value": {"validation_throughput": 12.04804375654437}, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 245, "step": 2304}}
+:::MLLOG {"namespace": "", "time_ms": 1755471769543, "event_type": "INTERVAL_END", "key": "eval_stop", "value": null, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 153, "samples_count": 2304}}
+:::MLLOG {"namespace": "", "time_ms": 1755471769543, "event_type": "INTERVAL_START", "key": "block_start", "value": null, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 199, "samples_count": 2304}}
+:::MLLOG {"namespace": "", "time_ms": 1755471772623, "event_type": "POINT_IN_TIME", "key": "train_loss", "value": 1.3073469400405884, "metadata": {"file": "/workspace/ft-llm/custom_callbacks.py", "lineno": 71, "samples_count": 2320, "lr": 0.00039013142782279276}}
+:::MLLOG {"namespace": "", "time_ms": 1755471787980, "event_type": "POINT_IN_TIME", "key": "train_loss", "value": 1.260759711265564, "metadata": {"file": "/workspace/ft-llm/custom_callbacks.py", "lineno": 71, "samples_count": 2400, "lr": 0.00038023794390039975}}
+:::MLLOG {"namespace": "", "time_ms": 1755471803328, "event_type": "POINT_IN_TIME", "key": "train_loss", "value": 1.2937121391296387, "metadata": {"file": "/workspace/ft-llm/custom_callbacks.py", "lineno": 71, "samples_count": 2480, "lr": 0.00037018219069631056}}
+:::MLLOG {"namespace": "", "time_ms": 1755471818672, "event_type": "POINT_IN_TIME", "key": "train_loss", "value": 1.3011728525161743, "metadata": {"file": "/workspace/ft-llm/custom_callbacks.py", "lineno": 71, "samples_count": 2560, "lr": 0.00035997967345311057}}
+:::MLLOG {"namespace": "", "time_ms": 1755471834026, "event_type": "POINT_IN_TIME", "key": "train_loss", "value": 1.3377065658569336, "metadata": {"file": "/workspace/ft-llm/custom_callbacks.py", "lineno": 71, "samples_count": 2640, "lr": 0.00034964612371289557}}
+:::MLLOG {"namespace": "", "time_ms": 1755471843261, "event_type": "POINT_IN_TIME", "key": "tracked_stats", "value": {"throughput": 5.2090510322980235, "train_step_time": 1.5357883711250035, "max_memory_usage": 168.827}, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 225, "step": 2304}}
+:::MLLOG {"namespace": "", "time_ms": 1755471843262, "event_type": "INTERVAL_END", "key": "block_stop", "value": null, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 208, "samples_count": 2304}}
+:::MLLOG {"namespace": "", "time_ms": 1755471843262, "event_type": "INTERVAL_START", "key": "eval_start", "value": null, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 130, "samples_count": 2688}}
+:::MLLOG {"namespace": "", "time_ms": 1755471857902, "event_type": "POINT_IN_TIME", "key": "eval_accuracy", "value": 0.9281238710260116, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 303, "samples_count": 2688}}
+:::MLLOG {"namespace": "", "time_ms": 1755471857903, "event_type": "POINT_IN_TIME", "key": "tracked_stats", "value": {"validation_throughput": 12.02044647758139}, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 245, "step": 2688}}
+:::MLLOG {"namespace": "", "time_ms": 1755471857903, "event_type": "INTERVAL_END", "key": "eval_stop", "value": null, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 153, "samples_count": 2688}}
+:::MLLOG {"namespace": "", "time_ms": 1755471857903, "event_type": "INTERVAL_START", "key": "block_start", "value": null, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 199, "samples_count": 2688}}
+:::MLLOG {"namespace": "", "time_ms": 1755471864050, "event_type": "POINT_IN_TIME", "key": "train_loss", "value": 1.3667250871658325, "metadata": {"file": "/workspace/ft-llm/custom_callbacks.py", "lineno": 71, "samples_count": 2720, "lr": 0.000339197475060374}}
+:::MLLOG {"namespace": "", "time_ms": 1755471879394, "event_type": "POINT_IN_TIME", "key": "train_loss", "value": 1.3398393392562866, "metadata": {"file": "/workspace/ft-llm/custom_callbacks.py", "lineno": 71, "samples_count": 2800, "lr": 0.00032864983855443534}}
+:::MLLOG {"namespace": "", "time_ms": 1755471894731, "event_type": "POINT_IN_TIME", "key": "train_loss", "value": 1.2728362083435059, "metadata": {"file": "/workspace/ft-llm/custom_callbacks.py", "lineno": 71, "samples_count": 2880, "lr": 0.0003180194778860635}}
+:::MLLOG {"namespace": "", "time_ms": 1755471910087, "event_type": "POINT_IN_TIME", "key": "train_loss", "value": 1.2875945568084717, "metadata": {"file": "/workspace/ft-llm/custom_callbacks.py", "lineno": 71, "samples_count": 2960, "lr": 0.0003073227843009054}}
+:::MLLOG {"namespace": "", "time_ms": 1755471925418, "event_type": "POINT_IN_TIME", "key": "train_loss", "value": 1.3862403631210327, "metadata": {"file": "/workspace/ft-llm/custom_callbacks.py", "lineno": 71, "samples_count": 3040, "lr": 0.0002965762513251574}}
+:::MLLOG {"namespace": "", "time_ms": 1755471931575, "event_type": "POINT_IN_TIME", "key": "tracked_stats", "value": {"throughput": 5.212281417349078, "train_step_time": 1.5348365445833376, "max_memory_usage": 168.827}, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 225, "step": 2688}}
+:::MLLOG {"namespace": "", "time_ms": 1755471931575, "event_type": "INTERVAL_END", "key": "block_stop", "value": null, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 208, "samples_count": 2688}}
+:::MLLOG {"namespace": "", "time_ms": 1755471931575, "event_type": "INTERVAL_START", "key": "eval_start", "value": null, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 130, "samples_count": 3072}}
+:::MLLOG {"namespace": "", "time_ms": 1755471946223, "event_type": "POINT_IN_TIME", "key": "eval_accuracy", "value": 0.9221087328960441, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 303, "samples_count": 3072}}
+:::MLLOG {"namespace": "", "time_ms": 1755471946223, "event_type": "POINT_IN_TIME", "key": "tracked_stats", "value": {"validation_throughput": 12.014723364925022}, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 245, "step": 3072}}
+:::MLLOG {"namespace": "", "time_ms": 1755471946224, "event_type": "INTERVAL_END", "key": "eval_stop", "value": null, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 153, "samples_count": 3072}}
+:::MLLOG {"namespace": "", "time_ms": 1755471946230, "event_type": "INTERVAL_END", "key": "run_stop", "value": null, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 106, "samples_count": 3072, "status": "success"}}

first_run_output/nemo_experiments/default/2025-08-17_22-50-13/nemo_log_globalrank-0_localrank-0.txt ADDED Viewed

File without changes

first_run_output/nemo_experiments/default/nemo_log_globalrank-1_localrank-1.txt ADDED Viewed

File without changes

first_run_output/nemo_experiments/default/nemo_log_globalrank-2_localrank-2.txt ADDED Viewed

File without changes

first_run_output/nemo_experiments/default/nemo_log_globalrank-3_localrank-3.txt ADDED Viewed

File without changes

first_run_output/nemo_experiments/default/nemo_log_globalrank-4_localrank-4.txt ADDED Viewed

File without changes

first_run_output/nemo_experiments/default/nemo_log_globalrank-5_localrank-5.txt ADDED Viewed

File without changes

first_run_output/nemo_experiments/default/nemo_log_globalrank-6_localrank-6.txt ADDED Viewed

File without changes

first_run_output/nemo_experiments/default/nemo_log_globalrank-7_localrank-7.txt ADDED Viewed

File without changes

first_run_output/outputs/2025-08-17/22-50-12/.hydra/config.yaml ADDED Viewed

	@@ -0,0 +1,67 @@

+skip_evals: ${oc.decode:${oc.env:SKIP_EVAL,${floor:${add:${multiply:0.125,${model.global_batch_size}},2}}}}
+load_ckpt: ${oc.decode:${oc.env:LOAD_CKPT,False}}
+data_root: ${oc.decode:${oc.env:DATA_ROOT,/data}}
+ckpt_root: ${oc.decode:${oc.env:CKPT_ROOT,/ckpt}}
+trainer:
+  devices: ${oc.decode:${oc.env:DGXNGPU,8}}
+  num_nodes: ${oc.decode:${oc.env:DGXNNODES,1}}
+  max_steps: ${oc.decode:${oc.env:MAX_STEPS,null}}
+  val_check_interval: ${floor_div:${multiply:${add:${skip_evals},1},${oc.decode:${oc.env:VAL_CHECK_INTERVAL,384}}},${model.global_batch_size}}
+  limit_val_batches: ${oc.decode:${oc.env:LIMIT_VAL_BATCHES,1.0}}
+model:
+  num_layers: ${oc.decode:${oc.env:OVERWRITTEN_NUM_LAYERS,80}}
+  seed: ${oc.decode:${oc.env:SEED,1}}
+  tensor_model_parallel_size: ${oc.decode:${oc.env:TP,1}}
+  pipeline_model_parallel_size: ${oc.decode:${oc.env:PP,1}}
+  context_parallel_size: ${oc.decode:${oc.env:CP,1}}
+  eval_cp: ${oc.decode:${oc.env:CP_EVAL,null}}
+  global_batch_size: ${floor_div:${multiply:${oc.decode:${oc.env:MINIBS,1}},${floor_div:${multiply:${trainer.devices},${trainer.num_nodes}},${multiply:${model.tensor_model_parallel_size},${model.pipeline_model_parallel_size}}}},${oc.decode:${oc.env:CP,1}}}
+  micro_batch_size: ${oc.decode:${oc.env:MBS,1}}
+  val_micro_batch_size: ${oc.decode:${oc.env:VAL_MBS,null}}
+  val_global_batch_size: ${floor_div:${multiply:${oc.decode:${oc.env:VAL_MBS,1}},${floor_div:${multiply:${trainer.devices},${trainer.num_nodes}},${multiply:${model.tensor_model_parallel_size},${model.pipeline_model_parallel_size}}}},${oc.decode:${oc.env:CP_EVAL,${oc.env:CP,1}}}}
+  max_position_embeddings: ${oc.decode:${oc.env:MAX_SEQLEN,8192}}
+  encoder_seq_length: ${oc.decode:${oc.env:MAX_SEQLEN,8192}}
+  sequence_parallel: ${oc.decode:${oc.env:SP,False}}
+  ub_tp_comm_overlap: ${oc.decode:${oc.env:TP_COMM_OVERLAP,False}}
+  fp8: ${oc.decode:${oc.env:FP8,True}}
+  fp8_params: ${oc.decode:${oc.env:FP8,True}}
+  fp8_hybrid: ${oc.decode:${oc.env:FP8_HYBRID,True}}
+  fp8_amax_history_len: ${oc.decode:${oc.env:FP8_AMAX_HISTORY,128}}
+  fp8_amax_compute_algo: ${oc.env:FP8_AMAX_ALGO,most_recent}
+  reduce_amax: ${oc.decode:${oc.env:FP8_REDUCE_AMAX,False}}
+  fp8_e4m3: ${oc.decode:${oc.env:FP8_E4M3,False}}
+  fp8_interval: ${oc.decode:${oc.env:FP8_INTERVAL,1}}
+  fp8_margin: ${oc.decode:${oc.env:FP8_MARGIN,0}}
+  fp8_dot_product_attention: ${oc.decode:${oc.env:FP8_DPA,0}}
+  cp_comm_type: ${oc.decode:${oc.env:CP_COMM_TYPE,'a2a'}}
+  activation_func_fp8_input_store: ${oc.decode:${oc.env:FP8_ACT,0}}
+  external_cuda_graph: ${oc.decode:${oc.env:LAYER_CUDA_GRAPH,False}}
+  enable_cuda_graph: ${oc.decode:${oc.env:MCORE_CUDA_GRAPH,False}}
+  use_te_rng_tracker: ${oc.decode:${oc.env:USE_TE_RNG_TRACKER,True}}
+  enable_cg_fp8_weight_caching: ${oc.decode:${oc.env:CG_WEIGHT_CACHING,True}}
+  cpu_offloading: ${oc.decode:${oc.env:CPU_OFFLOADING,False}}
+  cpu_offloading_num_layers: ${oc.decode:${oc.env:CPU_OFFLOADING_NUM_LAYERS,20}}
+  cpu_offloading_activations: true
+  cpu_offloading_weights: false
+  memory_profile:
+    enabled: ${oc.decode:${oc.env:MEMORY_PROFILE,False}}
+    start_step: 1
+    end_step: 4
+    rank: 0
+    output_path: /results/
+  custom:
+    warmup: ${oc.decode:${oc.env:WARMUP,False}}
+    warmup_train_steps: ${oc.decode:${oc.env:WARMUP_TRAIN_STEPS,5}}
+    warmup_validation_steps: ${oc.decode:${oc.env:WARMUP_VALIDATION_STEPS,5}}
+    reset_fp8_stats_after_warmup: ${oc.decode:${oc.env:RESET_FP8_STATS_AFTER_WARMUP,1}}
+optim:
+  lr: ${oc.decode:${oc.env:LR,0.0004}}
+  use_distributed_optimizer: ${oc.decode:${oc.env:USE_DISTRIBUTED_OPTIMIZER,True}}
+  overlap_param_gather_with_optimizer_step: ${oc.decode:${oc.env:OVERLAP_PARAM_GATHER_WITH_OPTIMIZER_STEP,False}}
+  sched:
+    warmup_steps: ${oc.decode:${oc.env:WARMUP_STEPS,0}}
+ddp:
+  overlap_grad_reduce: ${oc.decode:${oc.env:DDP_OVERLAP_GRAD_REDUCE,False}}
+  overlap_param_gather: ${oc.decode:${oc.env:DDP_OVERLAP_PARAM_GATHER,False}}
+  fp8_param_gather: ${oc.decode:${oc.env:DDP_FP8_PARAM_GATHER,False}}
+  average_in_collective: ${oc.decode:${oc.env:DDP_AVERAGE_IN_COLLECTIVE,False}}

first_run_output/outputs/2025-08-17/22-50-12/.hydra/hydra.yaml ADDED Viewed

	@@ -0,0 +1,155 @@

+hydra:
+  run:
+    dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S}
+  sweep:
+    dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S}
+    subdir: ${hydra.job.num}
+  launcher:
+    _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
+  sweeper:
+    _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
+    max_batch_size: null
+    params: null
+  help:
+    app_name: ${hydra.job.name}
+    header: '${hydra.help.app_name} is powered by Hydra.
+      '
+    footer: 'Powered by Hydra (https://hydra.cc)
+      Use --hydra-help to view Hydra specific help
+      '
+    template: '${hydra.help.header}
+      == Configuration groups ==
+      Compose your configuration from those groups (group=option)
+      $APP_CONFIG_GROUPS
+      == Config ==
+      Override anything in the config (foo.bar=value)
+      $CONFIG
+      ${hydra.help.footer}
+      '
+  hydra_help:
+    template: 'Hydra (${hydra.runtime.version})
+      See https://hydra.cc for more info.
+      == Flags ==
+      $FLAGS_HELP
+      == Configuration groups ==
+      Compose your configuration from those groups (For example, append hydra/job_logging=disabled
+      to command line)
+      $HYDRA_CONFIG_GROUPS
+      Use ''--cfg hydra'' to Show the Hydra config.
+      '
+    hydra_help: ???
+  hydra_logging:
+    version: 1
+    formatters:
+      simple:
+        format: '[%(asctime)s][HYDRA] %(message)s'
+    handlers:
+      console:
+        class: logging.StreamHandler
+        formatter: simple
+        stream: ext://sys.stdout
+    root:
+      level: INFO
+      handlers:
+      - console
+    loggers:
+      logging_example:
+        level: DEBUG
+    disable_existing_loggers: false
+  job_logging:
+    version: 1
+    formatters:
+      simple:
+        format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
+    handlers:
+      console:
+        class: logging.StreamHandler
+        formatter: simple
+        stream: ext://sys.stdout
+      file:
+        class: logging.FileHandler
+        formatter: simple
+        filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log
+    root:
+      level: INFO
+      handlers:
+      - console
+      - file
+    disable_existing_loggers: false
+  env: {}
+  mode: RUN
+  searchpath: []
+  callbacks: {}
+  output_subdir: .hydra
+  overrides:
+    hydra:
+    - hydra.mode=RUN
+    task: []
+  job:
+    name: train
+    chdir: null
+    override_dirname: ''
+    id: ???
+    num: ???
+    config_name: megatron_gpt_peft_tuning_config
+    env_set: {}
+    env_copy: []
+    config:
+      override_dirname:
+        kv_sep: '='
+        item_sep: ','
+        exclude_keys: []
+  runtime:
+    version: 1.3.2
+    version_base: '1.3'
+    cwd: /workspace/ft-llm
+    config_sources:
+    - path: hydra.conf
+      schema: pkg
+      provider: hydra
+    - path: /workspace/ft-llm/conf
+      schema: file
+      provider: main
+    - path: ''
+      schema: structured
+      provider: schema
+    output_dir: /workspace/ft-llm/outputs/2025-08-17/22-50-12
+    choices:
+      tp_overlap@model.ub_tp_comm_overlap_cfg: b100tp1mbs1
+      hydra/env: default
+      hydra/callbacks: null
+      hydra/job_logging: default
+      hydra/hydra_logging: default
+      hydra/hydra_help: default
+      hydra/help: default
+      hydra/sweeper: basic
+      hydra/launcher: basic
+      hydra/output: default
+  verbose: false

first_run_output/outputs/2025-08-17/22-50-12/.hydra/overrides.yaml ADDED Viewed

	@@ -0,0 +1 @@


1	+ []

first_run_output/outputs/2025-08-17/22-50-12/train.log ADDED Viewed

File without changes

first_run_output/run_docker.sh ADDED Viewed

	@@ -0,0 +1,7 @@

+docker run -d --gpus all -it --rm --network=host --ipc=host \
+    -v /home/shadeform/work/dev/mlperf/data:/data \
+    -v /home/shadeform/work/dev/mlperf/model:/ckpt \
+    -v /home/shadeform/work/dev/mlperf:/myworkspace \
+    --shm-size=16g \
+    --ulimit memlock=-1 --ulimit stack=67108864 \
+    vuiseng9/mlperf-nvidia:llama2_70b_lora-pyt_v5.0