vuiseng9 committed on
Commit
a8b07e8
·
1 Parent(s): be51eb8
first_run_output/manual_launch.sh ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ set -x # trace mode: print each command before it runs, until the matching "set +x" at the end
2
+ export LOGDIR=/mlperf_logdir # directory where the output logs will be saved
3
+
4
+ export SLURM_JOBID=$(date +"%y-%m-%d__%H-%M-%S") # no Slurm here: a launch timestamp serves as the unique local ID for each run
5
+ export LOCAL_WORLD_SIZE=0 # non-Slurm mode: set to 0 to use torchrun
6
+
7
+ # DO NOT SET DATA_ROOT & CKPT_ROOT (NOTE(review): reason not visible here; presumably they are supplied elsewhere - confirm)
8
+
9
+ source config_DGXB200_1x8x1xtp1pp1cp1.sh # load the run configuration into the current shell (file contents not shown here)
10
+ source run_and_time.sh # sourced rather than executed, so it runs in this shell with the variables set above; presumably starts the run - verify
11
+ set +x # turn command tracing back off
first_run_output/mlperf.log ADDED
@@ -0,0 +1,217 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ W0817 22:49:50.668000 5653 torch/distributed/run.py:766]
2
+ W0817 22:49:50.668000 5653 torch/distributed/run.py:766] *****************************************
3
+ W0817 22:49:50.668000 5653 torch/distributed/run.py:766] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
4
+ W0817 22:49:50.668000 5653 torch/distributed/run.py:766] *****************************************
5
+ Could not find the bitsandbytes CUDA binary at PosixPath('/usr/local/lib/python3.12/dist-packages/bitsandbytes/libbitsandbytes_cuda129.so')
6
+ Could not find the bitsandbytes CUDA binary at PosixPath('/usr/local/lib/python3.12/dist-packages/bitsandbytes/libbitsandbytes_cuda129.so')
7
+ The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.
8
+ The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.
9
+ Could not find the bitsandbytes CUDA binary at PosixPath('/usr/local/lib/python3.12/dist-packages/bitsandbytes/libbitsandbytes_cuda129.so')
10
+ The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.
11
+ Could not find the bitsandbytes CUDA binary at PosixPath('/usr/local/lib/python3.12/dist-packages/bitsandbytes/libbitsandbytes_cuda129.so')
12
+ The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.
13
+ Could not find the bitsandbytes CUDA binary at PosixPath('/usr/local/lib/python3.12/dist-packages/bitsandbytes/libbitsandbytes_cuda129.so')
14
+ The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.
15
+ Could not find the bitsandbytes CUDA binary at PosixPath('/usr/local/lib/python3.12/dist-packages/bitsandbytes/libbitsandbytes_cuda129.so')
16
+ The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.
17
+ Could not find the bitsandbytes CUDA binary at PosixPath('/usr/local/lib/python3.12/dist-packages/bitsandbytes/libbitsandbytes_cuda129.so')
18
+ The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.
19
+ Could not find the bitsandbytes CUDA binary at PosixPath('/usr/local/lib/python3.12/dist-packages/bitsandbytes/libbitsandbytes_cuda129.so')
20
+ The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.
21
+ :::MLLOG {"namespace": "", "time_ms": 1755471012103, "event_type": "INTERVAL_START", "key": "init_start", "value": null, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 327}}
22
+ :::MLLOG {"namespace": "", "time_ms": 1755471012103, "event_type": "POINT_IN_TIME", "key": "submission_benchmark", "value": "llama2_70b_lora", "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 328}}
23
+ :::MLLOG {"namespace": "", "time_ms": 1755471012103, "event_type": "POINT_IN_TIME", "key": "submission_org", "value": "SUBMISSION_ORG_PLACEHOLDER", "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 328}}
24
+ :::MLLOG {"namespace": "", "time_ms": 1755471012103, "event_type": "POINT_IN_TIME", "key": "submission_division", "value": "closed", "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 328}}
25
+ :::MLLOG {"namespace": "", "time_ms": 1755471012103, "event_type": "POINT_IN_TIME", "key": "submission_status", "value": "onprem", "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 328}}
26
+ :::MLLOG {"namespace": "", "time_ms": 1755471012103, "event_type": "POINT_IN_TIME", "key": "submission_platform", "value": "1xSUBMISSION_PLATFORM_PLACEHOLDER", "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 328}}
27
+ :::MLLOG {"namespace": "", "time_ms": 1755471012105, "event_type": "INTERVAL_START", "key": "init_start", "value": null, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 327}}
28
+ :::MLLOG {"namespace": "", "time_ms": 1755471012105, "event_type": "INTERVAL_START", "key": "init_start", "value": null, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 327}}
29
+ :::MLLOG {"namespace": "", "time_ms": 1755471012106, "event_type": "POINT_IN_TIME", "key": "submission_benchmark", "value": "llama2_70b_lora", "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 328}}
30
+ :::MLLOG {"namespace": "", "time_ms": 1755471012106, "event_type": "POINT_IN_TIME", "key": "submission_org", "value": "SUBMISSION_ORG_PLACEHOLDER", "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 328}}
31
+ :::MLLOG {"namespace": "", "time_ms": 1755471012106, "event_type": "POINT_IN_TIME", "key": "submission_benchmark", "value": "llama2_70b_lora", "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 328}}
32
+ :::MLLOG {"namespace": "", "time_ms": 1755471012106, "event_type": "POINT_IN_TIME", "key": "submission_division", "value": "closed", "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 328}}
33
+ :::MLLOG {"namespace": "", "time_ms": 1755471012106, "event_type": "POINT_IN_TIME", "key": "submission_org", "value": "SUBMISSION_ORG_PLACEHOLDER", "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 328}}
34
+ :::MLLOG {"namespace": "", "time_ms": 1755471012106, "event_type": "POINT_IN_TIME", "key": "submission_status", "value": "onprem", "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 328}}
35
+ :::MLLOG {"namespace": "", "time_ms": 1755471012106, "event_type": "POINT_IN_TIME", "key": "submission_division", "value": "closed", "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 328}}
36
+ :::MLLOG {"namespace": "", "time_ms": 1755471012106, "event_type": "POINT_IN_TIME", "key": "submission_platform", "value": "1xSUBMISSION_PLATFORM_PLACEHOLDER", "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 328}}
37
+ :::MLLOG {"namespace": "", "time_ms": 1755471012106, "event_type": "POINT_IN_TIME", "key": "submission_status", "value": "onprem", "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 328}}
38
+ :::MLLOG {"namespace": "", "time_ms": 1755471012106, "event_type": "POINT_IN_TIME", "key": "submission_platform", "value": "1xSUBMISSION_PLATFORM_PLACEHOLDER", "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 328}}
39
+ :::MLLOG {"namespace": "", "time_ms": 1755471012107, "event_type": "INTERVAL_START", "key": "init_start", "value": null, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 327}}
40
+ :::MLLOG {"namespace": "", "time_ms": 1755471012108, "event_type": "POINT_IN_TIME", "key": "submission_benchmark", "value": "llama2_70b_lora", "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 328}}
41
+ :::MLLOG {"namespace": "", "time_ms": 1755471012108, "event_type": "POINT_IN_TIME", "key": "submission_org", "value": "SUBMISSION_ORG_PLACEHOLDER", "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 328}}
42
+ :::MLLOG {"namespace": "", "time_ms": 1755471012108, "event_type": "POINT_IN_TIME", "key": "submission_division", "value": "closed", "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 328}}
43
+ :::MLLOG {"namespace": "", "time_ms": 1755471012108, "event_type": "POINT_IN_TIME", "key": "submission_status", "value": "onprem", "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 328}}
44
+ :::MLLOG {"namespace": "", "time_ms": 1755471012108, "event_type": "POINT_IN_TIME", "key": "submission_platform", "value": "1xSUBMISSION_PLATFORM_PLACEHOLDER", "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 328}}
45
+ :::MLLOG {"namespace": "", "time_ms": 1755471012109, "event_type": "INTERVAL_START", "key": "init_start", "value": null, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 327}}
46
+ :::MLLOG {"namespace": "", "time_ms": 1755471012109, "event_type": "POINT_IN_TIME", "key": "submission_benchmark", "value": "llama2_70b_lora", "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 328}}
47
+ :::MLLOG {"namespace": "", "time_ms": 1755471012109, "event_type": "POINT_IN_TIME", "key": "submission_org", "value": "SUBMISSION_ORG_PLACEHOLDER", "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 328}}
48
+ :::MLLOG {"namespace": "", "time_ms": 1755471012109, "event_type": "POINT_IN_TIME", "key": "submission_division", "value": "closed", "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 328}}
49
+ :::MLLOG {"namespace": "", "time_ms": 1755471012110, "event_type": "POINT_IN_TIME", "key": "submission_status", "value": "onprem", "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 328}}
50
+ :::MLLOG {"namespace": "", "time_ms": 1755471012110, "event_type": "POINT_IN_TIME", "key": "submission_platform", "value": "1xSUBMISSION_PLATFORM_PLACEHOLDER", "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 328}}
51
+ :::MLLOG {"namespace": "", "time_ms": 1755471012112, "event_type": "INTERVAL_START", "key": "init_start", "value": null, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 327}}
52
+ :::MLLOG {"namespace": "", "time_ms": 1755471012112, "event_type": "INTERVAL_START", "key": "init_start", "value": null, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 327}}
53
+ :::MLLOG {"namespace": "", "time_ms": 1755471012112, "event_type": "POINT_IN_TIME", "key": "submission_benchmark", "value": "llama2_70b_lora", "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 328}}
54
+ :::MLLOG {"namespace": "", "time_ms": 1755471012112, "event_type": "POINT_IN_TIME", "key": "submission_benchmark", "value": "llama2_70b_lora", "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 328}}
55
+ :::MLLOG {"namespace": "", "time_ms": 1755471012112, "event_type": "POINT_IN_TIME", "key": "submission_org", "value": "SUBMISSION_ORG_PLACEHOLDER", "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 328}}
56
+ :::MLLOG {"namespace": "", "time_ms": 1755471012112, "event_type": "INTERVAL_START", "key": "init_start", "value": null, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 327}}
57
+ :::MLLOG {"namespace": "", "time_ms": 1755471012112, "event_type": "POINT_IN_TIME", "key": "submission_org", "value": "SUBMISSION_ORG_PLACEHOLDER", "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 328}}
58
+ :::MLLOG {"namespace": "", "time_ms": 1755471012112, "event_type": "POINT_IN_TIME", "key": "submission_division", "value": "closed", "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 328}}
59
+ :::MLLOG {"namespace": "", "time_ms": 1755471012112, "event_type": "POINT_IN_TIME", "key": "submission_division", "value": "closed", "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 328}}
60
+ :::MLLOG {"namespace": "", "time_ms": 1755471012112, "event_type": "POINT_IN_TIME", "key": "submission_status", "value": "onprem", "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 328}}
61
+ :::MLLOG {"namespace": "", "time_ms": 1755471012112, "event_type": "POINT_IN_TIME", "key": "submission_benchmark", "value": "llama2_70b_lora", "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 328}}
62
+ :::MLLOG {"namespace": "", "time_ms": 1755471012112, "event_type": "POINT_IN_TIME", "key": "submission_status", "value": "onprem", "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 328}}
63
+ :::MLLOG {"namespace": "", "time_ms": 1755471012112, "event_type": "POINT_IN_TIME", "key": "submission_platform", "value": "1xSUBMISSION_PLATFORM_PLACEHOLDER", "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 328}}
64
+ :::MLLOG {"namespace": "", "time_ms": 1755471012112, "event_type": "POINT_IN_TIME", "key": "submission_platform", "value": "1xSUBMISSION_PLATFORM_PLACEHOLDER", "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 328}}
65
+ :::MLLOG {"namespace": "", "time_ms": 1755471012112, "event_type": "POINT_IN_TIME", "key": "submission_org", "value": "SUBMISSION_ORG_PLACEHOLDER", "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 328}}
66
+ :::MLLOG {"namespace": "", "time_ms": 1755471012112, "event_type": "POINT_IN_TIME", "key": "submission_division", "value": "closed", "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 328}}
67
+ :::MLLOG {"namespace": "", "time_ms": 1755471012112, "event_type": "POINT_IN_TIME", "key": "submission_status", "value": "onprem", "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 328}}
68
+ :::MLLOG {"namespace": "", "time_ms": 1755471012112, "event_type": "POINT_IN_TIME", "key": "submission_platform", "value": "1xSUBMISSION_PLATFORM_PLACEHOLDER", "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 328}}
69
+ :::MLLOG {"namespace": "", "time_ms": 1755471012644, "event_type": "POINT_IN_TIME", "key": "opt_base_learning_rate", "value": 0.00055, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 162}}
70
+ :::MLLOG {"namespace": "", "time_ms": 1755471012644, "event_type": "POINT_IN_TIME", "key": "opt_adamw_weight_decay", "value": 0.0001, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 163}}
71
+ :::MLLOG {"namespace": "", "time_ms": 1755471012644, "event_type": "POINT_IN_TIME", "key": "opt_gradient_clip_norm", "value": 0.3, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 164}}
72
+ :::MLLOG {"namespace": "", "time_ms": 1755471012644, "event_type": "POINT_IN_TIME", "key": "opt_learning_rate_warmup_factor", "value": 0.0, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 189}}
73
+ :::MLLOG {"namespace": "", "time_ms": 1755471012645, "event_type": "POINT_IN_TIME", "key": "lora_rank", "value": 16, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 198}}
74
+ :::MLLOG {"namespace": "", "time_ms": 1755471012645, "event_type": "POINT_IN_TIME", "key": "lora_alpha", "value": 32, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 199}}
75
+ :::MLLOG {"namespace": "", "time_ms": 1755471012649, "event_type": "POINT_IN_TIME", "key": "opt_learning_rate_training_steps", "value": 800, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 350}}
76
+ :::MLLOG {"namespace": "", "time_ms": 1755471012666, "event_type": "POINT_IN_TIME", "key": "seed", "value": 1, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 393}}
77
+ :::MLLOG {"namespace": "", "time_ms": 1755471012684, "event_type": "POINT_IN_TIME", "key": "opt_base_learning_rate", "value": 0.00055, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 162}}
78
+ :::MLLOG {"namespace": "", "time_ms": 1755471012684, "event_type": "POINT_IN_TIME", "key": "opt_adamw_weight_decay", "value": 0.0001, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 163}}
79
+ :::MLLOG {"namespace": "", "time_ms": 1755471012684, "event_type": "POINT_IN_TIME", "key": "opt_gradient_clip_norm", "value": 0.3, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 164}}
80
+ :::MLLOG {"namespace": "", "time_ms": 1755471012685, "event_type": "POINT_IN_TIME", "key": "opt_learning_rate_warmup_factor", "value": 0.0, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 189}}
81
+ :::MLLOG {"namespace": "", "time_ms": 1755471012686, "event_type": "POINT_IN_TIME", "key": "lora_rank", "value": 16, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 198}}
82
+ :::MLLOG {"namespace": "", "time_ms": 1755471012686, "event_type": "POINT_IN_TIME", "key": "lora_alpha", "value": 32, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 199}}
83
+ :::MLLOG {"namespace": "", "time_ms": 1755471012689, "event_type": "POINT_IN_TIME", "key": "opt_learning_rate_training_steps", "value": 800, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 350}}
84
+ :::MLLOG {"namespace": "", "time_ms": 1755471012690, "event_type": "POINT_IN_TIME", "key": "opt_base_learning_rate", "value": 0.00055, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 162}}
85
+ :::MLLOG {"namespace": "", "time_ms": 1755471012690, "event_type": "POINT_IN_TIME", "key": "opt_adamw_weight_decay", "value": 0.0001, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 163}}
86
+ :::MLLOG {"namespace": "", "time_ms": 1755471012690, "event_type": "POINT_IN_TIME", "key": "opt_gradient_clip_norm", "value": 0.3, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 164}}
87
+ :::MLLOG {"namespace": "", "time_ms": 1755471012691, "event_type": "POINT_IN_TIME", "key": "opt_learning_rate_warmup_factor", "value": 0.0, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 189}}
88
+ :::MLLOG {"namespace": "", "time_ms": 1755471012692, "event_type": "POINT_IN_TIME", "key": "lora_rank", "value": 16, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 198}}
89
+ :::MLLOG {"namespace": "", "time_ms": 1755471012692, "event_type": "POINT_IN_TIME", "key": "lora_alpha", "value": 32, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 199}}
90
+ :::MLLOG {"namespace": "", "time_ms": 1755471012692, "event_type": "POINT_IN_TIME", "key": "opt_base_learning_rate", "value": 0.00055, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 162}}
91
+ :::MLLOG {"namespace": "", "time_ms": 1755471012692, "event_type": "POINT_IN_TIME", "key": "opt_adamw_weight_decay", "value": 0.0001, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 163}}
92
+ :::MLLOG {"namespace": "", "time_ms": 1755471012692, "event_type": "POINT_IN_TIME", "key": "opt_gradient_clip_norm", "value": 0.3, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 164}}
93
+ :::MLLOG {"namespace": "", "time_ms": 1755471012692, "event_type": "POINT_IN_TIME", "key": "opt_base_learning_rate", "value": 0.00055, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 162}}
94
+ :::MLLOG {"namespace": "", "time_ms": 1755471012692, "event_type": "POINT_IN_TIME", "key": "opt_adamw_weight_decay", "value": 0.0001, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 163}}
95
+ :::MLLOG {"namespace": "", "time_ms": 1755471012692, "event_type": "POINT_IN_TIME", "key": "opt_learning_rate_warmup_factor", "value": 0.0, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 189}}
96
+ :::MLLOG {"namespace": "", "time_ms": 1755471012693, "event_type": "POINT_IN_TIME", "key": "opt_gradient_clip_norm", "value": 0.3, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 164}}
97
+ :::MLLOG {"namespace": "", "time_ms": 1755471012693, "event_type": "POINT_IN_TIME", "key": "opt_learning_rate_warmup_factor", "value": 0.0, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 189}}
98
+ :::MLLOG {"namespace": "", "time_ms": 1755471012693, "event_type": "POINT_IN_TIME", "key": "lora_rank", "value": 16, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 198}}
99
+ :::MLLOG {"namespace": "", "time_ms": 1755471012694, "event_type": "POINT_IN_TIME", "key": "lora_alpha", "value": 32, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 199}}
100
+ :::MLLOG {"namespace": "", "time_ms": 1755471012694, "event_type": "POINT_IN_TIME", "key": "lora_rank", "value": 16, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 198}}
101
+ :::MLLOG {"namespace": "", "time_ms": 1755471012694, "event_type": "POINT_IN_TIME", "key": "lora_alpha", "value": 32, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 199}}
102
+ :::MLLOG {"namespace": "", "time_ms": 1755471012695, "event_type": "POINT_IN_TIME", "key": "opt_learning_rate_training_steps", "value": 800, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 350}}
103
+ :::MLLOG {"namespace": "", "time_ms": 1755471012697, "event_type": "POINT_IN_TIME", "key": "opt_learning_rate_training_steps", "value": 800, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 350}}
104
+ :::MLLOG {"namespace": "", "time_ms": 1755471012697, "event_type": "POINT_IN_TIME", "key": "opt_learning_rate_training_steps", "value": 800, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 350}}
105
+ :::MLLOG {"namespace": "", "time_ms": 1755471012699, "event_type": "POINT_IN_TIME", "key": "opt_base_learning_rate", "value": 0.00055, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 162}}
106
+ :::MLLOG {"namespace": "", "time_ms": 1755471012699, "event_type": "POINT_IN_TIME", "key": "opt_adamw_weight_decay", "value": 0.0001, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 163}}
107
+ :::MLLOG {"namespace": "", "time_ms": 1755471012699, "event_type": "POINT_IN_TIME", "key": "opt_gradient_clip_norm", "value": 0.3, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 164}}
108
+ :::MLLOG {"namespace": "", "time_ms": 1755471012700, "event_type": "POINT_IN_TIME", "key": "opt_learning_rate_warmup_factor", "value": 0.0, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 189}}
109
+ :::MLLOG {"namespace": "", "time_ms": 1755471012701, "event_type": "POINT_IN_TIME", "key": "lora_rank", "value": 16, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 198}}
110
+ :::MLLOG {"namespace": "", "time_ms": 1755471012701, "event_type": "POINT_IN_TIME", "key": "lora_alpha", "value": 32, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 199}}
111
+ :::MLLOG {"namespace": "", "time_ms": 1755471012703, "event_type": "POINT_IN_TIME", "key": "seed", "value": 1, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 393}}
112
+ :::MLLOG {"namespace": "", "time_ms": 1755471012704, "event_type": "POINT_IN_TIME", "key": "opt_learning_rate_training_steps", "value": 800, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 350}}
113
+ :::MLLOG {"namespace": "", "time_ms": 1755471012712, "event_type": "POINT_IN_TIME", "key": "seed", "value": 1, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 393}}
114
+ :::MLLOG {"namespace": "", "time_ms": 1755471012712, "event_type": "POINT_IN_TIME", "key": "opt_base_learning_rate", "value": 0.00055, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 162}}
115
+ :::MLLOG {"namespace": "", "time_ms": 1755471012712, "event_type": "POINT_IN_TIME", "key": "opt_adamw_weight_decay", "value": 0.0001, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 163}}
116
+ :::MLLOG {"namespace": "", "time_ms": 1755471012712, "event_type": "POINT_IN_TIME", "key": "opt_gradient_clip_norm", "value": 0.3, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 164}}
117
+ :::MLLOG {"namespace": "", "time_ms": 1755471012712, "event_type": "POINT_IN_TIME", "key": "seed", "value": 1, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 393}}
118
+ :::MLLOG {"namespace": "", "time_ms": 1755471012712, "event_type": "POINT_IN_TIME", "key": "opt_learning_rate_warmup_factor", "value": 0.0, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 189}}
119
+ :::MLLOG {"namespace": "", "time_ms": 1755471012713, "event_type": "POINT_IN_TIME", "key": "lora_rank", "value": 16, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 198}}
120
+ :::MLLOG {"namespace": "", "time_ms": 1755471012714, "event_type": "POINT_IN_TIME", "key": "lora_alpha", "value": 32, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 199}}
121
+ :::MLLOG {"namespace": "", "time_ms": 1755471012718, "event_type": "POINT_IN_TIME", "key": "seed", "value": 1, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 393}}
122
+ :::MLLOG {"namespace": "", "time_ms": 1755471012719, "event_type": "POINT_IN_TIME", "key": "opt_learning_rate_training_steps", "value": 800, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 350}}
123
+ :::MLLOG {"namespace": "", "time_ms": 1755471012726, "event_type": "POINT_IN_TIME", "key": "opt_base_learning_rate", "value": 0.00055, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 162}}
124
+ :::MLLOG {"namespace": "", "time_ms": 1755471012726, "event_type": "POINT_IN_TIME", "key": "opt_adamw_weight_decay", "value": 0.0001, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 163}}
125
+ :::MLLOG {"namespace": "", "time_ms": 1755471012726, "event_type": "POINT_IN_TIME", "key": "opt_gradient_clip_norm", "value": 0.3, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 164}}
126
+ :::MLLOG {"namespace": "", "time_ms": 1755471012727, "event_type": "POINT_IN_TIME", "key": "opt_learning_rate_warmup_factor", "value": 0.0, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 189}}
127
+ :::MLLOG {"namespace": "", "time_ms": 1755471012728, "event_type": "POINT_IN_TIME", "key": "lora_rank", "value": 16, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 198}}
128
+ :::MLLOG {"namespace": "", "time_ms": 1755471012728, "event_type": "POINT_IN_TIME", "key": "lora_alpha", "value": 32, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 199}}
129
+ :::MLLOG {"namespace": "", "time_ms": 1755471012731, "event_type": "POINT_IN_TIME", "key": "opt_learning_rate_training_steps", "value": 800, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 350}}
130
+ :::MLLOG {"namespace": "", "time_ms": 1755471012734, "event_type": "POINT_IN_TIME", "key": "seed", "value": 1, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 393}}
131
+ :::MLLOG {"namespace": "", "time_ms": 1755471012745, "event_type": "POINT_IN_TIME", "key": "seed", "value": 1, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 393}}
132
+ :::MLLOG {"namespace": "", "time_ms": 1755471013232, "event_type": "POINT_IN_TIME", "key": "global_batch_size", "value": 8, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 328}}
133
+ :::MLLOG {"namespace": "", "time_ms": 1755471013270, "event_type": "POINT_IN_TIME", "key": "train_samples", "value": 3901, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 328}}
134
+ :::MLLOG {"namespace": "", "time_ms": 1755471013270, "event_type": "POINT_IN_TIME", "key": "eval_samples", "value": 173, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 328}}
135
+ :::MLLOG {"namespace": "", "time_ms": 1755471013271, "event_type": "POINT_IN_TIME", "key": "gradient_accumulation_steps", "value": 1, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 328}}
136
+ :::MLLOG {"namespace": "", "time_ms": 1755471013272, "event_type": "POINT_IN_TIME", "key": "seed", "value": 1, "metadata": {"file": "/workspace/ft-llm/train.py", "lineno": 393}}
137
+ :::MLLOG {"namespace": "", "time_ms": 1755471019434, "event_type": "POINT_IN_TIME", "key": "tracked_stats", "value": {"before_model_init": 6.738934484000083}, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 168, "step": 0}}
138
+ :::MLLOG {"namespace": "", "time_ms": 1755471019889, "event_type": "POINT_IN_TIME", "key": "tracked_stats", "value": {"after_model_init": 0.4552682889998323}, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 168, "step": 0}}
139
+ Loading distributed checkpoint with TensorStoreLoadShardedStrategy
140
+ :::MLLOG {"namespace": "", "time_ms": 1755471283851, "event_type": "POINT_IN_TIME", "key": "tracked_stats", "value": {"warmup_time": 263.961781042}, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 168, "step": 0}}
141
+ :::MLLOG {"namespace": "", "time_ms": 1755471283851, "event_type": "POINT_IN_TIME", "key": "tracked_stats", "value": {"init_finished": 0.00043348400004106225}, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 168, "step": 0}}
142
+ :::MLLOG {"namespace": "", "time_ms": 1755471283852, "event_type": "INTERVAL_END", "key": "init_stop", "value": null, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 83}}
143
+ :::MLLOG {"namespace": "", "time_ms": 1755471283853, "event_type": "INTERVAL_START", "key": "run_start", "value": null, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 83}}
144
+ :::MLLOG {"namespace": "", "time_ms": 1755471283854, "event_type": "INTERVAL_START", "key": "block_start", "value": null, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 199, "samples_count": 0}}
145
+ :::MLLOG {"namespace": "", "time_ms": 1755471298842, "event_type": "POINT_IN_TIME", "key": "train_loss", "value": 2.231356143951416, "metadata": {"file": "/workspace/ft-llm/custom_callbacks.py", "lineno": 71, "samples_count": 80, "lr": 0.0005497879849661988}}
146
+ :::MLLOG {"namespace": "", "time_ms": 1755471314118, "event_type": "POINT_IN_TIME", "key": "train_loss", "value": 1.523491382598877, "metadata": {"file": "/workspace/ft-llm/custom_callbacks.py", "lineno": 71, "samples_count": 160, "lr": 0.0005491522667766103}}
147
+ :::MLLOG {"namespace": "", "time_ms": 1755471329437, "event_type": "POINT_IN_TIME", "key": "train_loss", "value": 1.3142896890640259, "metadata": {"file": "/workspace/ft-llm/custom_callbacks.py", "lineno": 71, "samples_count": 240, "lr": 0.0005480938256626048}}
148
+ :::MLLOG {"namespace": "", "time_ms": 1755471344748, "event_type": "POINT_IN_TIME", "key": "train_loss", "value": 1.3286864757537842, "metadata": {"file": "/workspace/ft-llm/custom_callbacks.py", "lineno": 71, "samples_count": 320, "lr": 0.0005466142936636629}}
149
+ :::MLLOG {"namespace": "", "time_ms": 1755471360056, "event_type": "POINT_IN_TIME", "key": "train_loss", "value": 1.3191031217575073, "metadata": {"file": "/workspace/ft-llm/custom_callbacks.py", "lineno": 71, "samples_count": 400, "lr": 0.0005447159521108884}}
150
+ :::MLLOG {"namespace": "", "time_ms": 1755471375386, "event_type": "POINT_IN_TIME", "key": "train_loss", "value": 1.3337409496307373, "metadata": {"file": "/workspace/ft-llm/custom_callbacks.py", "lineno": 71, "samples_count": 480, "lr": 0.0005424017281093611}}
151
+ :::MLLOG {"namespace": "", "time_ms": 1755471390691, "event_type": "POINT_IN_TIME", "key": "train_loss", "value": 1.3527849912643433, "metadata": {"file": "/workspace/ft-llm/custom_callbacks.py", "lineno": 71, "samples_count": 560, "lr": 0.000539675190024753}}
152
+ :::MLLOG {"namespace": "", "time_ms": 1755471405987, "event_type": "POINT_IN_TIME", "key": "train_loss", "value": 1.3663201332092285, "metadata": {"file": "/workspace/ft-llm/custom_callbacks.py", "lineno": 71, "samples_count": 640, "lr": 0.0005365405419811673}}
153
+ :::MLLOG {"namespace": "", "time_ms": 1755471421321, "event_type": "POINT_IN_TIME", "key": "train_loss", "value": 1.436583161354065, "metadata": {"file": "/workspace/ft-llm/custom_callbacks.py", "lineno": 71, "samples_count": 720, "lr": 0.0005330026173786832}}
154
+ :::MLLOG {"namespace": "", "time_ms": 1755471436648, "event_type": "POINT_IN_TIME", "key": "train_loss", "value": 1.2180163860321045, "metadata": {"file": "/workspace/ft-llm/custom_callbacks.py", "lineno": 71, "samples_count": 800, "lr": 0.0005290668714406038}}
155
+ :::MLLOG {"namespace": "", "time_ms": 1755471451982, "event_type": "POINT_IN_TIME", "key": "train_loss", "value": 1.3267669677734375, "metadata": {"file": "/workspace/ft-llm/custom_callbacks.py", "lineno": 71, "samples_count": 880, "lr": 0.0005247393728018974}}
156
+ :::MLLOG {"namespace": "", "time_ms": 1755471467318, "event_type": "POINT_IN_TIME", "key": "train_loss", "value": 1.3912932872772217, "metadata": {"file": "/workspace/ft-llm/custom_callbacks.py", "lineno": 71, "samples_count": 960, "lr": 0.0005200267941518012}}
157
+ :::MLLOG {"namespace": "", "time_ms": 1755471482641, "event_type": "POINT_IN_TIME", "key": "train_loss", "value": 1.320623517036438, "metadata": {"file": "/workspace/ft-llm/custom_callbacks.py", "lineno": 71, "samples_count": 1040, "lr": 0.0005149364019450193}}
158
+ :::MLLOG {"namespace": "", "time_ms": 1755471497959, "event_type": "POINT_IN_TIME", "key": "train_loss", "value": 1.2857550382614136, "metadata": {"file": "/workspace/ft-llm/custom_callbacks.py", "lineno": 71, "samples_count": 1120, "lr": 0.0005094760451973754}}
159
+ :::MLLOG {"namespace": "", "time_ms": 1755471513310, "event_type": "POINT_IN_TIME", "key": "train_loss", "value": 1.356960415840149, "metadata": {"file": "/workspace/ft-llm/custom_callbacks.py", "lineno": 71, "samples_count": 1200, "lr": 0.0005036541433832}}
160
+ :::MLLOG {"namespace": "", "time_ms": 1755471528663, "event_type": "POINT_IN_TIME", "key": "train_loss", "value": 1.325950026512146, "metadata": {"file": "/workspace/ft-llm/custom_callbacks.py", "lineno": 71, "samples_count": 1280, "lr": 0.0004974796734531106}}
161
+ :::MLLOG {"namespace": "", "time_ms": 1755471544018, "event_type": "POINT_IN_TIME", "key": "train_loss", "value": 1.3832132816314697, "metadata": {"file": "/workspace/ft-llm/custom_callbacks.py", "lineno": 71, "samples_count": 1360, "lr": 0.0004909621559922049}}
162
+ :::MLLOG {"namespace": "", "time_ms": 1755471559347, "event_type": "POINT_IN_TIME", "key": "train_loss", "value": 1.2853686809539795, "metadata": {"file": "/workspace/ft-llm/custom_callbacks.py", "lineno": 71, "samples_count": 1440, "lr": 0.0004841116405400086}}
163
+ :::MLLOG {"namespace": "", "time_ms": 1755471574682, "event_type": "POINT_IN_TIME", "key": "train_loss", "value": 1.3469711542129517, "metadata": {"file": "/workspace/ft-llm/custom_callbacks.py", "lineno": 71, "samples_count": 1520, "lr": 0.00047693869009481353}}
164
+ :::MLLOG {"namespace": "", "time_ms": 1755471578084, "event_type": "POINT_IN_TIME", "key": "tracked_stats", "value": {"throughput": 5.220374784959472, "train_step_time": 1.5324570226354173, "max_memory_usage": 168.827}, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 225, "step": 0}}
165
+ :::MLLOG {"namespace": "", "time_ms": 1755471578084, "event_type": "INTERVAL_END", "key": "block_stop", "value": null, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 208, "samples_count": 0}}
166
+ :::MLLOG {"namespace": "", "time_ms": 1755471578084, "event_type": "INTERVAL_START", "key": "eval_start", "value": null, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 130, "samples_count": 1536}}
167
+ :::MLLOG {"namespace": "", "time_ms": 1755471592941, "event_type": "POINT_IN_TIME", "key": "eval_accuracy", "value": 0.9386916849654534, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 303, "samples_count": 1536}}
168
+ :::MLLOG {"namespace": "", "time_ms": 1755471592942, "event_type": "POINT_IN_TIME", "key": "tracked_stats", "value": {"validation_throughput": 11.84517688090803}, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 245, "step": 1536}}
169
+ :::MLLOG {"namespace": "", "time_ms": 1755471592942, "event_type": "INTERVAL_END", "key": "eval_stop", "value": null, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 153, "samples_count": 1536}}
170
+ :::MLLOG {"namespace": "", "time_ms": 1755471592942, "event_type": "INTERVAL_START", "key": "block_start", "value": null, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 199, "samples_count": 1536}}
171
+ :::MLLOG {"namespace": "", "time_ms": 1755471605223, "event_type": "POINT_IN_TIME", "key": "train_loss", "value": 1.2639729976654053, "metadata": {"file": "/workspace/ft-llm/custom_callbacks.py", "lineno": 71, "samples_count": 1600, "lr": 0.0004694543648263006}}
172
+ :::MLLOG {"namespace": "", "time_ms": 1755471620533, "event_type": "POINT_IN_TIME", "key": "train_loss", "value": 1.359060525894165, "metadata": {"file": "/workspace/ft-llm/custom_callbacks.py", "lineno": 71, "samples_count": 1680, "lr": 0.00046167020502155905}}
173
+ :::MLLOG {"namespace": "", "time_ms": 1755471635863, "event_type": "POINT_IN_TIME", "key": "train_loss", "value": 1.3872255086898804, "metadata": {"file": "/workspace/ft-llm/custom_callbacks.py", "lineno": 71, "samples_count": 1760, "lr": 0.00045359821329080054}}
174
+ :::MLLOG {"namespace": "", "time_ms": 1755471651197, "event_type": "POINT_IN_TIME", "key": "train_loss", "value": 1.2579522132873535, "metadata": {"file": "/workspace/ft-llm/custom_callbacks.py", "lineno": 71, "samples_count": 1840, "lr": 0.00044525083606020437}}
175
+ :::MLLOG {"namespace": "", "time_ms": 1755471666501, "event_type": "POINT_IN_TIME", "key": "train_loss", "value": 1.2951648235321045, "metadata": {"file": "/workspace/ft-llm/custom_callbacks.py", "lineno": 71, "samples_count": 1920, "lr": 0.0004366409443804302}}
176
+ :::MLLOG {"namespace": "", "time_ms": 1755471666520, "event_type": "POINT_IN_TIME", "key": "tracked_stats", "value": {"throughput": 5.218929434605367, "train_step_time": 1.5328814271666669, "max_memory_usage": 168.827}, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 225, "step": 1536}}
177
+ :::MLLOG {"namespace": "", "time_ms": 1755471666520, "event_type": "INTERVAL_END", "key": "block_stop", "value": null, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 208, "samples_count": 1536}}
178
+ :::MLLOG {"namespace": "", "time_ms": 1755471666520, "event_type": "INTERVAL_START", "key": "eval_start", "value": null, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 130, "samples_count": 1920}}
179
+ :::MLLOG {"namespace": "", "time_ms": 1755471681322, "event_type": "POINT_IN_TIME", "key": "eval_accuracy", "value": 0.9341272784106304, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 303, "samples_count": 1920}}
180
+ :::MLLOG {"namespace": "", "time_ms": 1755471681322, "event_type": "POINT_IN_TIME", "key": "tracked_stats", "value": {"validation_throughput": 11.889916819247087}, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 245, "step": 1920}}
181
+ :::MLLOG {"namespace": "", "time_ms": 1755471681322, "event_type": "INTERVAL_END", "key": "eval_stop", "value": null, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 153, "samples_count": 1920}}
182
+ :::MLLOG {"namespace": "", "time_ms": 1755471681323, "event_type": "INTERVAL_START", "key": "block_start", "value": null, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 199, "samples_count": 1920}}
183
+ :::MLLOG {"namespace": "", "time_ms": 1755471696647, "event_type": "POINT_IN_TIME", "key": "train_loss", "value": 1.311585783958435, "metadata": {"file": "/workspace/ft-llm/custom_callbacks.py", "lineno": 71, "samples_count": 2000, "lr": 0.0004277818140803907}}
184
+ :::MLLOG {"namespace": "", "time_ms": 1755471711976, "event_type": "POINT_IN_TIME", "key": "train_loss", "value": 1.3409117460250854, "metadata": {"file": "/workspace/ft-llm/custom_callbacks.py", "lineno": 71, "samples_count": 2080, "lr": 0.00041868710529688595}}
185
+ :::MLLOG {"namespace": "", "time_ms": 1755471727298, "event_type": "POINT_IN_TIME", "key": "train_loss", "value": 1.3523324728012085, "metadata": {"file": "/workspace/ft-llm/custom_callbacks.py", "lineno": 71, "samples_count": 2160, "lr": 0.00040937084141166267}}
186
+ :::MLLOG {"namespace": "", "time_ms": 1755471742649, "event_type": "POINT_IN_TIME", "key": "train_loss", "value": 1.2437388896942139, "metadata": {"file": "/workspace/ft-llm/custom_callbacks.py", "lineno": 71, "samples_count": 2240, "lr": 0.0003998473874283754}}
187
+ :::MLLOG {"namespace": "", "time_ms": 1755471754935, "event_type": "POINT_IN_TIME", "key": "tracked_stats", "value": {"throughput": 5.21650721148526, "train_step_time": 1.5335932024375012, "max_memory_usage": 168.827}, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 225, "step": 1920}}
188
+ :::MLLOG {"namespace": "", "time_ms": 1755471754935, "event_type": "INTERVAL_END", "key": "block_stop", "value": null, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 208, "samples_count": 1920}}
189
+ :::MLLOG {"namespace": "", "time_ms": 1755471754935, "event_type": "INTERVAL_START", "key": "eval_start", "value": null, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 130, "samples_count": 2304}}
190
+ :::MLLOG {"namespace": "", "time_ms": 1755471769543, "event_type": "POINT_IN_TIME", "key": "eval_accuracy", "value": 0.9300852869287392, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 303, "samples_count": 2304}}
191
+ :::MLLOG {"namespace": "", "time_ms": 1755471769543, "event_type": "POINT_IN_TIME", "key": "tracked_stats", "value": {"validation_throughput": 12.04804375654437}, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 245, "step": 2304}}
192
+ :::MLLOG {"namespace": "", "time_ms": 1755471769543, "event_type": "INTERVAL_END", "key": "eval_stop", "value": null, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 153, "samples_count": 2304}}
193
+ :::MLLOG {"namespace": "", "time_ms": 1755471769543, "event_type": "INTERVAL_START", "key": "block_start", "value": null, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 199, "samples_count": 2304}}
194
+ :::MLLOG {"namespace": "", "time_ms": 1755471772623, "event_type": "POINT_IN_TIME", "key": "train_loss", "value": 1.3073469400405884, "metadata": {"file": "/workspace/ft-llm/custom_callbacks.py", "lineno": 71, "samples_count": 2320, "lr": 0.00039013142782279276}}
195
+ :::MLLOG {"namespace": "", "time_ms": 1755471787980, "event_type": "POINT_IN_TIME", "key": "train_loss", "value": 1.260759711265564, "metadata": {"file": "/workspace/ft-llm/custom_callbacks.py", "lineno": 71, "samples_count": 2400, "lr": 0.00038023794390039975}}
196
+ :::MLLOG {"namespace": "", "time_ms": 1755471803328, "event_type": "POINT_IN_TIME", "key": "train_loss", "value": 1.2937121391296387, "metadata": {"file": "/workspace/ft-llm/custom_callbacks.py", "lineno": 71, "samples_count": 2480, "lr": 0.00037018219069631056}}
197
+ :::MLLOG {"namespace": "", "time_ms": 1755471818672, "event_type": "POINT_IN_TIME", "key": "train_loss", "value": 1.3011728525161743, "metadata": {"file": "/workspace/ft-llm/custom_callbacks.py", "lineno": 71, "samples_count": 2560, "lr": 0.00035997967345311057}}
198
+ :::MLLOG {"namespace": "", "time_ms": 1755471834026, "event_type": "POINT_IN_TIME", "key": "train_loss", "value": 1.3377065658569336, "metadata": {"file": "/workspace/ft-llm/custom_callbacks.py", "lineno": 71, "samples_count": 2640, "lr": 0.00034964612371289557}}
199
+ :::MLLOG {"namespace": "", "time_ms": 1755471843261, "event_type": "POINT_IN_TIME", "key": "tracked_stats", "value": {"throughput": 5.2090510322980235, "train_step_time": 1.5357883711250035, "max_memory_usage": 168.827}, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 225, "step": 2304}}
200
+ :::MLLOG {"namespace": "", "time_ms": 1755471843262, "event_type": "INTERVAL_END", "key": "block_stop", "value": null, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 208, "samples_count": 2304}}
201
+ :::MLLOG {"namespace": "", "time_ms": 1755471843262, "event_type": "INTERVAL_START", "key": "eval_start", "value": null, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 130, "samples_count": 2688}}
202
+ :::MLLOG {"namespace": "", "time_ms": 1755471857902, "event_type": "POINT_IN_TIME", "key": "eval_accuracy", "value": 0.9281238710260116, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 303, "samples_count": 2688}}
203
+ :::MLLOG {"namespace": "", "time_ms": 1755471857903, "event_type": "POINT_IN_TIME", "key": "tracked_stats", "value": {"validation_throughput": 12.02044647758139}, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 245, "step": 2688}}
204
+ :::MLLOG {"namespace": "", "time_ms": 1755471857903, "event_type": "INTERVAL_END", "key": "eval_stop", "value": null, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 153, "samples_count": 2688}}
205
+ :::MLLOG {"namespace": "", "time_ms": 1755471857903, "event_type": "INTERVAL_START", "key": "block_start", "value": null, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 199, "samples_count": 2688}}
206
+ :::MLLOG {"namespace": "", "time_ms": 1755471864050, "event_type": "POINT_IN_TIME", "key": "train_loss", "value": 1.3667250871658325, "metadata": {"file": "/workspace/ft-llm/custom_callbacks.py", "lineno": 71, "samples_count": 2720, "lr": 0.000339197475060374}}
207
+ :::MLLOG {"namespace": "", "time_ms": 1755471879394, "event_type": "POINT_IN_TIME", "key": "train_loss", "value": 1.3398393392562866, "metadata": {"file": "/workspace/ft-llm/custom_callbacks.py", "lineno": 71, "samples_count": 2800, "lr": 0.00032864983855443534}}
208
+ :::MLLOG {"namespace": "", "time_ms": 1755471894731, "event_type": "POINT_IN_TIME", "key": "train_loss", "value": 1.2728362083435059, "metadata": {"file": "/workspace/ft-llm/custom_callbacks.py", "lineno": 71, "samples_count": 2880, "lr": 0.0003180194778860635}}
209
+ :::MLLOG {"namespace": "", "time_ms": 1755471910087, "event_type": "POINT_IN_TIME", "key": "train_loss", "value": 1.2875945568084717, "metadata": {"file": "/workspace/ft-llm/custom_callbacks.py", "lineno": 71, "samples_count": 2960, "lr": 0.0003073227843009054}}
210
+ :::MLLOG {"namespace": "", "time_ms": 1755471925418, "event_type": "POINT_IN_TIME", "key": "train_loss", "value": 1.3862403631210327, "metadata": {"file": "/workspace/ft-llm/custom_callbacks.py", "lineno": 71, "samples_count": 3040, "lr": 0.0002965762513251574}}
211
+ :::MLLOG {"namespace": "", "time_ms": 1755471931575, "event_type": "POINT_IN_TIME", "key": "tracked_stats", "value": {"throughput": 5.212281417349078, "train_step_time": 1.5348365445833376, "max_memory_usage": 168.827}, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 225, "step": 2688}}
212
+ :::MLLOG {"namespace": "", "time_ms": 1755471931575, "event_type": "INTERVAL_END", "key": "block_stop", "value": null, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 208, "samples_count": 2688}}
213
+ :::MLLOG {"namespace": "", "time_ms": 1755471931575, "event_type": "INTERVAL_START", "key": "eval_start", "value": null, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 130, "samples_count": 3072}}
214
+ :::MLLOG {"namespace": "", "time_ms": 1755471946223, "event_type": "POINT_IN_TIME", "key": "eval_accuracy", "value": 0.9221087328960441, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 303, "samples_count": 3072}}
215
+ :::MLLOG {"namespace": "", "time_ms": 1755471946223, "event_type": "POINT_IN_TIME", "key": "tracked_stats", "value": {"validation_throughput": 12.014723364925022}, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 245, "step": 3072}}
216
+ :::MLLOG {"namespace": "", "time_ms": 1755471946224, "event_type": "INTERVAL_END", "key": "eval_stop", "value": null, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 153, "samples_count": 3072}}
217
+ :::MLLOG {"namespace": "", "time_ms": 1755471946230, "event_type": "INTERVAL_END", "key": "run_stop", "value": null, "metadata": {"file": "/usr/local/lib/python3.12/dist-packages/mlperf_common/callbacks/logging.py", "lineno": 106, "samples_count": 3072, "status": "success"}}
first_run_output/nemo_experiments/default/2025-08-17_22-50-13/nemo_log_globalrank-0_localrank-0.txt ADDED
File without changes
first_run_output/nemo_experiments/default/nemo_log_globalrank-1_localrank-1.txt ADDED
File without changes
first_run_output/nemo_experiments/default/nemo_log_globalrank-2_localrank-2.txt ADDED
File without changes
first_run_output/nemo_experiments/default/nemo_log_globalrank-3_localrank-3.txt ADDED
File without changes
first_run_output/nemo_experiments/default/nemo_log_globalrank-4_localrank-4.txt ADDED
File without changes
first_run_output/nemo_experiments/default/nemo_log_globalrank-5_localrank-5.txt ADDED
File without changes
first_run_output/nemo_experiments/default/nemo_log_globalrank-6_localrank-6.txt ADDED
File without changes
first_run_output/nemo_experiments/default/nemo_log_globalrank-7_localrank-7.txt ADDED
File without changes
first_run_output/outputs/2025-08-17/22-50-12/.hydra/config.yaml ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ skip_evals: ${oc.decode:${oc.env:SKIP_EVAL,${floor:${add:${multiply:0.125,${model.global_batch_size}},2}}}}
2
+ load_ckpt: ${oc.decode:${oc.env:LOAD_CKPT,False}}
3
+ data_root: ${oc.decode:${oc.env:DATA_ROOT,/data}}
4
+ ckpt_root: ${oc.decode:${oc.env:CKPT_ROOT,/ckpt}}
5
+ trainer:
6
+ devices: ${oc.decode:${oc.env:DGXNGPU,8}}
7
+ num_nodes: ${oc.decode:${oc.env:DGXNNODES,1}}
8
+ max_steps: ${oc.decode:${oc.env:MAX_STEPS,null}}
9
+ val_check_interval: ${floor_div:${multiply:${add:${skip_evals},1},${oc.decode:${oc.env:VAL_CHECK_INTERVAL,384}}},${model.global_batch_size}}
10
+ limit_val_batches: ${oc.decode:${oc.env:LIMIT_VAL_BATCHES,1.0}}
11
+ model:
12
+ num_layers: ${oc.decode:${oc.env:OVERWRITTEN_NUM_LAYERS,80}}
13
+ seed: ${oc.decode:${oc.env:SEED,1}}
14
+ tensor_model_parallel_size: ${oc.decode:${oc.env:TP,1}}
15
+ pipeline_model_parallel_size: ${oc.decode:${oc.env:PP,1}}
16
+ context_parallel_size: ${oc.decode:${oc.env:CP,1}}
17
+ eval_cp: ${oc.decode:${oc.env:CP_EVAL,null}}
18
+ global_batch_size: ${floor_div:${multiply:${oc.decode:${oc.env:MINIBS,1}},${floor_div:${multiply:${trainer.devices},${trainer.num_nodes}},${multiply:${model.tensor_model_parallel_size},${model.pipeline_model_parallel_size}}}},${oc.decode:${oc.env:CP,1}}}
19
+ micro_batch_size: ${oc.decode:${oc.env:MBS,1}}
20
+ val_micro_batch_size: ${oc.decode:${oc.env:VAL_MBS,null}}
21
+ val_global_batch_size: ${floor_div:${multiply:${oc.decode:${oc.env:VAL_MBS,1}},${floor_div:${multiply:${trainer.devices},${trainer.num_nodes}},${multiply:${model.tensor_model_parallel_size},${model.pipeline_model_parallel_size}}}},${oc.decode:${oc.env:CP_EVAL,${oc.env:CP,1}}}}
22
+ max_position_embeddings: ${oc.decode:${oc.env:MAX_SEQLEN,8192}}
23
+ encoder_seq_length: ${oc.decode:${oc.env:MAX_SEQLEN,8192}}
24
+ sequence_parallel: ${oc.decode:${oc.env:SP,False}}
25
+ ub_tp_comm_overlap: ${oc.decode:${oc.env:TP_COMM_OVERLAP,False}}
26
+ fp8: ${oc.decode:${oc.env:FP8,True}}
27
+ fp8_params: ${oc.decode:${oc.env:FP8,True}}
28
+ fp8_hybrid: ${oc.decode:${oc.env:FP8_HYBRID,True}}
29
+ fp8_amax_history_len: ${oc.decode:${oc.env:FP8_AMAX_HISTORY,128}}
30
+ fp8_amax_compute_algo: ${oc.env:FP8_AMAX_ALGO,most_recent}
31
+ reduce_amax: ${oc.decode:${oc.env:FP8_REDUCE_AMAX,False}}
32
+ fp8_e4m3: ${oc.decode:${oc.env:FP8_E4M3,False}}
33
+ fp8_interval: ${oc.decode:${oc.env:FP8_INTERVAL,1}}
34
+ fp8_margin: ${oc.decode:${oc.env:FP8_MARGIN,0}}
35
+ fp8_dot_product_attention: ${oc.decode:${oc.env:FP8_DPA,0}}
36
+ cp_comm_type: ${oc.decode:${oc.env:CP_COMM_TYPE,'a2a'}}
37
+ activation_func_fp8_input_store: ${oc.decode:${oc.env:FP8_ACT,0}}
38
+ external_cuda_graph: ${oc.decode:${oc.env:LAYER_CUDA_GRAPH,False}}
39
+ enable_cuda_graph: ${oc.decode:${oc.env:MCORE_CUDA_GRAPH,False}}
40
+ use_te_rng_tracker: ${oc.decode:${oc.env:USE_TE_RNG_TRACKER,True}}
41
+ enable_cg_fp8_weight_caching: ${oc.decode:${oc.env:CG_WEIGHT_CACHING,True}}
42
+ cpu_offloading: ${oc.decode:${oc.env:CPU_OFFLOADING,False}}
43
+ cpu_offloading_num_layers: ${oc.decode:${oc.env:CPU_OFFLOADING_NUM_LAYERS,20}}
44
+ cpu_offloading_activations: true
45
+ cpu_offloading_weights: false
46
+ memory_profile:
47
+ enabled: ${oc.decode:${oc.env:MEMORY_PROFILE,False}}
48
+ start_step: 1
49
+ end_step: 4
50
+ rank: 0
51
+ output_path: /results/
52
+ custom:
53
+ warmup: ${oc.decode:${oc.env:WARMUP,False}}
54
+ warmup_train_steps: ${oc.decode:${oc.env:WARMUP_TRAIN_STEPS,5}}
55
+ warmup_validation_steps: ${oc.decode:${oc.env:WARMUP_VALIDATION_STEPS,5}}
56
+ reset_fp8_stats_after_warmup: ${oc.decode:${oc.env:RESET_FP8_STATS_AFTER_WARMUP,1}}
57
+ optim:
58
+ lr: ${oc.decode:${oc.env:LR,0.0004}}
59
+ use_distributed_optimizer: ${oc.decode:${oc.env:USE_DISTRIBUTED_OPTIMIZER,True}}
60
+ overlap_param_gather_with_optimizer_step: ${oc.decode:${oc.env:OVERLAP_PARAM_GATHER_WITH_OPTIMIZER_STEP,False}}
61
+ sched:
62
+ warmup_steps: ${oc.decode:${oc.env:WARMUP_STEPS,0}}
63
+ ddp:
64
+ overlap_grad_reduce: ${oc.decode:${oc.env:DDP_OVERLAP_GRAD_REDUCE,False}}
65
+ overlap_param_gather: ${oc.decode:${oc.env:DDP_OVERLAP_PARAM_GATHER,False}}
66
+ fp8_param_gather: ${oc.decode:${oc.env:DDP_FP8_PARAM_GATHER,False}}
67
+ average_in_collective: ${oc.decode:${oc.env:DDP_AVERAGE_IN_COLLECTIVE,False}}
first_run_output/outputs/2025-08-17/22-50-12/.hydra/hydra.yaml ADDED
@@ -0,0 +1,155 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ hydra:
2
+ run:
3
+ dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S}
4
+ sweep:
5
+ dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S}
6
+ subdir: ${hydra.job.num}
7
+ launcher:
8
+ _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
9
+ sweeper:
10
+ _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
11
+ max_batch_size: null
12
+ params: null
13
+ help:
14
+ app_name: ${hydra.job.name}
15
+ header: '${hydra.help.app_name} is powered by Hydra.
16
+
17
+ '
18
+ footer: 'Powered by Hydra (https://hydra.cc)
19
+
20
+ Use --hydra-help to view Hydra specific help
21
+
22
+ '
23
+ template: '${hydra.help.header}
24
+
25
+ == Configuration groups ==
26
+
27
+ Compose your configuration from those groups (group=option)
28
+
29
+
30
+ $APP_CONFIG_GROUPS
31
+
32
+
33
+ == Config ==
34
+
35
+ Override anything in the config (foo.bar=value)
36
+
37
+
38
+ $CONFIG
39
+
40
+
41
+ ${hydra.help.footer}
42
+
43
+ '
44
+ hydra_help:
45
+ template: 'Hydra (${hydra.runtime.version})
46
+
47
+ See https://hydra.cc for more info.
48
+
49
+
50
+ == Flags ==
51
+
52
+ $FLAGS_HELP
53
+
54
+
55
+ == Configuration groups ==
56
+
57
+ Compose your configuration from those groups (For example, append hydra/job_logging=disabled
58
+ to command line)
59
+
60
+
61
+ $HYDRA_CONFIG_GROUPS
62
+
63
+
64
+ Use ''--cfg hydra'' to Show the Hydra config.
65
+
66
+ '
67
+ hydra_help: ???
68
+ hydra_logging:
69
+ version: 1
70
+ formatters:
71
+ simple:
72
+ format: '[%(asctime)s][HYDRA] %(message)s'
73
+ handlers:
74
+ console:
75
+ class: logging.StreamHandler
76
+ formatter: simple
77
+ stream: ext://sys.stdout
78
+ root:
79
+ level: INFO
80
+ handlers:
81
+ - console
82
+ loggers:
83
+ logging_example:
84
+ level: DEBUG
85
+ disable_existing_loggers: false
86
+ job_logging:
87
+ version: 1
88
+ formatters:
89
+ simple:
90
+ format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
91
+ handlers:
92
+ console:
93
+ class: logging.StreamHandler
94
+ formatter: simple
95
+ stream: ext://sys.stdout
96
+ file:
97
+ class: logging.FileHandler
98
+ formatter: simple
99
+ filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log
100
+ root:
101
+ level: INFO
102
+ handlers:
103
+ - console
104
+ - file
105
+ disable_existing_loggers: false
106
+ env: {}
107
+ mode: RUN
108
+ searchpath: []
109
+ callbacks: {}
110
+ output_subdir: .hydra
111
+ overrides:
112
+ hydra:
113
+ - hydra.mode=RUN
114
+ task: []
115
+ job:
116
+ name: train
117
+ chdir: null
118
+ override_dirname: ''
119
+ id: ???
120
+ num: ???
121
+ config_name: megatron_gpt_peft_tuning_config
122
+ env_set: {}
123
+ env_copy: []
124
+ config:
125
+ override_dirname:
126
+ kv_sep: '='
127
+ item_sep: ','
128
+ exclude_keys: []
129
+ runtime:
130
+ version: 1.3.2
131
+ version_base: '1.3'
132
+ cwd: /workspace/ft-llm
133
+ config_sources:
134
+ - path: hydra.conf
135
+ schema: pkg
136
+ provider: hydra
137
+ - path: /workspace/ft-llm/conf
138
+ schema: file
139
+ provider: main
140
+ - path: ''
141
+ schema: structured
142
+ provider: schema
143
+ output_dir: /workspace/ft-llm/outputs/2025-08-17/22-50-12
144
+ choices:
145
+ tp_overlap@model.ub_tp_comm_overlap_cfg: b100tp1mbs1
146
+ hydra/env: default
147
+ hydra/callbacks: null
148
+ hydra/job_logging: default
149
+ hydra/hydra_logging: default
150
+ hydra/hydra_help: default
151
+ hydra/help: default
152
+ hydra/sweeper: basic
153
+ hydra/launcher: basic
154
+ hydra/output: default
155
+ verbose: false
first_run_output/outputs/2025-08-17/22-50-12/.hydra/overrides.yaml ADDED
@@ -0,0 +1 @@
 
 
1
+ []
first_run_output/outputs/2025-08-17/22-50-12/train.log ADDED
File without changes
first_run_output/run_docker.sh ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ docker run -d --gpus all -it --rm --network=host --ipc=host \
2
+ -v /home/shadeform/work/dev/mlperf/data:/data \
3
+ -v /home/shadeform/work/dev/mlperf/model:/ckpt \
4
+ -v /home/shadeform/work/dev/mlperf:/myworkspace \
5
+ --shm-size=16g \
6
+ --ulimit memlock=-1 --ulimit stack=67108864 \
7
+ vuiseng9/mlperf-nvidia:llama2_70b_lora-pyt_v5.0