switchoff commited on
Commit
aa2de13
·
verified ·
1 Parent(s): 63b9e03

Add files using upload-large-folder tool

Browse files
.gitattributes CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ wandb/wandb/run-20250922_220405-hrldy3bw/files/output.log filter=lfs diff=lfs merge=lfs -text
37
+ log_0.txt filter=lfs diff=lfs merge=lfs -text
38
+ wandb/wandb/run-20250922_220405-hrldy3bw/run-hrldy3bw.wandb filter=lfs diff=lfs merge=lfs -text
latest_checkpointed_iteration.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ 43200
latest_wandb_artifact_path.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ adamo1139-no/poziomka
log_0.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8549446fab3880bb4addaebfd74001956042f3d33c467cc48fa2821e2e63b9d5
3
+ size 16532379
pip_list.txt ADDED
@@ -0,0 +1,154 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Package Version
2
+ ------------------------ ----------------
3
+ aiohappyeyeballs 2.6.1
4
+ aiohttp 3.12.15
5
+ aiosignal 1.4.0
6
+ annotated-types 0.7.0
7
+ apex 0.1
8
+ async-timeout 5.0.1
9
+ attrs 21.2.0
10
+ Automat 20.2.0
11
+ Babel 2.8.0
12
+ bcrypt 3.2.0
13
+ bitsandbytes 0.47.0
14
+ blinker 1.4
15
+ certifi 2020.6.20
16
+ chardet 4.0.0
17
+ charset-normalizer 3.4.3
18
+ click 8.0.3
19
+ cloud-init 25.1.2
20
+ colorama 0.4.4
21
+ command-not-found 0.3
22
+ configobj 5.0.6
23
+ constantly 15.1.0
24
+ cryptography 3.4.8
25
+ datasets 4.1.1
26
+ dbus-python 1.2.18
27
+ dill 0.4.0
28
+ distro 1.7.0
29
+ distro-info 1.1+ubuntu0.2
30
+ einops 0.8.1
31
+ filelock 3.19.1
32
+ flash_attn_3 3.0.0b1
33
+ frozenlist 1.7.0
34
+ fsspec 2025.9.0
35
+ gitdb 4.0.12
36
+ GitPython 3.1.45
37
+ hf_transfer 0.1.9
38
+ hf-xet 1.1.10
39
+ httplib2 0.20.2
40
+ huggingface-hub 0.35.0
41
+ hyperlink 21.0.0
42
+ idna 3.3
43
+ importlib-metadata 4.6.4
44
+ incremental 21.3.0
45
+ jeepney 0.7.1
46
+ Jinja2 3.0.3
47
+ jsonpatch 1.32
48
+ jsonpointer 2.0
49
+ jsonschema 3.2.0
50
+ keyring 23.5.0
51
+ launchpadlib 1.10.16
52
+ lazr.restfulclient 0.14.4
53
+ lazr.uri 1.0.6
54
+ MarkupSafe 2.0.1
55
+ megatron-core 0.13.0
56
+ ml_dtypes 0.5.3
57
+ more-itertools 8.10.0
58
+ mpmath 1.3.0
59
+ multidict 6.6.4
60
+ multiprocess 0.70.16
61
+ netifaces 0.11.0
62
+ networkx 3.3
63
+ ninja 1.13.0
64
+ numpy 1.26.4
65
+ nvidia-cublas-cu12 12.9.1.4
66
+ nvidia-cuda-cupti-cu12 12.9.79
67
+ nvidia-cuda-nvrtc-cu12 12.9.86
68
+ nvidia-cuda-runtime-cu12 12.9.79
69
+ nvidia-cudnn-cu12 9.10.2.21
70
+ nvidia-cufft-cu12 11.4.1.4
71
+ nvidia-cufile-cu12 1.14.1.1
72
+ nvidia-curand-cu12 10.3.10.19
73
+ nvidia-cusolver-cu12 11.7.5.82
74
+ nvidia-cusparse-cu12 12.5.10.65
75
+ nvidia-cusparselt-cu12 0.7.1
76
+ nvidia-nccl-cu12 2.27.3
77
+ nvidia-nvjitlink-cu12 12.9.86
78
+ nvidia-nvtx-cu12 12.9.79
79
+ oauthlib 3.2.0
80
+ onnx 1.19.0
81
+ onnx-ir 0.1.9
82
+ onnxscript 0.3.1
83
+ packaging 24.2
84
+ pandas 2.3.2
85
+ pexpect 4.8.0
86
+ pillow 11.0.0
87
+ pip 25.2
88
+ platformdirs 4.4.0
89
+ propcache 0.3.2
90
+ protobuf 6.32.1
91
+ psutil 7.1.0
92
+ psutils 3.3.11
93
+ ptyprocess 0.7.0
94
+ puremagic 1.30
95
+ pyarrow 21.0.0
96
+ pyasn1 0.4.8
97
+ pyasn1-modules 0.2.1
98
+ pybind11 3.0.1
99
+ pydantic 2.11.9
100
+ pydantic_core 2.33.2
101
+ PyGObject 3.42.1
102
+ PyHamcrest 2.0.2
103
+ PyJWT 2.3.0
104
+ pyOpenSSL 21.0.0
105
+ pyparsing 2.4.7
106
+ pypdf 6.1.0
107
+ pyrsistent 0.18.1
108
+ pyserial 3.5
109
+ python-apt 2.4.0+ubuntu4
110
+ python-dateutil 2.9.0.post0
111
+ python-debian 0.1.43+ubuntu1.1
112
+ python-magic 0.4.24
113
+ pytz 2022.1
114
+ PyYAML 5.4.1
115
+ regex 2025.9.18
116
+ requests 2.32.5
117
+ safetensors 0.6.2
118
+ SecretStorage 3.3.1
119
+ sentry-sdk 2.38.0
120
+ service-identity 18.1.0
121
+ setuptools 80.9.0
122
+ six 1.16.0
123
+ smmap 5.0.2
124
+ sos 4.8.2
125
+ ssh-import-id 5.11
126
+ sympy 1.13.3
127
+ systemd-python 234
128
+ tiktoken 0.11.0
129
+ tokenizers 0.22.1
130
+ torch 2.8.0+cu129
131
+ torchvision 0.23.0+cu129
132
+ tqdm 4.67.1
133
+ transformer_engine 2.6.0.post1
134
+ transformer_engine_cu12 2.6.0.post1
135
+ transformer_engine_torch 2.6.0.post1
136
+ transformers 4.56.2
137
+ triton 3.4.0
138
+ Twisted 22.1.0
139
+ typing_extensions 4.15.0
140
+ typing-inspection 0.4.1
141
+ tzdata 2025.2
142
+ ubuntu-drivers-common 0.0.0
143
+ ubuntu-pro-client 8001
144
+ ufw 0.36.1
145
+ unattended-upgrades 0.1
146
+ urllib3 2.5.0
147
+ wadllib 1.3.6
148
+ wandb 0.22.0
149
+ wheel 0.45.1
150
+ xkit 0.0.0
151
+ xxhash 3.5.0
152
+ yarl 1.20.1
153
+ zipp 1.0.0
154
+ zope.interface 5.4.0
run_pretrain_poziomka_5.sh ADDED
@@ -0,0 +1,200 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
#!/bin/bash
# Pretraining launcher for the "poziomka_5" MoE GPT run on Megatron-LM core v0.13.0.
# Expects torchrun-style env vars (RANK, WORLD_SIZE, MASTER_ADDR, MASTER_PORT) when
# launched multi-node; falls back to single-node defaults when they are unset.
set -ex

MODEL_PATH="" # no checkpoint needed for from-scratch training (unused; kept for documentation)
JOB_DIR="poziomka_5"
DATA_PATH="szypulka_tokenized_apt4_merged/apt4_merged_text_document"
MEGATRON_PATH="Megatron-LM-core_v0.13.0"


mkdir -p "${JOB_DIR}"
CHECKPOINT_PATH=${JOB_DIR}
TENSORBOARD_LOGS_PATH=${JOB_DIR}/runs

# Rank 0 snapshots the launch script and the environment for reproducibility.
if [[ ${RANK:-0} -eq 0 ]]; then
  cp -r -- "$0" "${JOB_DIR}"
  pip list > "${JOB_DIR}/pip_list.txt"
  python -m torch.utils.collect_env > "${JOB_DIR}/collect_env.txt"
fi


GPUS_PER_NODE=$(nvidia-smi -L | wc -l)
WORLD_SIZE=${WORLD_SIZE:-1}
NODE_RANK=${RANK:-0}
MASTER_ADDR=${MASTER_ADDR:-127.0.0.1}
# BUGFIX: was `RANDOM_PORT=$[$RANDOM + 20000]` — `$[ ]` is deprecated bash
# arithmetic syntax; `$(( ))` is the supported equivalent.
RANDOM_PORT=$(( RANDOM + 20000 ))
MASTER_PORT=${MASTER_PORT:-$RANDOM_PORT}
GPU_NUM=$(( GPUS_PER_NODE * WORLD_SIZE ))  # total GPUs across all nodes (informational)
echo "---> from pytorch runtime, WORLD_SIZE: ${WORLD_SIZE}, NODE_RANK: ${NODE_RANK}, MASTER_ADDR: ${MASTER_ADDR}, MASTER_PORT: ${MASTER_PORT}"
LAUNCHER=" \
torchrun \
--nproc_per_node ${GPUS_PER_NODE} \
--nnodes ${WORLD_SIZE} \
--node_rank ${NODE_RANK} \
--master_addr ${MASTER_ADDR} \
--master_port ${MASTER_PORT} \
"

LOG_PATH="${JOB_DIR}/log_${NODE_RANK}.txt"

export OMP_NUM_THREADS=1
export CUDA_DEVICE_MAX_CONNECTIONS=1 # needed to keep at 1 as per https://github.com/NVIDIA/Megatron-LM/issues/533
export PYTORCH_CUDA_ALLOC_CONF="expandable_segments:True"
export NCCL_NVLS_ENABLE=0
export NCCL_CUMEM_ENABLE=0

export NVTE_FLASH_ATTN=1 # get that sweet FA3 boost
export NVTE_FUSED_ATTN=0
export NVTE_UNFUSED_ATTN=0

export NVTE_DEBUG=1
export NVTE_DEBUG_LEVEL=2 # 2 means DEBUG level

export NCCL_DEBUG=OFF

DEVICE_MODEL=$(nvidia-smi -i 0 -q | grep "Product Name" | awk -F: '{ print $2 }')
DEVICE_MODEL=$(echo "$DEVICE_MODEL" | xargs) # drop white space

# Strip the vendor prefix so the comparisons below see e.g. "GeForce RTX 3090 Ti".
if [[ $DEVICE_MODEL == NVIDIA* ]]; then
  DEVICE_MODEL=${DEVICE_MODEL#"NVIDIA"}
  DEVICE_MODEL=$(echo "$DEVICE_MODEL" | sed 's/^ *//')
fi

# BUGFIX: the original compared against "NVIDIA GeForce RTX 3090 Ti" AFTER the
# "NVIDIA" prefix had been stripped above, so that branch could never match and
# UB_SKIPMC was never set on a 3090 Ti. Compare against the stripped names.
if [ "$DEVICE_MODEL" = "GeForce RTX 3090 Ti" ] || [ "$DEVICE_MODEL" = "A100-SXM4-80GB" ]; then
  # Ampere GPUs do not support multicast. If `--tp-comm-overlap` is set on Ampere-arch GPUs, this env must be set.
  export UB_SKIPMC=1
fi

MOE_ARGS=(
  --expert-model-parallel-size 2
  --expert-tensor-parallel-size 1
  --moe-grouped-gemm
  --moe-token-dispatcher-type alltoall
  --moe-router-dtype fp32
  --num-experts 128
  --moe-ffn-hidden-size 320
  --moe-shared-expert-intermediate-size 320
  --moe-router-score-function sigmoid
  --moe-router-topk 4
  --moe-router-enable-expert-bias
  --moe-router-topk-scaling-factor 2.5
  --moe-router-num-groups 8
  --moe-router-group-topk 2
  --moe-z-loss-coeff 0.0000035
  --moe-router-bias-update-rate 1e-3
  --moe-layer-freq [0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
  --bias-zero-mean-update
  --moe-expert-capacity-factor 1.25
  --moe-pad-expert-input-to-capacity
  --moe-shared-expert-overlap
)

MPT_ARGS=(
  --mtp-num-layers 0
)

GPT_MODEL_ARGS=(
  --num-layers 16
  --hidden-size 2048
  --ffn-hidden-size 2048
  --num-attention-heads 16
  --num-query-groups 4
  --group-query-attention
  --qk-layernorm
  --use-flash-attn
  --max-position-embeddings 8192
  --vocab-size 32000
  --make-vocab-size-divisible-by 128
  --position-embedding-type "rope"
  --rotary-base 84000
  --rotary-percent 0.5
  --rotary-scaling-factor 40
  --swiglu
  --untie-embeddings-and-output-weights
  --normalization "RMSNorm"
  --norm-epsilon "1e-06"
  --disable-bias-linear
  --transformer-impl "transformer_engine"
  --attention-dropout 0
  --hidden-dropout 0
)

TRAINING_ARGS=(
  --micro-batch-size 8
  --global-batch-size 256
  --seq-length 8192
  --train-iters 50000
  --weight-decay 0.1
  --adam-beta1 0.9
  --adam-beta2 0.95
  --init-method-std 0.02
  --clip-grad 1.0

  --bf16

  --optimizer "adam"
  --lr "8.0e-4"
  --lr-decay-style cosine
  --min-lr "4.00e-5"
  --lr-warmup-iters 100
  --seed 50
)

MODEL_PARALLEL_ARGS=(
  --pipeline-model-parallel-size 1
  --tensor-model-parallel-size 4
  --sequence-parallel
  --overlap-grad-reduce
)

DATA_ARGS=(
  --data-path ${DATA_PATH}
  --tokenizer-type "HuggingFaceTokenizer"
  --tokenizer-model $(dirname "$(readlink -f "${BASH_SOURCE[0]}")")/../../resource/tokenizer/apt4
  --split 9999,1,0
  --dataloader-type "single"
  --no-create-attention-mask-in-dataloader
  --eod-mask-loss
)

EVAL_AND_LOGGING_ARGS=(
  --save-interval 1600
  --eval-interval 1600
  --eval-iters 2
  --save $CHECKPOINT_PATH
  --ckpt-format "torch_dist"
  --async-save
  --log-interval 1
  --log-throughput
  --tensorboard-dir $TENSORBOARD_LOGS_PATH
  --log-timers-to-tensorboard
  --log-memory-to-tensorboard
  --log-world-size-to-tensorboard
  --log-validation-ppl-to-tensorboard

  --wandb-project "poziomka"
  --wandb-exp-name ${JOB_DIR}

)

KERNEL_ARGS=(
  --attention-backend flash
  --no-masked-softmax-fusion
  --attention-softmax-in-fp32
  --cross-entropy-loss-fusion
)

# Declared empty so the expansion below is well-defined even with no profiling flags.
PROFILING_ARGS=()

# NOTE: CMD is deliberately a flat string that relies on word-splitting when
# executed; none of the argument values above contain whitespace.
CMD="${LAUNCHER} ${MEGATRON_PATH}/pretrain_gpt.py \
  ${MOE_ARGS[@]} \
  ${GPT_MODEL_ARGS[@]} \
  ${TRAINING_ARGS[@]} \
  ${MODEL_PARALLEL_ARGS[@]} \
  ${DATA_ARGS[@]} \
  ${EVAL_AND_LOGGING_ARGS[@]} \
  ${KERNEL_ARGS[@]} \
  ${MPT_ARGS[@]} \
  ${PROFILING_ARGS[@]} \
  "

echo ${CMD}
PYTHONPATH=${MEGATRON_PATH}:$PYTHONPATH ${CMD} 2>&1 | tee "${LOG_PATH}"
wandb/wandb/debug-internal.log ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2025-09-22T22:04:06.143255097Z","level":"INFO","msg":"stream: starting","core version":"0.22.0"}
2
+ {"time":"2025-09-22T22:04:06.487372274Z","level":"INFO","msg":"stream: created new stream","id":"hrldy3bw"}
3
+ {"time":"2025-09-22T22:04:06.487645224Z","level":"INFO","msg":"stream: started","id":"hrldy3bw"}
4
+ {"time":"2025-09-22T22:04:06.487690464Z","level":"INFO","msg":"sender: started","stream_id":"hrldy3bw"}
5
+ {"time":"2025-09-22T22:04:06.487691275Z","level":"INFO","msg":"writer: started","stream_id":"hrldy3bw"}
6
+ {"time":"2025-09-22T22:04:06.487752261Z","level":"INFO","msg":"handler: started","stream_id":"hrldy3bw"}
7
+ {"time":"2025-09-23T20:24:34.768930029Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
8
+ {"time":"2025-09-24T15:33:04.643961764Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
9
+ {"time":"2025-09-24T15:48:05.092383968Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
10
+ {"time":"2025-09-24T16:41:14.895690245Z","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/graphql","body":"\n<html><head>\n<meta http-equiv=\"content-type\" content=\"text/html;charset=utf-8\">\n<title>502 Server Error</title>\n</head>\n<body text=#000000 bgcolor=#ffffff>\n<h1>Error: Server Error</h1>\n<h2>The server encountered a temporary error and could not complete your request.<p>Please try again in 30 seconds.</h2>\n<h2></h2>\n</body></html>\n"}
11
+ {"time":"2025-09-25T16:26:04.894084919Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded (Client.Timeout exceeded while awaiting headers)"}
12
+ {"time":"2025-09-25T17:26:05.511033911Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded (Client.Timeout exceeded while awaiting headers)"}
13
+ {"time":"2025-09-25T22:42:38.263384097Z","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/adamo1139-no/poziomka/hrldy3bw/file_stream","body":"\n<html><head>\n<meta http-equiv=\"content-type\" content=\"text/html;charset=utf-8\">\n<title>502 Server Error</title>\n</head>\n<body text=#000000 bgcolor=#ffffff>\n<h1>Error: Server Error</h1>\n<h2>The server encountered a temporary error and could not complete your request.<p>Please try again in 30 seconds.</h2>\n<h2></h2>\n</body></html>\n"}
14
+ {"time":"2025-09-25T23:02:46.550701182Z","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/adamo1139-no/poziomka/hrldy3bw/file_stream","body":"\n<html><head>\n<meta http-equiv=\"content-type\" content=\"text/html;charset=utf-8\">\n<title>502 Server Error</title>\n</head>\n<body text=#000000 bgcolor=#ffffff>\n<h1>Error: Server Error</h1>\n<h2>The server encountered a temporary error and could not complete your request.<p>Please try again in 30 seconds.</h2>\n<h2></h2>\n</body></html>\n"}
15
+ {"time":"2025-09-26T12:20:37.296973248Z","level":"INFO","msg":"stream: closing","id":"hrldy3bw"}
16
+ {"time":"2025-09-26T12:20:38.848817943Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
17
+ {"time":"2025-09-26T12:20:39.03451843Z","level":"INFO","msg":"handler: closed","stream_id":"hrldy3bw"}
18
+ {"time":"2025-09-26T12:20:39.034655715Z","level":"INFO","msg":"sender: closed","stream_id":"hrldy3bw"}
19
+ {"time":"2025-09-26T12:20:39.034688219Z","level":"INFO","msg":"stream: closed","id":"hrldy3bw"}
wandb/wandb/debug.log ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-09-22 22:04:05,929 INFO MainThread:40865 [wandb_setup.py:_flush():81] Current SDK version is 0.22.0
2
+ 2025-09-22 22:04:05,929 INFO MainThread:40865 [wandb_setup.py:_flush():81] Configure stats pid to 40865
3
+ 2025-09-22 22:04:05,929 INFO MainThread:40865 [wandb_setup.py:_flush():81] Loading settings from /root/.config/wandb/settings
4
+ 2025-09-22 22:04:05,929 INFO MainThread:40865 [wandb_setup.py:_flush():81] Loading settings from /home/ubuntu/training/Ling-V2/wandb/settings
5
+ 2025-09-22 22:04:05,929 INFO MainThread:40865 [wandb_setup.py:_flush():81] Loading settings from environment variables
6
+ 2025-09-22 22:04:05,929 INFO MainThread:40865 [wandb_init.py:setup_run_log_directory():686] Logging user logs to poziomka_5/wandb/wandb/run-20250922_220405-hrldy3bw/logs/debug.log
7
+ 2025-09-22 22:04:05,929 INFO MainThread:40865 [wandb_init.py:setup_run_log_directory():687] Logging internal logs to poziomka_5/wandb/wandb/run-20250922_220405-hrldy3bw/logs/debug-internal.log
8
+ 2025-09-22 22:04:05,929 INFO MainThread:40865 [wandb_init.py:init():813] calling init triggers
9
+ 2025-09-22 22:04:05,929 INFO MainThread:40865 [wandb_init.py:init():818] wandb.init called with sweep_config: {}
10
+ config: {'num_layers': 16, 'encoder_num_layers': 16, 'decoder_num_layers': None, 'hidden_size': 2048, 'ffn_hidden_size': 2048, 'num_attention_heads': 16, 'attention_backend': <AttnBackend.flash: 1>, 'kv_channels': 128, 'group_query_attention': True, 'num_query_groups': 4, 'max_position_embeddings': 8192, 'position_embedding_type': 'rope', 'relative_attention_num_buckets': 32, 'relative_attention_max_distance': 128, 'use_rotary_position_embeddings': False, 'rotary_base': 84000, 'rotary_percent': 0.5, 'rotary_interleaved': False, 'rotary_seq_len_interpolation_factor': None, 'use_rope_scaling': False, 'rope_scaling_factor': 8.0, 'no_rope_freq': None, 'add_position_embedding': True, 'mrope_section': None, 'make_vocab_size_divisible_by': 128, 'normalization': 'RMSNorm', 'norm_epsilon': 1e-06, 'apply_layernorm_1p': False, 'apply_residual_connection_post_layernorm': False, 'openai_gelu': False, 'squared_relu': False, 'swiglu': True, 'onnx_safe': None, 'bert_binary_head': True, 'untie_embeddings_and_output_weights': True, 'multi_latent_attention': False, 'mtp_num_layers': 0, 'mtp_loss_scaling_factor': 0.1, 'bias_zero_mean_update': True, 'attention_dropout': 0.0, 'hidden_dropout': 0.0, 'weight_decay': 0.1, 'start_weight_decay': 0.1, 'end_weight_decay': 0.1, 'weight_decay_incr_style': 'constant', 'clip_grad': 1.0, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_eps': 1e-08, 'sgd_momentum': 0.9, 'micro_batch_size': 8, 'global_batch_size': 256, 'rampup_batch_size': None, 'decrease_batch_size_if_needed': False, 'recompute_granularity': None, 'check_for_nan_in_loss_and_grad': True, 'check_for_spiky_loss': False, 'check_for_large_grads': False, 'distribute_saved_activations': False, 'recompute_method': None, 'recompute_num_layers': None, 'recompute_modules': None, 'clone_scatter_output_in_embedding': True, 'profile': False, 'profile_step_start': 10, 'profile_step_end': 12, 'iterations_to_skip': [], 'result_rejected_tracker_filename': None, 'enable_gloo_process_groups': True, 
'use_pytorch_profiler': False, 'profile_ranks': [0], 'record_memory_history': False, 'memory_snapshot_path': 'snapshot.pickle', 'tp_comm_overlap': False, 'tp_comm_overlap_cfg': None, 'tp_comm_overlap_ag': True, 'tp_comm_overlap_rs': True, 'tp_comm_overlap_rs_dgrad': False, 'tp_comm_bulk_dgrad': True, 'tp_comm_bulk_wgrad': True, 'tp_comm_bootstrap_backend': 'nccl', 'use_cpu_initialization': None, 'empty_unused_memory_level': 0, 'deterministic_mode': False, 'check_weight_hash_across_dp_replicas_interval': None, 'calculate_per_token_loss': False, 'train_sync_interval': None, 'train_iters': 50000, 'train_samples': None, 'log_interval': 1, 'exit_interval': None, 'exit_duration_in_mins': None, 'exit_signal_handler': False, 'tensorboard_dir': 'poziomka_5/runs', 'masked_softmax_fusion': False, 'bias_gelu_fusion': False, 'bias_swiglu_fusion': True, 'bias_dropout_fusion': True, 'apply_rope_fusion': True, 'cross_entropy_loss_fusion': True, 'cross_entropy_fusion_impl': 'native', 'use_flash_attn': True, 'add_bias_linear': False, 'add_qkv_bias': False, 'optimizer': 'adam', 'optimizer_cpu_offload': False, 'optimizer_offload_fraction': 1.0, 'use_torch_optimizer_for_cpu_offload': False, 'overlap_cpu_optimizer_d2h_h2d': False, 'pin_cpu_grads': True, 'pin_cpu_params': True, 'dataloader_type': 'single', 'async_tensor_model_parallel_allreduce': True, 'no_persist_layer_norm': False, 'sequence_parallel': True, 'gradient_accumulation_fusion': True, 'deprecated_use_mcore_models': False, 'use_legacy_models': False, 'manual_gc': False, 'manual_gc_interval': 0, 'manual_gc_eval': True, 'tp_comm_split_ag': True, 'tp_comm_split_rs': True, 'pipeline_model_parallel_comm_backend': None, 'high_priority_stream_groups': [], 'seed': 50, 'data_parallel_random_init': False, 'init_method_std': 0.02, 'init_method_xavier_uniform': False, 'lr': 0.0008, 'lr_decay_style': 'cosine', 'lr_wsd_decay_style': 'exponential', 'lr_decay_iters': None, 'lr_decay_samples': None, 'lr_wsd_decay_samples': None, 
'lr_wsd_decay_iters': None, 'lr_warmup_fraction': None, 'lr_warmup_iters': 100, 'lr_warmup_samples': 0, 'lr_warmup_init': 0.0, 'min_lr': 4e-05, 'override_opt_param_scheduler': False, 'use_checkpoint_opt_param_scheduler': False, 'decoupled_lr': None, 'decoupled_min_lr': None, 'save': 'poziomka_5', 'save_interval': 1600, 'no_save_optim': None, 'no_save_rng': None, 'load': None, 'no_load_optim': None, 'no_load_rng': None, 'non_persistent_save_interval': None, 'non_persistent_ckpt_type': None, 'non_persistent_global_ckpt_dir': None, 'non_persistent_local_ckpt_dir': None, 'non_persistent_local_ckpt_algo': 'fully_parallel', 'finetune': False, 'pretrained_checkpoint': None, 'ckpt_step': None, 'perform_initialization': True, 'use_checkpoint_args': False, 'use_mp_args_from_checkpoint_args': False, 'use_tokenizer_model_from_checkpoint_args': True, 'exit_on_missing_checkpoint': False, 'use_dist_ckpt_deprecated': False, 'use_persistent_ckpt_worker': False, 'auto_detect_ckpt_format': False, 'dist_ckpt_format_deprecated': None, 'ckpt_format': 'torch_dist', 'ckpt_convert_format': None, 'ckpt_convert_save': None, 'ckpt_convert_update_legacy_dist_opt_format': False, 'ckpt_fully_parallel_save_deprecated': False, 'ckpt_fully_parallel_save': True, 'async_save': True, 'ckpt_fully_parallel_load': False, 'ckpt_assume_constant_structure': False, 'dist_ckpt_strictness': 'assume_ok_unexpected', 'load_model_opt_format': False, 'fp16': False, 'bf16': True, 'grad_reduce_in_bf16': False, 'loss_scale': None, 'initial_loss_scale': 4294967296, 'min_loss_scale': 1.0, 'loss_scale_window': 1000, 'hysteresis': 2, 'fp32_residual_connection': False, 'apply_query_key_layer_scaling': False, 'attention_softmax_in_fp32': True, 'accumulate_allreduce_grads_in_fp32': True, 'fp16_lm_cross_entropy': False, 'disable_bf16_reduced_precision_matmul': False, 'reuse_grad_buf_for_mxfp8_param_ag': False, 'tensor_model_parallel_size': 4, 'encoder_tensor_model_parallel_size': 0, 'pipeline_model_parallel_size': 1, 
'encoder_pipeline_model_parallel_size': 0, 'pipeline_model_parallel_split_rank': None, 'decoder_first_pipeline_num_layers': None, 'decoder_last_pipeline_num_layers': None, 'pipeline_model_parallel_layout': None, 'num_layers_per_virtual_pipeline_stage': None, 'num_virtual_stages_per_pipeline_rank': None, 'microbatch_group_size_per_vp_stage': None, 'overlap_p2p_comm': False, 'overlap_p2p_comm_warmup_flush': False, 'distributed_backend': 'nccl', 'distributed_timeout_minutes': 10, 'overlap_grad_reduce': True, 'defer_embedding_wgrad_compute': False, 'wgrad_deferral_limit': 0, 'align_grad_reduce': True, 'ddp_num_buckets': None, 'ddp_bucket_size': None, 'ddp_pad_buckets_for_high_nccl_busbw': False, 'ddp_average_in_collective': False, 'overlap_param_gather': False, 'overlap_param_gather_with_optimizer_step': False, 'align_param_gather': False, 'scatter_gather_tensors_in_pipeline': True, 'use_ring_exchange_p2p': False, 'local_rank': 7, 'lazy_mpu_init': None, 'account_for_embedding_in_pipeline_split': False, 'account_for_loss_in_pipeline_split': False, 'use_distributed_optimizer': False, 'nccl_ub': False, 'use_sharp': False, 'use_custom_fsdp': False, 'init_model_with_meta_device': False, 'data_parallel_sharding_strategy': 'no_shard', 'gradient_reduce_div_fusion': True, 'fsdp_double_buffer': False, 'suggested_communication_unit_size': None, 'keep_fp8_transpose_cache_when_using_custom_fsdp': False, 'num_distributed_optimizer_instances': 1, 'use_torch_fsdp2': False, 'torch_fsdp2_reshard_after_forward': True, 'context_parallel_size': 1, 'cp_comm_type': ['p2p'], 'hierarchical_context_parallel_sizes': None, 'nccl_communicator_config_path': None, 'use_tp_pp_dp_mapping': False, 'replication': False, 'replication_jump': None, 'replication_factor': 2, 'eval_iters': 2, 'eval_interval': 1600, 'test_mode': False, 'skip_train': False, 'data_path': ['szypulka_tokenized_apt4_merged/apt4_merged_text_document'], 'split': '9999,1,0', 'train_data_path': None, 'valid_data_path': None, 
'test_data_path': None, 'data_args_path': None, 'per_split_data_args_path': None, 'data_cache_path': None, 'mmap_bin_files': True, 'mock_data': False, 'seq_length': 8192, 'encoder_seq_length': 8192, 'decoder_seq_length': None, 'retriever_seq_length': 256, 'sample_rate': 1.0, 'mask_prob': 0.15, 'short_seq_prob': 0.1, 'num_workers': 2, 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': True, 'create_attention_mask_in_dataloader': False, 'num_dataset_builder_threads': 1, 'object_storage_cache_path': None, 'mid_level_dataset_surplus': 0.005, 'vocab_size': 32000, 'vocab_file': None, 'merge_file': None, 'vocab_extra_ids': 0, 'tokenizer_type': 'HuggingFaceTokenizer', 'tokenizer_model': '/home/ubuntu/training/Ling-V2/examples/pretrain/../../resource/tokenizer/apt4', 'tiktoken_pattern': None, 'tiktoken_num_special_tokens': 1000, 'tiktoken_special_tokens': None, 'adlr_autoresume': False, 'adlr_autoresume_interval': 1000, 'ict_head_size': None, 'biencoder_projection_dim': 0, 'biencoder_shared_query_context_model': False, 'ict_load': None, 'bert_load': None, 'titles_data_path': None, 'query_in_block_prob': 0.1, 'use_one_sent_docs': False, 'evidence_data_path': None, 'retriever_report_topk_accuracies': [], 'retriever_score_scaling': False, 'block_data_path': None, 'embedding_path': None, 'indexer_batch_size': 128, 'indexer_log_interval': 1000, 'num_classes': 1000, 'img_h': 224, 'img_w': 224, 'num_channels': 3, 'patch_dim': 16, 'classes_fraction': 1.0, 'data_per_class_fraction': 1.0, 'data_sharding': True, 'head_lr_mult': 1.0, 'vision_pretraining': False, 'vision_pretraining_type': 'classify', 'vision_backbone_type': 'vit', 'swin_backbone_type': 'tiny', 'mask_type': 'random', 'mask_factor': 1.0, 'iter_per_epoch': 1250, 'dino_local_img_size': 96, 'dino_local_crops_number': 10, 'dino_head_hidden_size': 2048, 'dino_bottleneck_size': 256, 'dino_freeze_last_layer': 1, 'dino_norm_last_layer': False, 'dino_warmup_teacher_temp': 0.04, 'dino_teacher_temp': 0.07, 
'dino_warmup_teacher_temp_epochs': 30, 'qk_layernorm': True, 'qk_l2_norm': False, 'expert_model_parallel_size': 2, 'expert_tensor_parallel_size': 1, 'num_experts': 128, 'moe_layer_freq': [0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'moe_ffn_hidden_size': 320, 'moe_shared_expert_intermediate_size': 320, 'moe_shared_expert_overlap': True, 'moe_grouped_gemm': True, 'moe_use_legacy_grouped_gemm': False, 'moe_layer_recompute': False, 'moe_extended_tp': False, 'moe_use_upcycling': False, 'moe_router_load_balancing_type': 'aux_loss', 'moe_router_dtype': 'fp32', 'skip_casting_dtype_for_param_pattern': '["^expert_bias$|.+\\.expert_bias$"]', 'moe_router_score_function': 'sigmoid', 'moe_router_topk': 4, 'moe_router_pre_softmax': False, 'moe_router_num_groups': 8, 'moe_router_group_topk': 2, 'moe_router_topk_scaling_factor': 2.5, 'moe_router_enable_expert_bias': True, 'moe_router_bias_update_rate': 0.001, 'moe_router_force_load_balancing': False, 'moe_router_padding_for_fp8': False, 'moe_aux_loss_coeff': 0.0, 'moe_z_loss_coeff': 3.5e-06, 'moe_input_jitter_eps': None, 'moe_per_layer_logging': False, 'moe_token_dispatcher_type': 'alltoall', 'moe_enable_deepep': False, 'moe_deepep_num_sms': 20, 'moe_permute_fusion': False, 'moe_expert_capacity_factor': 1.25, 'moe_pad_expert_input_to_capacity': True, 'moe_token_drop_policy': 'probs', 'moe_apply_probs_on_input': False, 'delay_wgrad_compute': False, 'moe_upcycling_granularity': 1, 'q_lora_rank': None, 'kv_lora_rank': 32, 'qk_head_dim': 128, 'qk_pos_emb_head_dim': 64, 'v_head_dim': 128, 'rotary_scaling_factor': 40.0, 'mscale': 1.0, 'mscale_all_dim': 1.0, 'heterogeneous_layers_config_path': None, 'heterogeneous_layers_config_encoded_json': None, 'log_params_norm': False, 'log_num_zeros_in_grad': False, 'log_throughput': True, 'log_progress': False, 'timing_log_level': 0, 'log_energy': False, 'barrier_with_L1_time': True, 'timing_log_option': 'minmax', 'tensorboard_log_interval': 1, 'tensorboard_queue_size': 1000, 
'log_timers_to_tensorboard': True, 'log_loss_scale_to_tensorboard': True, 'log_validation_ppl_to_tensorboard': True, 'log_memory_to_tensorboard': True, 'log_world_size_to_tensorboard': True, 'wandb_project': 'poziomka', 'wandb_exp_name': 'poziomka_5', 'wandb_save_dir': '', 'logging_level': None, 'log_straggler': False, 'disable_straggler_on_startup': False, 'straggler_ctrlr_port': 65535, 'straggler_minmax_count': 1, 'run_workload_inspector_server': False, 'inference_batch_times_seqlen_threshold': -1, 'max_tokens_to_oom': 12000, 'output_bert_embeddings': False, 'bert_embedder_type': 'megatron', 'flash_decode': False, 'enable_cuda_graph': False, 'cuda_graph_warmup_steps': 3, 'external_cuda_graph': False, 'cuda_graph_scope': 'full', 'inference_max_batch_size': 8, 'inference_max_seq_length': 2560, 'inference_dynamic_batching': False, 'inference_dynamic_batching_buffer_size_gb': 40.0, 'inference_dynamic_batching_chunk_size': 256, 'inference_dynamic_batching_buffer_guaranteed_fraction': 0.2, 'inference_dynamic_batching_buffer_overflow_factor': None, 'inference_dynamic_batching_max_requests_override': None, 'inference_dynamic_batching_max_tokens_override': None, 'symmetric_ar_type': None, 'nccl_all_reduce_for_prefill': False, 'mlp_chunks_for_prefill': 1, 'fp8': None, 'fp8_recipe': 'delayed', 'fp8_margin': 0, 'fp8_interval': 1, 'fp8_amax_history_len': 1, 'fp8_amax_compute_algo': 'most_recent', 'fp8_wgrad': True, 'transformer_impl': 'transformer_engine', 'fp8_param_gather': False, 'first_last_layers_bf16': False, 'num_layers_at_start_in_bf16': 1, 'num_layers_at_end_in_bf16': 1, 'te_rng_tracker': False, 'inference_rng_tracker': False, 'retro_project_dir': None, 'retro_add_retriever': False, 'retro_cyclic_train_iters': None, 'retro_encoder_layers': 2, 'retro_encoder_hidden_dropout': 0.1, 'retro_encoder_attention_dropout': 0.1, 'retro_num_neighbors': 2, 'retro_num_retrieved_chunks': 2, 'retro_attention_gate': 1, 'retro_verify_neighbor_count': True, 'enable_experimental': 
False, 'spec': None, 'hybrid_attention_ratio': 0.0, 'hybrid_mlp_ratio': 0.0, 'hybrid_override_pattern': None, 'mamba_state_dim': 128, 'mamba_head_dim': 64, 'mamba_num_groups': 8, 'mamba_num_heads': None, 'is_hybrid_model': False, 'disable_mamba_mem_eff_path': False, 'yaml_cfg': None, 'use_precision_aware_optimizer': False, 'main_grads_dtype': torch.float32, 'main_params_dtype': torch.float32, 'exp_avg_dtype': torch.float32, 'exp_avg_sq_dtype': torch.float32, 'enable_one_logger': True, 'one_logger_project': 'megatron-lm', 'one_logger_run_name': None, 'one_logger_async': False, 'app_tag_run_name': None, 'app_tag_run_version': '0.0.0', 'inprocess_restart': False, 'inprocess_max_iterations': None, 'inprocess_monitor_thread_interval': 1.0, 'inprocess_monitor_process_interval': 1.0, 'inprocess_progress_watchdog_interval': 1.0, 'inprocess_heartbeat_interval': 30, 'inprocess_soft_timeout': 60, 'inprocess_hard_timeout': 90, 'inprocess_heartbeat_timeout': 60, 'inprocess_barrier_timeout': 120, 'inprocess_completion_timeout': 120, 'inprocess_last_call_wait': 1, 'inprocess_termination_grace_time': 1, 'inprocess_granularity': 'node', 'inprocess_active_world_size': 8, 'inprocess_empty_cuda_cache': False, 'enable_ft_package': False, 'calc_ft_timeouts': False, 'config_logger_dir': '', 'error_injection_rate': 0, 'error_injection_type': 'transient_error', 'rerun_mode': 'disabled', 'enable_msc': True, 'kitchen_config_file': None, 'kitchen_recipe_number': None, 'sft': False, 'sft_tokenizer_prompt_format': 'nemotron-h-aligned', 'rank': 7, 'world_size': 8, 'use_dist_ckpt': True, 'transformer_pipeline_model_parallel_size': 1, 'data_parallel_size': 2, 'virtual_pipeline_model_parallel_size': None, 'params_dtype': torch.bfloat16, 'consumed_train_samples': 0, 'skipped_train_samples': 0, 'consumed_valid_samples': 0, 'variable_seq_lengths': False, 'padded_vocab_size': 32256, '_wandb': {}}
11
+ 2025-09-22 22:04:05,929 INFO MainThread:40865 [wandb_init.py:init():861] starting backend
12
+ 2025-09-22 22:04:06,134 INFO MainThread:40865 [wandb_init.py:init():864] sending inform_init request
13
+ 2025-09-22 22:04:06,137 INFO MainThread:40865 [wandb_init.py:init():872] backend started and connected
14
+ 2025-09-22 22:04:06,140 INFO MainThread:40865 [wandb_init.py:init():942] updated telemetry
15
+ 2025-09-22 22:04:06,144 INFO MainThread:40865 [wandb_init.py:init():966] communicating run to backend with 90.0 second timeout
16
+ 2025-09-22 22:04:06,744 INFO MainThread:40865 [wandb_init.py:init():1017] starting run threads in backend
17
+ 2025-09-22 22:04:06,836 INFO MainThread:40865 [wandb_run.py:_console_start():2506] atexit reg
18
+ 2025-09-22 22:04:06,836 INFO MainThread:40865 [wandb_run.py:_redirect():2354] redirect: wrap_raw
19
+ 2025-09-22 22:04:06,836 INFO MainThread:40865 [wandb_run.py:_redirect():2423] Wrapping output streams.
20
+ 2025-09-22 22:04:06,836 INFO MainThread:40865 [wandb_run.py:_redirect():2446] Redirects installed.
21
+ 2025-09-22 22:04:06,838 INFO MainThread:40865 [wandb_init.py:init():1057] run started, returning control to user process
22
+ 2025-09-26 12:20:37,273 INFO wandb-AsyncioManager-main:40865 [service_client.py:_forward_responses():84] Reached EOF.
23
+ 2025-09-26 12:20:37,275 INFO wandb-AsyncioManager-main:40865 [mailbox.py:close():137] Closing mailbox, abandoning 1 handles.
wandb/wandb/run-20250922_220405-hrldy3bw/files/config.yaml ADDED
@@ -0,0 +1,1288 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _wandb:
2
+ value:
3
+ cli_version: 0.22.0
4
+ e:
5
+ 9vte3cwjfuxykvlnatinaorhdm7hrpxl:
6
+ args:
7
+ - --expert-model-parallel-size
8
+ - "2"
9
+ - --expert-tensor-parallel-size
10
+ - "1"
11
+ - --moe-grouped-gemm
12
+ - --moe-token-dispatcher-type
13
+ - alltoall
14
+ - --moe-router-dtype
15
+ - fp32
16
+ - --num-experts
17
+ - "128"
18
+ - --moe-ffn-hidden-size
19
+ - "320"
20
+ - --moe-shared-expert-intermediate-size
21
+ - "320"
22
+ - --moe-router-score-function
23
+ - sigmoid
24
+ - --moe-router-topk
25
+ - "4"
26
+ - --moe-router-enable-expert-bias
27
+ - --moe-router-topk-scaling-factor
28
+ - "2.5"
29
+ - --moe-router-num-groups
30
+ - "8"
31
+ - --moe-router-group-topk
32
+ - "2"
33
+ - --moe-z-loss-coeff
34
+ - "0.0000035"
35
+ - --moe-router-bias-update-rate
36
+ - "1e-3"
37
+ - --moe-layer-freq
38
+ - '[0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]'
39
+ - --bias-zero-mean-update
40
+ - --moe-expert-capacity-factor
41
+ - "1.25"
42
+ - --moe-pad-expert-input-to-capacity
43
+ - --moe-shared-expert-overlap
44
+ - --num-layers
45
+ - "16"
46
+ - --hidden-size
47
+ - "2048"
48
+ - --ffn-hidden-size
49
+ - "2048"
50
+ - --num-attention-heads
51
+ - "16"
52
+ - --num-query-groups
53
+ - "4"
54
+ - --group-query-attention
55
+ - --qk-layernorm
56
+ - --use-flash-attn
57
+ - --max-position-embeddings
58
+ - "8192"
59
+ - --vocab-size
60
+ - "32000"
61
+ - --make-vocab-size-divisible-by
62
+ - "128"
63
+ - --position-embedding-type
64
+ - rope
65
+ - --rotary-base
66
+ - "84000"
67
+ - --rotary-percent
68
+ - "0.5"
69
+ - --rotary-scaling-factor
70
+ - "40"
71
+ - --swiglu
72
+ - --untie-embeddings-and-output-weights
73
+ - --normalization
74
+ - RMSNorm
75
+ - --norm-epsilon
76
+ - "1e-06"
77
+ - --disable-bias-linear
78
+ - --transformer-impl
79
+ - transformer_engine
80
+ - --attention-dropout
81
+ - "0"
82
+ - --hidden-dropout
83
+ - "0"
84
+ - --micro-batch-size
85
+ - "8"
86
+ - --global-batch-size
87
+ - "256"
88
+ - --seq-length
89
+ - "8192"
90
+ - --train-iters
91
+ - "50000"
92
+ - --weight-decay
93
+ - "0.1"
94
+ - --adam-beta1
95
+ - "0.9"
96
+ - --adam-beta2
97
+ - "0.95"
98
+ - --init-method-std
99
+ - "0.02"
100
+ - --clip-grad
101
+ - "1.0"
102
+ - --bf16
103
+ - --optimizer
104
+ - adam
105
+ - --lr
106
+ - "8.0e-4"
107
+ - --lr-decay-style
108
+ - cosine
109
+ - --min-lr
110
+ - "4.00e-5"
111
+ - --lr-warmup-iters
112
+ - "100"
113
+ - --seed
114
+ - "50"
115
+ - --pipeline-model-parallel-size
116
+ - "1"
117
+ - --tensor-model-parallel-size
118
+ - "4"
119
+ - --sequence-parallel
120
+ - --overlap-grad-reduce
121
+ - --data-path
122
+ - szypulka_tokenized_apt4_merged/apt4_merged_text_document
123
+ - --tokenizer-type
124
+ - HuggingFaceTokenizer
125
+ - --tokenizer-model
126
+ - /home/ubuntu/training/Ling-V2/examples/pretrain/../../resource/tokenizer/apt4
127
+ - --split
128
+ - 9999,1,0
129
+ - --dataloader-type
130
+ - single
131
+ - --no-create-attention-mask-in-dataloader
132
+ - --eod-mask-loss
133
+ - --save-interval
134
+ - "1600"
135
+ - --eval-interval
136
+ - "1600"
137
+ - --eval-iters
138
+ - "2"
139
+ - --save
140
+ - poziomka_5
141
+ - --ckpt-format
142
+ - torch_dist
143
+ - --async-save
144
+ - --log-interval
145
+ - "1"
146
+ - --log-throughput
147
+ - --tensorboard-dir
148
+ - poziomka_5/runs
149
+ - --log-timers-to-tensorboard
150
+ - --log-memory-to-tensorboard
151
+ - --log-world-size-to-tensorboard
152
+ - --log-validation-ppl-to-tensorboard
153
+ - --wandb-project
154
+ - poziomka
155
+ - --wandb-exp-name
156
+ - poziomka_5
157
+ - --attention-backend
158
+ - flash
159
+ - --no-masked-softmax-fusion
160
+ - --attention-softmax-in-fp32
161
+ - --cross-entropy-loss-fusion
162
+ - --mtp-num-layers
163
+ - "0"
164
+ codePath: Megatron-LM-core_v0.13.0/pretrain_gpt.py
165
+ codePathLocal: Megatron-LM-core_v0.13.0/pretrain_gpt.py
166
+ cpu_count: 128
167
+ cpu_count_logical: 128
168
+ cudaVersion: "12.8"
169
+ disk:
170
+ /:
171
+ total: "2907329073152"
172
+ used: "390033698816"
173
+ email: adamo1139@gmail.com
174
+ executable: /usr/bin/python3
175
+ git:
176
+ commit: e3867293ebf444f614164c2b84180cd75e7de07c
177
+ remote: https://github.com/adamo1139/Ling-V2.git
178
+ gpu: NVIDIA H100 80GB HBM3
179
+ gpu_count: 8
180
+ gpu_nvidia:
181
+ - architecture: Hopper
182
+ cudaCores: 16896
183
+ memoryTotal: "85520809984"
184
+ name: NVIDIA H100 80GB HBM3
185
+ uuid: GPU-b4eb54a1-d73f-9179-04f2-b231b6a39a34
186
+ - architecture: Hopper
187
+ cudaCores: 16896
188
+ memoryTotal: "85520809984"
189
+ name: NVIDIA H100 80GB HBM3
190
+ uuid: GPU-742b8534-6865-3da0-d864-a822a5d5d629
191
+ - architecture: Hopper
192
+ cudaCores: 16896
193
+ memoryTotal: "85520809984"
194
+ name: NVIDIA H100 80GB HBM3
195
+ uuid: GPU-19d31f38-1be1-eced-5b78-b0d3f4deae56
196
+ - architecture: Hopper
197
+ cudaCores: 16896
198
+ memoryTotal: "85520809984"
199
+ name: NVIDIA H100 80GB HBM3
200
+ uuid: GPU-1b41c967-636a-e3f1-5d74-2da616c06a3e
201
+ - architecture: Hopper
202
+ cudaCores: 16896
203
+ memoryTotal: "85520809984"
204
+ name: NVIDIA H100 80GB HBM3
205
+ uuid: GPU-51d2a37c-3157-7b61-73b3-4a7914884549
206
+ - architecture: Hopper
207
+ cudaCores: 16896
208
+ memoryTotal: "85520809984"
209
+ name: NVIDIA H100 80GB HBM3
210
+ uuid: GPU-499c302d-fc3c-c679-61e7-a2c4d46b8449
211
+ - architecture: Hopper
212
+ cudaCores: 16896
213
+ memoryTotal: "85520809984"
214
+ name: NVIDIA H100 80GB HBM3
215
+ uuid: GPU-8eac440c-b327-20e6-1809-ef3e549d6c6d
216
+ - architecture: Hopper
217
+ cudaCores: 16896
218
+ memoryTotal: "85520809984"
219
+ name: NVIDIA H100 80GB HBM3
220
+ uuid: GPU-5eba09b7-f1fc-555e-d07c-7b6227264759
221
+ host: megatron6
222
+ memory:
223
+ total: "1014522519552"
224
+ os: Linux-5.15.0-143-generic-x86_64-with-glibc2.35
225
+ program: /home/ubuntu/training/Ling-V2/Megatron-LM-core_v0.13.0/pretrain_gpt.py
226
+ python: CPython 3.10.12
227
+ root: poziomka_5/wandb
228
+ startedAt: "2025-09-22T22:04:05.928052Z"
229
+ writerId: 9vte3cwjfuxykvlnatinaorhdm7hrpxl
230
+ m: []
231
+ python_version: 3.10.12
232
+ t:
233
+ "1":
234
+ - 1
235
+ - 11
236
+ - 49
237
+ "2":
238
+ - 1
239
+ - 11
240
+ - 49
241
+ "3":
242
+ - 13
243
+ - 16
244
+ - 61
245
+ "4": 3.10.12
246
+ "5": 0.22.0
247
+ "6": 4.56.2
248
+ "12": 0.22.0
249
+ "13": linux-x86_64
250
+ account_for_embedding_in_pipeline_split:
251
+ value: false
252
+ account_for_loss_in_pipeline_split:
253
+ value: false
254
+ accumulate_allreduce_grads_in_fp32:
255
+ value: true
256
+ adam_beta1:
257
+ value: 0.9
258
+ adam_beta2:
259
+ value: 0.95
260
+ adam_eps:
261
+ value: 1e-08
262
+ add_bias_linear:
263
+ value: false
264
+ add_position_embedding:
265
+ value: true
266
+ add_qkv_bias:
267
+ value: false
268
+ adlr_autoresume:
269
+ value: false
270
+ adlr_autoresume_interval:
271
+ value: 1000
272
+ align_grad_reduce:
273
+ value: true
274
+ align_param_gather:
275
+ value: false
276
+ app_tag_run_name:
277
+ value: null
278
+ app_tag_run_version:
279
+ value: 0.0.0
280
+ apply_layernorm_1p:
281
+ value: false
282
+ apply_query_key_layer_scaling:
283
+ value: false
284
+ apply_residual_connection_post_layernorm:
285
+ value: false
286
+ apply_rope_fusion:
287
+ value: true
288
+ async_save:
289
+ value: true
290
+ async_tensor_model_parallel_allreduce:
291
+ value: true
292
+ attention_backend:
293
+ value: flash
294
+ attention_dropout:
295
+ value: 0
296
+ attention_softmax_in_fp32:
297
+ value: true
298
+ auto_detect_ckpt_format:
299
+ value: false
300
+ barrier_with_L1_time:
301
+ value: true
302
+ bert_binary_head:
303
+ value: true
304
+ bert_embedder_type:
305
+ value: megatron
306
+ bert_load:
307
+ value: null
308
+ bf16:
309
+ value: true
310
+ bias_dropout_fusion:
311
+ value: true
312
+ bias_gelu_fusion:
313
+ value: false
314
+ bias_swiglu_fusion:
315
+ value: true
316
+ bias_zero_mean_update:
317
+ value: true
318
+ biencoder_projection_dim:
319
+ value: 0
320
+ biencoder_shared_query_context_model:
321
+ value: false
322
+ block_data_path:
323
+ value: null
324
+ calc_ft_timeouts:
325
+ value: false
326
+ calculate_per_token_loss:
327
+ value: false
328
+ check_for_large_grads:
329
+ value: false
330
+ check_for_nan_in_loss_and_grad:
331
+ value: true
332
+ check_for_spiky_loss:
333
+ value: false
334
+ check_weight_hash_across_dp_replicas_interval:
335
+ value: null
336
+ ckpt_assume_constant_structure:
337
+ value: false
338
+ ckpt_convert_format:
339
+ value: null
340
+ ckpt_convert_save:
341
+ value: null
342
+ ckpt_convert_update_legacy_dist_opt_format:
343
+ value: false
344
+ ckpt_format:
345
+ value: torch_dist
346
+ ckpt_fully_parallel_load:
347
+ value: false
348
+ ckpt_fully_parallel_save:
349
+ value: true
350
+ ckpt_fully_parallel_save_deprecated:
351
+ value: false
352
+ ckpt_step:
353
+ value: null
354
+ classes_fraction:
355
+ value: 1
356
+ clip_grad:
357
+ value: 1
358
+ clone_scatter_output_in_embedding:
359
+ value: true
360
+ config_logger_dir:
361
+ value: ""
362
+ consumed_train_samples:
363
+ value: 0
364
+ consumed_valid_samples:
365
+ value: 0
366
+ context_parallel_size:
367
+ value: 1
368
+ cp_comm_type:
369
+ value:
370
+ - p2p
371
+ create_attention_mask_in_dataloader:
372
+ value: false
373
+ cross_entropy_fusion_impl:
374
+ value: native
375
+ cross_entropy_loss_fusion:
376
+ value: true
377
+ cuda_graph_scope:
378
+ value: full
379
+ cuda_graph_warmup_steps:
380
+ value: 3
381
+ data_args_path:
382
+ value: null
383
+ data_cache_path:
384
+ value: null
385
+ data_parallel_random_init:
386
+ value: false
387
+ data_parallel_sharding_strategy:
388
+ value: no_shard
389
+ data_parallel_size:
390
+ value: 2
391
+ data_path:
392
+ value:
393
+ - szypulka_tokenized_apt4_merged/apt4_merged_text_document
394
+ data_per_class_fraction:
395
+ value: 1
396
+ data_sharding:
397
+ value: true
398
+ dataloader_type:
399
+ value: single
400
+ ddp_average_in_collective:
401
+ value: false
402
+ ddp_bucket_size:
403
+ value: null
404
+ ddp_num_buckets:
405
+ value: null
406
+ ddp_pad_buckets_for_high_nccl_busbw:
407
+ value: false
408
+ decoder_first_pipeline_num_layers:
409
+ value: null
410
+ decoder_last_pipeline_num_layers:
411
+ value: null
412
+ decoder_num_layers:
413
+ value: null
414
+ decoder_seq_length:
415
+ value: null
416
+ decoupled_lr:
417
+ value: null
418
+ decoupled_min_lr:
419
+ value: null
420
+ decrease_batch_size_if_needed:
421
+ value: false
422
+ defer_embedding_wgrad_compute:
423
+ value: false
424
+ delay_wgrad_compute:
425
+ value: false
426
+ deprecated_use_mcore_models:
427
+ value: false
428
+ deterministic_mode:
429
+ value: false
430
+ dino_bottleneck_size:
431
+ value: 256
432
+ dino_freeze_last_layer:
433
+ value: 1
434
+ dino_head_hidden_size:
435
+ value: 2048
436
+ dino_local_crops_number:
437
+ value: 10
438
+ dino_local_img_size:
439
+ value: 96
440
+ dino_norm_last_layer:
441
+ value: false
442
+ dino_teacher_temp:
443
+ value: 0.07
444
+ dino_warmup_teacher_temp:
445
+ value: 0.04
446
+ dino_warmup_teacher_temp_epochs:
447
+ value: 30
448
+ disable_bf16_reduced_precision_matmul:
449
+ value: false
450
+ disable_mamba_mem_eff_path:
451
+ value: false
452
+ disable_straggler_on_startup:
453
+ value: false
454
+ dist_ckpt_format_deprecated:
455
+ value: null
456
+ dist_ckpt_strictness:
457
+ value: assume_ok_unexpected
458
+ distribute_saved_activations:
459
+ value: false
460
+ distributed_backend:
461
+ value: nccl
462
+ distributed_timeout_minutes:
463
+ value: 10
464
+ embedding_path:
465
+ value: null
466
+ empty_unused_memory_level:
467
+ value: 0
468
+ enable_cuda_graph:
469
+ value: false
470
+ enable_experimental:
471
+ value: false
472
+ enable_ft_package:
473
+ value: false
474
+ enable_gloo_process_groups:
475
+ value: true
476
+ enable_msc:
477
+ value: true
478
+ enable_one_logger:
479
+ value: true
480
+ encoder_num_layers:
481
+ value: 16
482
+ encoder_pipeline_model_parallel_size:
483
+ value: 0
484
+ encoder_seq_length:
485
+ value: 8192
486
+ encoder_tensor_model_parallel_size:
487
+ value: 0
488
+ end_weight_decay:
489
+ value: 0.1
490
+ eod_mask_loss:
491
+ value: true
492
+ error_injection_rate:
493
+ value: 0
494
+ error_injection_type:
495
+ value: transient_error
496
+ eval_interval:
497
+ value: 1600
498
+ eval_iters:
499
+ value: 2
500
+ evidence_data_path:
501
+ value: null
502
+ exit_duration_in_mins:
503
+ value: null
504
+ exit_interval:
505
+ value: null
506
+ exit_on_missing_checkpoint:
507
+ value: false
508
+ exit_signal_handler:
509
+ value: false
510
+ exp_avg_dtype:
511
+ value: torch.float32
512
+ exp_avg_sq_dtype:
513
+ value: torch.float32
514
+ expert_model_parallel_size:
515
+ value: 2
516
+ expert_tensor_parallel_size:
517
+ value: 1
518
+ external_cuda_graph:
519
+ value: false
520
+ ffn_hidden_size:
521
+ value: 2048
522
+ finetune:
523
+ value: false
524
+ first_last_layers_bf16:
525
+ value: false
526
+ flash_decode:
527
+ value: false
528
+ fp8:
529
+ value: null
530
+ fp8_amax_compute_algo:
531
+ value: most_recent
532
+ fp8_amax_history_len:
533
+ value: 1
534
+ fp8_interval:
535
+ value: 1
536
+ fp8_margin:
537
+ value: 0
538
+ fp8_param_gather:
539
+ value: false
540
+ fp8_recipe:
541
+ value: delayed
542
+ fp8_wgrad:
543
+ value: true
544
+ fp16:
545
+ value: false
546
+ fp16_lm_cross_entropy:
547
+ value: false
548
+ fp32_residual_connection:
549
+ value: false
550
+ fsdp_double_buffer:
551
+ value: false
552
+ global_batch_size:
553
+ value: 256
554
+ grad_reduce_in_bf16:
555
+ value: false
556
+ gradient_accumulation_fusion:
557
+ value: true
558
+ gradient_reduce_div_fusion:
559
+ value: true
560
+ group_query_attention:
561
+ value: true
562
+ head_lr_mult:
563
+ value: 1
564
+ heterogeneous_layers_config_encoded_json:
565
+ value: null
566
+ heterogeneous_layers_config_path:
567
+ value: null
568
+ hidden_dropout:
569
+ value: 0
570
+ hidden_size:
571
+ value: 2048
572
+ hierarchical_context_parallel_sizes:
573
+ value: null
574
+ high_priority_stream_groups:
575
+ value: []
576
+ hybrid_attention_ratio:
577
+ value: 0
578
+ hybrid_mlp_ratio:
579
+ value: 0
580
+ hybrid_override_pattern:
581
+ value: null
582
+ hysteresis:
583
+ value: 2
584
+ ict_head_size:
585
+ value: null
586
+ ict_load:
587
+ value: null
588
+ img_h:
589
+ value: 224
590
+ img_w:
591
+ value: 224
592
+ indexer_batch_size:
593
+ value: 128
594
+ indexer_log_interval:
595
+ value: 1000
596
+ inference_batch_times_seqlen_threshold:
597
+ value: -1
598
+ inference_dynamic_batching:
599
+ value: false
600
+ inference_dynamic_batching_buffer_guaranteed_fraction:
601
+ value: 0.2
602
+ inference_dynamic_batching_buffer_overflow_factor:
603
+ value: null
604
+ inference_dynamic_batching_buffer_size_gb:
605
+ value: 40
606
+ inference_dynamic_batching_chunk_size:
607
+ value: 256
608
+ inference_dynamic_batching_max_requests_override:
609
+ value: null
610
+ inference_dynamic_batching_max_tokens_override:
611
+ value: null
612
+ inference_max_batch_size:
613
+ value: 8
614
+ inference_max_seq_length:
615
+ value: 2560
616
+ inference_rng_tracker:
617
+ value: false
618
+ init_method_std:
619
+ value: 0.02
620
+ init_method_xavier_uniform:
621
+ value: false
622
+ init_model_with_meta_device:
623
+ value: false
624
+ initial_loss_scale:
625
+ value: 4294967296
626
+ inprocess_active_world_size:
627
+ value: 8
628
+ inprocess_barrier_timeout:
629
+ value: 120
630
+ inprocess_completion_timeout:
631
+ value: 120
632
+ inprocess_empty_cuda_cache:
633
+ value: false
634
+ inprocess_granularity:
635
+ value: node
636
+ inprocess_hard_timeout:
637
+ value: 90
638
+ inprocess_heartbeat_interval:
639
+ value: 30
640
+ inprocess_heartbeat_timeout:
641
+ value: 60
642
+ inprocess_last_call_wait:
643
+ value: 1
644
+ inprocess_max_iterations:
645
+ value: null
646
+ inprocess_monitor_process_interval:
647
+ value: 1
648
+ inprocess_monitor_thread_interval:
649
+ value: 1
650
+ inprocess_progress_watchdog_interval:
651
+ value: 1
652
+ inprocess_restart:
653
+ value: false
654
+ inprocess_soft_timeout:
655
+ value: 60
656
+ inprocess_termination_grace_time:
657
+ value: 1
658
+ is_hybrid_model:
659
+ value: false
660
+ iter_per_epoch:
661
+ value: 1250
662
+ iterations_to_skip:
663
+ value: []
664
+ keep_fp8_transpose_cache_when_using_custom_fsdp:
665
+ value: false
666
+ kitchen_config_file:
667
+ value: null
668
+ kitchen_recipe_number:
669
+ value: null
670
+ kv_channels:
671
+ value: 128
672
+ kv_lora_rank:
673
+ value: 32
674
+ lazy_mpu_init:
675
+ value: null
676
+ load:
677
+ value: null
678
+ load_model_opt_format:
679
+ value: false
680
+ local_rank:
681
+ value: 7
682
+ log_energy:
683
+ value: false
684
+ log_interval:
685
+ value: 1
686
+ log_loss_scale_to_tensorboard:
687
+ value: true
688
+ log_memory_to_tensorboard:
689
+ value: true
690
+ log_num_zeros_in_grad:
691
+ value: false
692
+ log_params_norm:
693
+ value: false
694
+ log_progress:
695
+ value: false
696
+ log_straggler:
697
+ value: false
698
+ log_throughput:
699
+ value: true
700
+ log_timers_to_tensorboard:
701
+ value: true
702
+ log_validation_ppl_to_tensorboard:
703
+ value: true
704
+ log_world_size_to_tensorboard:
705
+ value: true
706
+ logging_level:
707
+ value: null
708
+ loss_scale:
709
+ value: null
710
+ loss_scale_window:
711
+ value: 1000
712
+ lr:
713
+ value: 0.0008
714
+ lr_decay_iters:
715
+ value: null
716
+ lr_decay_samples:
717
+ value: null
718
+ lr_decay_style:
719
+ value: cosine
720
+ lr_warmup_fraction:
721
+ value: null
722
+ lr_warmup_init:
723
+ value: 0
724
+ lr_warmup_iters:
725
+ value: 100
726
+ lr_warmup_samples:
727
+ value: 0
728
+ lr_wsd_decay_iters:
729
+ value: null
730
+ lr_wsd_decay_samples:
731
+ value: null
732
+ lr_wsd_decay_style:
733
+ value: exponential
734
+ main_grads_dtype:
735
+ value: torch.float32
736
+ main_params_dtype:
737
+ value: torch.float32
738
+ make_vocab_size_divisible_by:
739
+ value: 128
740
+ mamba_head_dim:
741
+ value: 64
742
+ mamba_num_groups:
743
+ value: 8
744
+ mamba_num_heads:
745
+ value: null
746
+ mamba_state_dim:
747
+ value: 128
748
+ manual_gc:
749
+ value: false
750
+ manual_gc_eval:
751
+ value: true
752
+ manual_gc_interval:
753
+ value: 0
754
+ mask_factor:
755
+ value: 1
756
+ mask_prob:
757
+ value: 0.15
758
+ mask_type:
759
+ value: random
760
+ masked_softmax_fusion:
761
+ value: false
762
+ max_position_embeddings:
763
+ value: 8192
764
+ max_tokens_to_oom:
765
+ value: 12000
766
+ memory_snapshot_path:
767
+ value: snapshot.pickle
768
+ merge_file:
769
+ value: null
770
+ micro_batch_size:
771
+ value: 8
772
+ microbatch_group_size_per_vp_stage:
773
+ value: null
774
+ mid_level_dataset_surplus:
775
+ value: 0.005
776
+ min_loss_scale:
777
+ value: 1
778
+ min_lr:
779
+ value: 4e-05
780
+ mlp_chunks_for_prefill:
781
+ value: 1
782
+ mmap_bin_files:
783
+ value: true
784
+ mock_data:
785
+ value: false
786
+ moe_apply_probs_on_input:
787
+ value: false
788
+ moe_aux_loss_coeff:
789
+ value: 0
790
+ moe_deepep_num_sms:
791
+ value: 20
792
+ moe_enable_deepep:
793
+ value: false
794
+ moe_expert_capacity_factor:
795
+ value: 1.25
796
+ moe_extended_tp:
797
+ value: false
798
+ moe_ffn_hidden_size:
799
+ value: 320
800
+ moe_grouped_gemm:
801
+ value: true
802
+ moe_input_jitter_eps:
803
+ value: null
804
+ moe_layer_freq:
805
+ value:
806
+ - 0
807
+ - 1
808
+ - 1
809
+ - 1
810
+ - 1
811
+ - 1
812
+ - 1
813
+ - 1
814
+ - 1
815
+ - 1
816
+ - 1
817
+ - 1
818
+ - 1
819
+ - 1
820
+ - 1
821
+ - 1
822
+ moe_layer_recompute:
823
+ value: false
824
+ moe_pad_expert_input_to_capacity:
825
+ value: true
826
+ moe_per_layer_logging:
827
+ value: false
828
+ moe_permute_fusion:
829
+ value: false
830
+ moe_router_bias_update_rate:
831
+ value: 0.001
832
+ moe_router_dtype:
833
+ value: fp32
834
+ moe_router_enable_expert_bias:
835
+ value: true
836
+ moe_router_force_load_balancing:
837
+ value: false
838
+ moe_router_group_topk:
839
+ value: 2
840
+ moe_router_load_balancing_type:
841
+ value: aux_loss
842
+ moe_router_num_groups:
843
+ value: 8
844
+ moe_router_padding_for_fp8:
845
+ value: false
846
+ moe_router_pre_softmax:
847
+ value: false
848
+ moe_router_score_function:
849
+ value: sigmoid
850
+ moe_router_topk:
851
+ value: 4
852
+ moe_router_topk_scaling_factor:
853
+ value: 2.5
854
+ moe_shared_expert_intermediate_size:
855
+ value: 320
856
+ moe_shared_expert_overlap:
857
+ value: true
858
+ moe_token_dispatcher_type:
859
+ value: alltoall
860
+ moe_token_drop_policy:
861
+ value: probs
862
+ moe_upcycling_granularity:
863
+ value: 1
864
+ moe_use_legacy_grouped_gemm:
865
+ value: false
866
+ moe_use_upcycling:
867
+ value: false
868
+ moe_z_loss_coeff:
869
+ value: 3.5e-06
870
+ mrope_section:
871
+ value: null
872
+ mscale:
873
+ value: 1
874
+ mscale_all_dim:
875
+ value: 1
876
+ mtp_loss_scaling_factor:
877
+ value: 0.1
878
+ mtp_num_layers:
879
+ value: 0
880
+ multi_latent_attention:
881
+ value: false
882
+ nccl_all_reduce_for_prefill:
883
+ value: false
884
+ nccl_communicator_config_path:
885
+ value: null
886
+ nccl_ub:
887
+ value: false
888
+ no_load_optim:
889
+ value: null
890
+ no_load_rng:
891
+ value: null
892
+ no_persist_layer_norm:
893
+ value: false
894
+ no_rope_freq:
895
+ value: null
896
+ no_save_optim:
897
+ value: null
898
+ no_save_rng:
899
+ value: null
900
+ non_persistent_ckpt_type:
901
+ value: null
902
+ non_persistent_global_ckpt_dir:
903
+ value: null
904
+ non_persistent_local_ckpt_algo:
905
+ value: fully_parallel
906
+ non_persistent_local_ckpt_dir:
907
+ value: null
908
+ non_persistent_save_interval:
909
+ value: null
910
+ norm_epsilon:
911
+ value: 1e-06
912
+ normalization:
913
+ value: RMSNorm
914
+ num_attention_heads:
915
+ value: 16
916
+ num_channels:
917
+ value: 3
918
+ num_classes:
919
+ value: 1000
920
+ num_dataset_builder_threads:
921
+ value: 1
922
+ num_distributed_optimizer_instances:
923
+ value: 1
924
+ num_experts:
925
+ value: 128
926
+ num_layers:
927
+ value: 16
928
+ num_layers_at_end_in_bf16:
929
+ value: 1
930
+ num_layers_at_start_in_bf16:
931
+ value: 1
932
+ num_layers_per_virtual_pipeline_stage:
933
+ value: null
934
+ num_query_groups:
935
+ value: 4
936
+ num_virtual_stages_per_pipeline_rank:
937
+ value: null
938
+ num_workers:
939
+ value: 2
940
+ object_storage_cache_path:
941
+ value: null
942
+ one_logger_async:
943
+ value: false
944
+ one_logger_project:
945
+ value: megatron-lm
946
+ one_logger_run_name:
947
+ value: null
948
+ onnx_safe:
949
+ value: null
950
+ openai_gelu:
951
+ value: false
952
+ optimizer:
953
+ value: adam
954
+ optimizer_cpu_offload:
955
+ value: false
956
+ optimizer_offload_fraction:
957
+ value: 1
958
+ output_bert_embeddings:
959
+ value: false
960
+ overlap_cpu_optimizer_d2h_h2d:
961
+ value: false
962
+ overlap_grad_reduce:
963
+ value: true
964
+ overlap_p2p_comm:
965
+ value: false
966
+ overlap_p2p_comm_warmup_flush:
967
+ value: false
968
+ overlap_param_gather:
969
+ value: false
970
+ overlap_param_gather_with_optimizer_step:
971
+ value: false
972
+ override_opt_param_scheduler:
973
+ value: false
974
+ padded_vocab_size:
975
+ value: 32256
976
+ params_dtype:
977
+ value: torch.bfloat16
978
+ patch_dim:
979
+ value: 16
980
+ per_split_data_args_path:
981
+ value: null
982
+ perform_initialization:
983
+ value: true
984
+ pin_cpu_grads:
985
+ value: true
986
+ pin_cpu_params:
987
+ value: true
988
+ pipeline_model_parallel_comm_backend:
989
+ value: null
990
+ pipeline_model_parallel_layout:
991
+ value: null
992
+ pipeline_model_parallel_size:
993
+ value: 1
994
+ pipeline_model_parallel_split_rank:
995
+ value: null
996
+ position_embedding_type:
997
+ value: rope
998
+ pretrained_checkpoint:
999
+ value: null
1000
+ profile:
1001
+ value: false
1002
+ profile_ranks:
1003
+ value:
1004
+ - 0
1005
+ profile_step_end:
1006
+ value: 12
1007
+ profile_step_start:
1008
+ value: 10
1009
+ q_lora_rank:
1010
+ value: null
1011
+ qk_head_dim:
1012
+ value: 128
1013
+ qk_l2_norm:
1014
+ value: false
1015
+ qk_layernorm:
1016
+ value: true
1017
+ qk_pos_emb_head_dim:
1018
+ value: 64
1019
+ query_in_block_prob:
1020
+ value: 0.1
1021
+ rampup_batch_size:
1022
+ value: null
1023
+ rank:
1024
+ value: 7
1025
+ recompute_granularity:
1026
+ value: null
1027
+ recompute_method:
1028
+ value: null
1029
+ recompute_modules:
1030
+ value: null
1031
+ recompute_num_layers:
1032
+ value: null
1033
+ record_memory_history:
1034
+ value: false
1035
+ relative_attention_max_distance:
1036
+ value: 128
1037
+ relative_attention_num_buckets:
1038
+ value: 32
1039
+ replication:
1040
+ value: false
1041
+ replication_factor:
1042
+ value: 2
1043
+ replication_jump:
1044
+ value: null
1045
+ rerun_mode:
1046
+ value: disabled
1047
+ reset_attention_mask:
1048
+ value: false
1049
+ reset_position_ids:
1050
+ value: false
1051
+ result_rejected_tracker_filename:
1052
+ value: null
1053
+ retriever_report_topk_accuracies:
1054
+ value: []
1055
+ retriever_score_scaling:
1056
+ value: false
1057
+ retriever_seq_length:
1058
+ value: 256
1059
+ retro_add_retriever:
1060
+ value: false
1061
+ retro_attention_gate:
1062
+ value: 1
1063
+ retro_cyclic_train_iters:
1064
+ value: null
1065
+ retro_encoder_attention_dropout:
1066
+ value: 0.1
1067
+ retro_encoder_hidden_dropout:
1068
+ value: 0.1
1069
+ retro_encoder_layers:
1070
+ value: 2
1071
+ retro_num_neighbors:
1072
+ value: 2
1073
+ retro_num_retrieved_chunks:
1074
+ value: 2
1075
+ retro_project_dir:
1076
+ value: null
1077
+ retro_verify_neighbor_count:
1078
+ value: true
1079
+ reuse_grad_buf_for_mxfp8_param_ag:
1080
+ value: false
1081
+ rope_scaling_factor:
1082
+ value: 8
1083
+ rotary_base:
1084
+ value: 84000
1085
+ rotary_interleaved:
1086
+ value: false
1087
+ rotary_percent:
1088
+ value: 0.5
1089
+ rotary_scaling_factor:
1090
+ value: 40
1091
+ rotary_seq_len_interpolation_factor:
1092
+ value: null
1093
+ run_workload_inspector_server:
1094
+ value: false
1095
+ sample_rate:
1096
+ value: 1
1097
+ save:
1098
+ value: poziomka_5
1099
+ save_interval:
1100
+ value: 1600
1101
+ scatter_gather_tensors_in_pipeline:
1102
+ value: true
1103
+ seed:
1104
+ value: 50
1105
+ seq_length:
1106
+ value: 8192
1107
+ sequence_parallel:
1108
+ value: true
1109
+ sft:
1110
+ value: false
1111
+ sft_tokenizer_prompt_format:
1112
+ value: nemotron-h-aligned
1113
+ sgd_momentum:
1114
+ value: 0.9
1115
+ short_seq_prob:
1116
+ value: 0.1
1117
+ skip_casting_dtype_for_param_pattern:
1118
+ value: '["^expert_bias$|.+\.expert_bias$"]'
1119
+ skip_train:
1120
+ value: false
1121
+ skipped_train_samples:
1122
+ value: 0
1123
+ spec:
1124
+ value: null
1125
+ split:
1126
+ value: 9999,1,0
1127
+ squared_relu:
1128
+ value: false
1129
+ start_weight_decay:
1130
+ value: 0.1
1131
+ straggler_ctrlr_port:
1132
+ value: 65535
1133
+ straggler_minmax_count:
1134
+ value: 1
1135
+ suggested_communication_unit_size:
1136
+ value: null
1137
+ swiglu:
1138
+ value: true
1139
+ swin_backbone_type:
1140
+ value: tiny
1141
+ symmetric_ar_type:
1142
+ value: null
1143
+ te_rng_tracker:
1144
+ value: false
1145
+ tensor_model_parallel_size:
1146
+ value: 4
1147
+ tensorboard_dir:
1148
+ value: poziomka_5/runs
1149
+ tensorboard_log_interval:
1150
+ value: 1
1151
+ tensorboard_queue_size:
1152
+ value: 1000
1153
+ test_data_path:
1154
+ value: null
1155
+ test_mode:
1156
+ value: false
1157
+ tiktoken_num_special_tokens:
1158
+ value: 1000
1159
+ tiktoken_pattern:
1160
+ value: null
1161
+ tiktoken_special_tokens:
1162
+ value: null
1163
+ timing_log_level:
1164
+ value: 0
1165
+ timing_log_option:
1166
+ value: minmax
1167
+ titles_data_path:
1168
+ value: null
1169
+ tokenizer_model:
1170
+ value: /home/ubuntu/training/Ling-V2/examples/pretrain/../../resource/tokenizer/apt4
1171
+ tokenizer_type:
1172
+ value: HuggingFaceTokenizer
1173
+ torch_fsdp2_reshard_after_forward:
1174
+ value: true
1175
+ tp_comm_bootstrap_backend:
1176
+ value: nccl
1177
+ tp_comm_bulk_dgrad:
1178
+ value: true
1179
+ tp_comm_bulk_wgrad:
1180
+ value: true
1181
+ tp_comm_overlap:
1182
+ value: false
1183
+ tp_comm_overlap_ag:
1184
+ value: true
1185
+ tp_comm_overlap_cfg:
1186
+ value: null
1187
+ tp_comm_overlap_rs:
1188
+ value: true
1189
+ tp_comm_overlap_rs_dgrad:
1190
+ value: false
1191
+ tp_comm_split_ag:
1192
+ value: true
1193
+ tp_comm_split_rs:
1194
+ value: true
1195
+ train_data_path:
1196
+ value: null
1197
+ train_iters:
1198
+ value: 50000
1199
+ train_samples:
1200
+ value: null
1201
+ train_sync_interval:
1202
+ value: null
1203
+ transformer_impl:
1204
+ value: transformer_engine
1205
+ transformer_pipeline_model_parallel_size:
1206
+ value: 1
1207
+ untie_embeddings_and_output_weights:
1208
+ value: true
1209
+ use_checkpoint_args:
1210
+ value: false
1211
+ use_checkpoint_opt_param_scheduler:
1212
+ value: false
1213
+ use_cpu_initialization:
1214
+ value: null
1215
+ use_custom_fsdp:
1216
+ value: false
1217
+ use_dist_ckpt:
1218
+ value: true
1219
+ use_dist_ckpt_deprecated:
1220
+ value: false
1221
+ use_distributed_optimizer:
1222
+ value: false
1223
+ use_flash_attn:
1224
+ value: true
1225
+ use_legacy_models:
1226
+ value: false
1227
+ use_mp_args_from_checkpoint_args:
1228
+ value: false
1229
+ use_one_sent_docs:
1230
+ value: false
1231
+ use_persistent_ckpt_worker:
1232
+ value: false
1233
+ use_precision_aware_optimizer:
1234
+ value: false
1235
+ use_pytorch_profiler:
1236
+ value: false
1237
+ use_ring_exchange_p2p:
1238
+ value: false
1239
+ use_rope_scaling:
1240
+ value: false
1241
+ use_rotary_position_embeddings:
1242
+ value: false
1243
+ use_sharp:
1244
+ value: false
1245
+ use_tokenizer_model_from_checkpoint_args:
1246
+ value: true
1247
+ use_torch_fsdp2:
1248
+ value: false
1249
+ use_torch_optimizer_for_cpu_offload:
1250
+ value: false
1251
+ use_tp_pp_dp_mapping:
1252
+ value: false
1253
+ v_head_dim:
1254
+ value: 128
1255
+ valid_data_path:
1256
+ value: null
1257
+ variable_seq_lengths:
1258
+ value: false
1259
+ virtual_pipeline_model_parallel_size:
1260
+ value: null
1261
+ vision_backbone_type:
1262
+ value: vit
1263
+ vision_pretraining:
1264
+ value: false
1265
+ vision_pretraining_type:
1266
+ value: classify
1267
+ vocab_extra_ids:
1268
+ value: 0
1269
+ vocab_file:
1270
+ value: null
1271
+ vocab_size:
1272
+ value: 32000
1273
+ wandb_exp_name:
1274
+ value: poziomka_5
1275
+ wandb_project:
1276
+ value: poziomka
1277
+ wandb_save_dir:
1278
+ value: ""
1279
+ weight_decay:
1280
+ value: 0.1
1281
+ weight_decay_incr_style:
1282
+ value: constant
1283
+ wgrad_deferral_limit:
1284
+ value: 0
1285
+ world_size:
1286
+ value: 8
1287
+ yaml_cfg:
1288
+ value: null
wandb/wandb/run-20250922_220405-hrldy3bw/files/output.log ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:184173ebbe892d0082a25f8f9c003dff5ca6df791cee3e7dfa216517dc2a9dbd
3
+ size 15656694
wandb/wandb/run-20250922_220405-hrldy3bw/files/requirements.txt ADDED
@@ -0,0 +1,173 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ GitPython==3.1.45
2
+ psutils==3.3.11
3
+ networkx==3.3
4
+ dill==0.4.0
5
+ requests==2.32.5
6
+ nvidia-cusparselt-cu12==0.7.1
7
+ ml_dtypes==0.5.3
8
+ pyarrow==21.0.0
9
+ gitdb==4.0.12
10
+ packaging==24.2
11
+ pydantic_core==2.33.2
12
+ torchvision==0.23.0+cu129
13
+ mpmath==1.3.0
14
+ nvidia-cusolver-cu12==11.7.5.82
15
+ nvidia-cuda-runtime-cu12==12.9.79
16
+ propcache==0.3.2
17
+ psutil==7.1.0
18
+ onnx-ir==0.1.9
19
+ nvidia-cusparse-cu12==12.5.10.65
20
+ aiohttp==3.12.15
21
+ aiosignal==1.4.0
22
+ protobuf==6.32.1
23
+ apex==0.1
24
+ torch==2.8.0+cu129
25
+ nvidia-cublas-cu12==12.9.1.4
26
+ frozenlist==1.7.0
27
+ nvidia-cufile-cu12==1.14.1.1
28
+ onnxscript==0.3.1
29
+ smmap==5.0.2
30
+ nvidia-cuda-nvrtc-cu12==12.9.86
31
+ pandas==2.3.2
32
+ platformdirs==4.4.0
33
+ nvidia-nvjitlink-cu12==12.9.86
34
+ pypdf==6.1.0
35
+ puremagic==1.30
36
+ regex==2025.9.18
37
+ triton==3.4.0
38
+ pip==25.2
39
+ pydantic==2.11.9
40
+ charset-normalizer==3.4.3
41
+ nvidia-cufft-cu12==11.4.1.4
42
+ urllib3==2.5.0
43
+ nvidia-cudnn-cu12==9.10.2.21
44
+ tzdata==2025.2
45
+ wandb==0.22.0
46
+ datasets==4.1.1
47
+ huggingface-hub==0.35.0
48
+ transformers==4.56.2
49
+ tqdm==4.67.1
50
+ megatron-core==0.13.0
51
+ tiktoken==0.11.0
52
+ hf_transfer==0.1.9
53
+ multiprocess==0.70.16
54
+ python-dateutil==2.9.0.post0
55
+ multidict==6.6.4
56
+ sentry-sdk==2.38.0
57
+ aiohappyeyeballs==2.6.1
58
+ onnx==1.19.0
59
+ einops==0.8.1
60
+ sympy==1.13.3
61
+ setuptools==80.9.0
62
+ pillow==11.0.0
63
+ filelock==3.19.1
64
+ hf-xet==1.1.10
65
+ flash_attn_3==3.0.0b1
66
+ ninja==1.13.0
67
+ fsspec==2025.9.0
68
+ nvidia-curand-cu12==10.3.10.19
69
+ bitsandbytes==0.47.0
70
+ nvidia-nccl-cu12==2.27.3
71
+ typing-inspection==0.4.1
72
+ xxhash==3.5.0
73
+ numpy==1.26.4
74
+ tokenizers==0.22.1
75
+ typing_extensions==4.15.0
76
+ safetensors==0.6.2
77
+ annotated-types==0.7.0
78
+ transformer_engine==2.6.0.post1
79
+ nvidia-nvtx-cu12==12.9.79
80
+ async-timeout==5.0.1
81
+ transformer_engine_cu12==2.6.0.post1
82
+ transformer_engine_torch==2.6.0.post1
83
+ nvidia-cuda-cupti-cu12==12.9.79
84
+ wheel==0.45.1
85
+ yarl==1.20.1
86
+ pybind11==3.0.1
87
+ python-debian==0.1.43+ubuntu1.1
88
+ SecretStorage==3.3.1
89
+ lazr.restfulclient==0.14.4
90
+ pytz==2022.1
91
+ attrs==21.2.0
92
+ zope.interface==5.4.0
93
+ chardet==4.0.0
94
+ pyasn1-modules==0.2.1
95
+ setuptools==59.6.0
96
+ Jinja2==3.0.3
97
+ pyasn1==0.4.8
98
+ netifaces==0.11.0
99
+ ubuntu-drivers-common==0.0.0
100
+ click==8.0.3
101
+ dbus-python==1.2.18
102
+ pyserial==3.5
103
+ python-apt==2.4.0+ubuntu4
104
+ PyJWT==2.3.0
105
+ oauthlib==3.2.0
106
+ bcrypt==3.2.0
107
+ python-magic==0.4.24
108
+ xkit==0.0.0
109
+ constantly==15.1.0
110
+ blinker==1.4
111
+ PyYAML==5.4.1
112
+ distro-info==1.1+ubuntu0.2
113
+ lazr.uri==1.0.6
114
+ distro==1.7.0
115
+ pexpect==4.8.0
116
+ PyGObject==3.42.1
117
+ ssh-import-id==5.11
118
+ cryptography==3.4.8
119
+ certifi==2020.6.20
120
+ service-identity==18.1.0
121
+ cloud-init==25.1.2
122
+ keyring==23.5.0
123
+ jeepney==0.7.1
124
+ colorama==0.4.4
125
+ idna==3.3
126
+ MarkupSafe==2.0.1
127
+ pip==22.0.2
128
+ ptyprocess==0.7.0
129
+ configobj==5.0.6
130
+ hyperlink==21.0.0
131
+ pyparsing==2.4.7
132
+ ufw==0.36.1
133
+ pyrsistent==0.18.1
134
+ httplib2==0.20.2
135
+ sos==4.8.2
136
+ unattended-upgrades==0.1
137
+ requests==2.25.1
138
+ ubuntu-pro-client==8001
139
+ launchpadlib==1.10.16
140
+ six==1.16.0
141
+ urllib3==1.26.5
142
+ systemd-python==234
143
+ importlib-metadata==4.6.4
144
+ command-not-found==0.3
145
+ jsonschema==3.2.0
146
+ Automat==20.2.0
147
+ more-itertools==8.10.0
148
+ PyHamcrest==2.0.2
149
+ incremental==21.3.0
150
+ zipp==1.0.0
151
+ jsonpointer==2.0
152
+ Twisted==22.1.0
153
+ pyOpenSSL==21.0.0
154
+ wadllib==1.3.6
155
+ Babel==2.8.0
156
+ jsonpatch==1.32
157
+ wheel==0.37.1
158
+ platformdirs==4.2.2
159
+ typing_extensions==4.12.2
160
+ packaging==24.2
161
+ tomli==2.0.1
162
+ inflect==7.3.1
163
+ jaraco.context==5.3.0
164
+ backports.tarfile==1.2.0
165
+ autocommand==2.2.2
166
+ importlib_metadata==8.0.0
167
+ more-itertools==10.3.0
168
+ jaraco.functools==4.0.1
169
+ typeguard==4.3.0
170
+ zipp==3.19.2
171
+ jaraco.collections==5.1.0
172
+ jaraco.text==3.12.1
173
+ wheel==0.45.1
wandb/wandb/run-20250922_220405-hrldy3bw/files/wandb-metadata.json ADDED
@@ -0,0 +1,248 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Linux-5.15.0-143-generic-x86_64-with-glibc2.35",
3
+ "python": "CPython 3.10.12",
4
+ "startedAt": "2025-09-22T22:04:05.928052Z",
5
+ "args": [
6
+ "--expert-model-parallel-size",
7
+ "2",
8
+ "--expert-tensor-parallel-size",
9
+ "1",
10
+ "--moe-grouped-gemm",
11
+ "--moe-token-dispatcher-type",
12
+ "alltoall",
13
+ "--moe-router-dtype",
14
+ "fp32",
15
+ "--num-experts",
16
+ "128",
17
+ "--moe-ffn-hidden-size",
18
+ "320",
19
+ "--moe-shared-expert-intermediate-size",
20
+ "320",
21
+ "--moe-router-score-function",
22
+ "sigmoid",
23
+ "--moe-router-topk",
24
+ "4",
25
+ "--moe-router-enable-expert-bias",
26
+ "--moe-router-topk-scaling-factor",
27
+ "2.5",
28
+ "--moe-router-num-groups",
29
+ "8",
30
+ "--moe-router-group-topk",
31
+ "2",
32
+ "--moe-z-loss-coeff",
33
+ "0.0000035",
34
+ "--moe-router-bias-update-rate",
35
+ "1e-3",
36
+ "--moe-layer-freq",
37
+ "[0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]",
38
+ "--bias-zero-mean-update",
39
+ "--moe-expert-capacity-factor",
40
+ "1.25",
41
+ "--moe-pad-expert-input-to-capacity",
42
+ "--moe-shared-expert-overlap",
43
+ "--num-layers",
44
+ "16",
45
+ "--hidden-size",
46
+ "2048",
47
+ "--ffn-hidden-size",
48
+ "2048",
49
+ "--num-attention-heads",
50
+ "16",
51
+ "--num-query-groups",
52
+ "4",
53
+ "--group-query-attention",
54
+ "--qk-layernorm",
55
+ "--use-flash-attn",
56
+ "--max-position-embeddings",
57
+ "8192",
58
+ "--vocab-size",
59
+ "32000",
60
+ "--make-vocab-size-divisible-by",
61
+ "128",
62
+ "--position-embedding-type",
63
+ "rope",
64
+ "--rotary-base",
65
+ "84000",
66
+ "--rotary-percent",
67
+ "0.5",
68
+ "--rotary-scaling-factor",
69
+ "40",
70
+ "--swiglu",
71
+ "--untie-embeddings-and-output-weights",
72
+ "--normalization",
73
+ "RMSNorm",
74
+ "--norm-epsilon",
75
+ "1e-06",
76
+ "--disable-bias-linear",
77
+ "--transformer-impl",
78
+ "transformer_engine",
79
+ "--attention-dropout",
80
+ "0",
81
+ "--hidden-dropout",
82
+ "0",
83
+ "--micro-batch-size",
84
+ "8",
85
+ "--global-batch-size",
86
+ "256",
87
+ "--seq-length",
88
+ "8192",
89
+ "--train-iters",
90
+ "50000",
91
+ "--weight-decay",
92
+ "0.1",
93
+ "--adam-beta1",
94
+ "0.9",
95
+ "--adam-beta2",
96
+ "0.95",
97
+ "--init-method-std",
98
+ "0.02",
99
+ "--clip-grad",
100
+ "1.0",
101
+ "--bf16",
102
+ "--optimizer",
103
+ "adam",
104
+ "--lr",
105
+ "8.0e-4",
106
+ "--lr-decay-style",
107
+ "cosine",
108
+ "--min-lr",
109
+ "4.00e-5",
110
+ "--lr-warmup-iters",
111
+ "100",
112
+ "--seed",
113
+ "50",
114
+ "--pipeline-model-parallel-size",
115
+ "1",
116
+ "--tensor-model-parallel-size",
117
+ "4",
118
+ "--sequence-parallel",
119
+ "--overlap-grad-reduce",
120
+ "--data-path",
121
+ "szypulka_tokenized_apt4_merged/apt4_merged_text_document",
122
+ "--tokenizer-type",
123
+ "HuggingFaceTokenizer",
124
+ "--tokenizer-model",
125
+ "/home/ubuntu/training/Ling-V2/examples/pretrain/../../resource/tokenizer/apt4",
126
+ "--split",
127
+ "9999,1,0",
128
+ "--dataloader-type",
129
+ "single",
130
+ "--no-create-attention-mask-in-dataloader",
131
+ "--eod-mask-loss",
132
+ "--save-interval",
133
+ "1600",
134
+ "--eval-interval",
135
+ "1600",
136
+ "--eval-iters",
137
+ "2",
138
+ "--save",
139
+ "poziomka_5",
140
+ "--ckpt-format",
141
+ "torch_dist",
142
+ "--async-save",
143
+ "--log-interval",
144
+ "1",
145
+ "--log-throughput",
146
+ "--tensorboard-dir",
147
+ "poziomka_5/runs",
148
+ "--log-timers-to-tensorboard",
149
+ "--log-memory-to-tensorboard",
150
+ "--log-world-size-to-tensorboard",
151
+ "--log-validation-ppl-to-tensorboard",
152
+ "--wandb-project",
153
+ "poziomka",
154
+ "--wandb-exp-name",
155
+ "poziomka_5",
156
+ "--attention-backend",
157
+ "flash",
158
+ "--no-masked-softmax-fusion",
159
+ "--attention-softmax-in-fp32",
160
+ "--cross-entropy-loss-fusion",
161
+ "--mtp-num-layers",
162
+ "0"
163
+ ],
164
+ "program": "/home/ubuntu/training/Ling-V2/Megatron-LM-core_v0.13.0/pretrain_gpt.py",
165
+ "codePath": "Megatron-LM-core_v0.13.0/pretrain_gpt.py",
166
+ "codePathLocal": "Megatron-LM-core_v0.13.0/pretrain_gpt.py",
167
+ "git": {
168
+ "remote": "https://github.com/adamo1139/Ling-V2.git",
169
+ "commit": "e3867293ebf444f614164c2b84180cd75e7de07c"
170
+ },
171
+ "email": "adamo1139@gmail.com",
172
+ "root": "poziomka_5/wandb",
173
+ "host": "megatron6",
174
+ "executable": "/usr/bin/python3",
175
+ "cpu_count": 128,
176
+ "cpu_count_logical": 128,
177
+ "gpu": "NVIDIA H100 80GB HBM3",
178
+ "gpu_count": 8,
179
+ "disk": {
180
+ "/": {
181
+ "total": "2907329073152",
182
+ "used": "390033698816"
183
+ }
184
+ },
185
+ "memory": {
186
+ "total": "1014522519552"
187
+ },
188
+ "gpu_nvidia": [
189
+ {
190
+ "name": "NVIDIA H100 80GB HBM3",
191
+ "memoryTotal": "85520809984",
192
+ "cudaCores": 16896,
193
+ "architecture": "Hopper",
194
+ "uuid": "GPU-b4eb54a1-d73f-9179-04f2-b231b6a39a34"
195
+ },
196
+ {
197
+ "name": "NVIDIA H100 80GB HBM3",
198
+ "memoryTotal": "85520809984",
199
+ "cudaCores": 16896,
200
+ "architecture": "Hopper",
201
+ "uuid": "GPU-742b8534-6865-3da0-d864-a822a5d5d629"
202
+ },
203
+ {
204
+ "name": "NVIDIA H100 80GB HBM3",
205
+ "memoryTotal": "85520809984",
206
+ "cudaCores": 16896,
207
+ "architecture": "Hopper",
208
+ "uuid": "GPU-19d31f38-1be1-eced-5b78-b0d3f4deae56"
209
+ },
210
+ {
211
+ "name": "NVIDIA H100 80GB HBM3",
212
+ "memoryTotal": "85520809984",
213
+ "cudaCores": 16896,
214
+ "architecture": "Hopper",
215
+ "uuid": "GPU-1b41c967-636a-e3f1-5d74-2da616c06a3e"
216
+ },
217
+ {
218
+ "name": "NVIDIA H100 80GB HBM3",
219
+ "memoryTotal": "85520809984",
220
+ "cudaCores": 16896,
221
+ "architecture": "Hopper",
222
+ "uuid": "GPU-51d2a37c-3157-7b61-73b3-4a7914884549"
223
+ },
224
+ {
225
+ "name": "NVIDIA H100 80GB HBM3",
226
+ "memoryTotal": "85520809984",
227
+ "cudaCores": 16896,
228
+ "architecture": "Hopper",
229
+ "uuid": "GPU-499c302d-fc3c-c679-61e7-a2c4d46b8449"
230
+ },
231
+ {
232
+ "name": "NVIDIA H100 80GB HBM3",
233
+ "memoryTotal": "85520809984",
234
+ "cudaCores": 16896,
235
+ "architecture": "Hopper",
236
+ "uuid": "GPU-8eac440c-b327-20e6-1809-ef3e549d6c6d"
237
+ },
238
+ {
239
+ "name": "NVIDIA H100 80GB HBM3",
240
+ "memoryTotal": "85520809984",
241
+ "cudaCores": 16896,
242
+ "architecture": "Hopper",
243
+ "uuid": "GPU-5eba09b7-f1fc-555e-d07c-7b6227264759"
244
+ }
245
+ ],
246
+ "cudaVersion": "12.8",
247
+ "writerId": "9vte3cwjfuxykvlnatinaorhdm7hrpxl"
248
+ }
wandb/wandb/run-20250922_220405-hrldy3bw/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"_timestamp":1.7588892349026182e+09,"throughput":142.40106310791458,"_wandb":{"runtime":310590},"_runtime":310590.549572079,"_step":43729,"iteration-time":7.31538462638855}
wandb/wandb/run-20250922_220405-hrldy3bw/logs/debug-core.log ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2025-09-22T22:04:05.949917481Z","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmpvwq5pw5s/port-40865.txt","pid":40865,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false}
2
+ {"time":"2025-09-22T22:04:05.951064803Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":40865}
3
+ {"time":"2025-09-22T22:04:05.951060304Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/tmp/wandb-40865-41784-466403699/socket","Net":"unix"}}
4
+ {"time":"2025-09-22T22:04:06.13426601Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"}
5
+ {"time":"2025-09-22T22:04:06.143173217Z","level":"INFO","msg":"handleInformInit: received","streamId":"hrldy3bw","id":"1(@)"}
6
+ {"time":"2025-09-22T22:04:06.487650443Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"hrldy3bw","id":"1(@)"}
7
+ {"time":"2025-09-26T12:20:37.296353193Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"}
8
+ {"time":"2025-09-26T12:20:37.296955718Z","level":"INFO","msg":"connection: closing","id":"1(@)"}
9
+ {"time":"2025-09-26T12:20:37.297053401Z","level":"INFO","msg":"connection: closed successfully","id":"1(@)"}
10
+ {"time":"2025-09-26T12:20:37.297072505Z","level":"INFO","msg":"server is shutting down"}
11
+ {"time":"2025-09-26T12:20:37.297624126Z","level":"INFO","msg":"server: listener closed","addr":{"Name":"/tmp/wandb-40865-41784-466403699/socket","Net":"unix"}}
wandb/wandb/run-20250922_220405-hrldy3bw/logs/debug-internal.log ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2025-09-22T22:04:06.143255097Z","level":"INFO","msg":"stream: starting","core version":"0.22.0"}
2
+ {"time":"2025-09-22T22:04:06.487372274Z","level":"INFO","msg":"stream: created new stream","id":"hrldy3bw"}
3
+ {"time":"2025-09-22T22:04:06.487645224Z","level":"INFO","msg":"stream: started","id":"hrldy3bw"}
4
+ {"time":"2025-09-22T22:04:06.487690464Z","level":"INFO","msg":"sender: started","stream_id":"hrldy3bw"}
5
+ {"time":"2025-09-22T22:04:06.487691275Z","level":"INFO","msg":"writer: started","stream_id":"hrldy3bw"}
6
+ {"time":"2025-09-22T22:04:06.487752261Z","level":"INFO","msg":"handler: started","stream_id":"hrldy3bw"}
7
+ {"time":"2025-09-23T20:24:34.768930029Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
8
+ {"time":"2025-09-24T15:33:04.643961764Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
9
+ {"time":"2025-09-24T15:48:05.092383968Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
10
+ {"time":"2025-09-24T16:41:14.895690245Z","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/graphql","body":"\n<html><head>\n<meta http-equiv=\"content-type\" content=\"text/html;charset=utf-8\">\n<title>502 Server Error</title>\n</head>\n<body text=#000000 bgcolor=#ffffff>\n<h1>Error: Server Error</h1>\n<h2>The server encountered a temporary error and could not complete your request.<p>Please try again in 30 seconds.</h2>\n<h2></h2>\n</body></html>\n"}
11
+ {"time":"2025-09-25T16:26:04.894084919Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded (Client.Timeout exceeded while awaiting headers)"}
12
+ {"time":"2025-09-25T17:26:05.511033911Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded (Client.Timeout exceeded while awaiting headers)"}
13
+ {"time":"2025-09-25T22:42:38.263384097Z","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/adamo1139-no/poziomka/hrldy3bw/file_stream","body":"\n<html><head>\n<meta http-equiv=\"content-type\" content=\"text/html;charset=utf-8\">\n<title>502 Server Error</title>\n</head>\n<body text=#000000 bgcolor=#ffffff>\n<h1>Error: Server Error</h1>\n<h2>The server encountered a temporary error and could not complete your request.<p>Please try again in 30 seconds.</h2>\n<h2></h2>\n</body></html>\n"}
14
+ {"time":"2025-09-25T23:02:46.550701182Z","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/adamo1139-no/poziomka/hrldy3bw/file_stream","body":"\n<html><head>\n<meta http-equiv=\"content-type\" content=\"text/html;charset=utf-8\">\n<title>502 Server Error</title>\n</head>\n<body text=#000000 bgcolor=#ffffff>\n<h1>Error: Server Error</h1>\n<h2>The server encountered a temporary error and could not complete your request.<p>Please try again in 30 seconds.</h2>\n<h2></h2>\n</body></html>\n"}
15
+ {"time":"2025-09-26T12:20:37.296973248Z","level":"INFO","msg":"stream: closing","id":"hrldy3bw"}
16
+ {"time":"2025-09-26T12:20:38.848817943Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
17
+ {"time":"2025-09-26T12:20:39.03451843Z","level":"INFO","msg":"handler: closed","stream_id":"hrldy3bw"}
18
+ {"time":"2025-09-26T12:20:39.034655715Z","level":"INFO","msg":"sender: closed","stream_id":"hrldy3bw"}
19
+ {"time":"2025-09-26T12:20:39.034688219Z","level":"INFO","msg":"stream: closed","id":"hrldy3bw"}
wandb/wandb/run-20250922_220405-hrldy3bw/logs/debug.log ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-09-22 22:04:05,929 INFO MainThread:40865 [wandb_setup.py:_flush():81] Current SDK version is 0.22.0
2
+ 2025-09-22 22:04:05,929 INFO MainThread:40865 [wandb_setup.py:_flush():81] Configure stats pid to 40865
3
+ 2025-09-22 22:04:05,929 INFO MainThread:40865 [wandb_setup.py:_flush():81] Loading settings from /root/.config/wandb/settings
4
+ 2025-09-22 22:04:05,929 INFO MainThread:40865 [wandb_setup.py:_flush():81] Loading settings from /home/ubuntu/training/Ling-V2/wandb/settings
5
+ 2025-09-22 22:04:05,929 INFO MainThread:40865 [wandb_setup.py:_flush():81] Loading settings from environment variables
6
+ 2025-09-22 22:04:05,929 INFO MainThread:40865 [wandb_init.py:setup_run_log_directory():686] Logging user logs to poziomka_5/wandb/wandb/run-20250922_220405-hrldy3bw/logs/debug.log
7
+ 2025-09-22 22:04:05,929 INFO MainThread:40865 [wandb_init.py:setup_run_log_directory():687] Logging internal logs to poziomka_5/wandb/wandb/run-20250922_220405-hrldy3bw/logs/debug-internal.log
8
+ 2025-09-22 22:04:05,929 INFO MainThread:40865 [wandb_init.py:init():813] calling init triggers
9
+ 2025-09-22 22:04:05,929 INFO MainThread:40865 [wandb_init.py:init():818] wandb.init called with sweep_config: {}
10
+ config: {'num_layers': 16, 'encoder_num_layers': 16, 'decoder_num_layers': None, 'hidden_size': 2048, 'ffn_hidden_size': 2048, 'num_attention_heads': 16, 'attention_backend': <AttnBackend.flash: 1>, 'kv_channels': 128, 'group_query_attention': True, 'num_query_groups': 4, 'max_position_embeddings': 8192, 'position_embedding_type': 'rope', 'relative_attention_num_buckets': 32, 'relative_attention_max_distance': 128, 'use_rotary_position_embeddings': False, 'rotary_base': 84000, 'rotary_percent': 0.5, 'rotary_interleaved': False, 'rotary_seq_len_interpolation_factor': None, 'use_rope_scaling': False, 'rope_scaling_factor': 8.0, 'no_rope_freq': None, 'add_position_embedding': True, 'mrope_section': None, 'make_vocab_size_divisible_by': 128, 'normalization': 'RMSNorm', 'norm_epsilon': 1e-06, 'apply_layernorm_1p': False, 'apply_residual_connection_post_layernorm': False, 'openai_gelu': False, 'squared_relu': False, 'swiglu': True, 'onnx_safe': None, 'bert_binary_head': True, 'untie_embeddings_and_output_weights': True, 'multi_latent_attention': False, 'mtp_num_layers': 0, 'mtp_loss_scaling_factor': 0.1, 'bias_zero_mean_update': True, 'attention_dropout': 0.0, 'hidden_dropout': 0.0, 'weight_decay': 0.1, 'start_weight_decay': 0.1, 'end_weight_decay': 0.1, 'weight_decay_incr_style': 'constant', 'clip_grad': 1.0, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_eps': 1e-08, 'sgd_momentum': 0.9, 'micro_batch_size': 8, 'global_batch_size': 256, 'rampup_batch_size': None, 'decrease_batch_size_if_needed': False, 'recompute_granularity': None, 'check_for_nan_in_loss_and_grad': True, 'check_for_spiky_loss': False, 'check_for_large_grads': False, 'distribute_saved_activations': False, 'recompute_method': None, 'recompute_num_layers': None, 'recompute_modules': None, 'clone_scatter_output_in_embedding': True, 'profile': False, 'profile_step_start': 10, 'profile_step_end': 12, 'iterations_to_skip': [], 'result_rejected_tracker_filename': None, 'enable_gloo_process_groups': True, 
'use_pytorch_profiler': False, 'profile_ranks': [0], 'record_memory_history': False, 'memory_snapshot_path': 'snapshot.pickle', 'tp_comm_overlap': False, 'tp_comm_overlap_cfg': None, 'tp_comm_overlap_ag': True, 'tp_comm_overlap_rs': True, 'tp_comm_overlap_rs_dgrad': False, 'tp_comm_bulk_dgrad': True, 'tp_comm_bulk_wgrad': True, 'tp_comm_bootstrap_backend': 'nccl', 'use_cpu_initialization': None, 'empty_unused_memory_level': 0, 'deterministic_mode': False, 'check_weight_hash_across_dp_replicas_interval': None, 'calculate_per_token_loss': False, 'train_sync_interval': None, 'train_iters': 50000, 'train_samples': None, 'log_interval': 1, 'exit_interval': None, 'exit_duration_in_mins': None, 'exit_signal_handler': False, 'tensorboard_dir': 'poziomka_5/runs', 'masked_softmax_fusion': False, 'bias_gelu_fusion': False, 'bias_swiglu_fusion': True, 'bias_dropout_fusion': True, 'apply_rope_fusion': True, 'cross_entropy_loss_fusion': True, 'cross_entropy_fusion_impl': 'native', 'use_flash_attn': True, 'add_bias_linear': False, 'add_qkv_bias': False, 'optimizer': 'adam', 'optimizer_cpu_offload': False, 'optimizer_offload_fraction': 1.0, 'use_torch_optimizer_for_cpu_offload': False, 'overlap_cpu_optimizer_d2h_h2d': False, 'pin_cpu_grads': True, 'pin_cpu_params': True, 'dataloader_type': 'single', 'async_tensor_model_parallel_allreduce': True, 'no_persist_layer_norm': False, 'sequence_parallel': True, 'gradient_accumulation_fusion': True, 'deprecated_use_mcore_models': False, 'use_legacy_models': False, 'manual_gc': False, 'manual_gc_interval': 0, 'manual_gc_eval': True, 'tp_comm_split_ag': True, 'tp_comm_split_rs': True, 'pipeline_model_parallel_comm_backend': None, 'high_priority_stream_groups': [], 'seed': 50, 'data_parallel_random_init': False, 'init_method_std': 0.02, 'init_method_xavier_uniform': False, 'lr': 0.0008, 'lr_decay_style': 'cosine', 'lr_wsd_decay_style': 'exponential', 'lr_decay_iters': None, 'lr_decay_samples': None, 'lr_wsd_decay_samples': None, 
'lr_wsd_decay_iters': None, 'lr_warmup_fraction': None, 'lr_warmup_iters': 100, 'lr_warmup_samples': 0, 'lr_warmup_init': 0.0, 'min_lr': 4e-05, 'override_opt_param_scheduler': False, 'use_checkpoint_opt_param_scheduler': False, 'decoupled_lr': None, 'decoupled_min_lr': None, 'save': 'poziomka_5', 'save_interval': 1600, 'no_save_optim': None, 'no_save_rng': None, 'load': None, 'no_load_optim': None, 'no_load_rng': None, 'non_persistent_save_interval': None, 'non_persistent_ckpt_type': None, 'non_persistent_global_ckpt_dir': None, 'non_persistent_local_ckpt_dir': None, 'non_persistent_local_ckpt_algo': 'fully_parallel', 'finetune': False, 'pretrained_checkpoint': None, 'ckpt_step': None, 'perform_initialization': True, 'use_checkpoint_args': False, 'use_mp_args_from_checkpoint_args': False, 'use_tokenizer_model_from_checkpoint_args': True, 'exit_on_missing_checkpoint': False, 'use_dist_ckpt_deprecated': False, 'use_persistent_ckpt_worker': False, 'auto_detect_ckpt_format': False, 'dist_ckpt_format_deprecated': None, 'ckpt_format': 'torch_dist', 'ckpt_convert_format': None, 'ckpt_convert_save': None, 'ckpt_convert_update_legacy_dist_opt_format': False, 'ckpt_fully_parallel_save_deprecated': False, 'ckpt_fully_parallel_save': True, 'async_save': True, 'ckpt_fully_parallel_load': False, 'ckpt_assume_constant_structure': False, 'dist_ckpt_strictness': 'assume_ok_unexpected', 'load_model_opt_format': False, 'fp16': False, 'bf16': True, 'grad_reduce_in_bf16': False, 'loss_scale': None, 'initial_loss_scale': 4294967296, 'min_loss_scale': 1.0, 'loss_scale_window': 1000, 'hysteresis': 2, 'fp32_residual_connection': False, 'apply_query_key_layer_scaling': False, 'attention_softmax_in_fp32': True, 'accumulate_allreduce_grads_in_fp32': True, 'fp16_lm_cross_entropy': False, 'disable_bf16_reduced_precision_matmul': False, 'reuse_grad_buf_for_mxfp8_param_ag': False, 'tensor_model_parallel_size': 4, 'encoder_tensor_model_parallel_size': 0, 'pipeline_model_parallel_size': 1, 
'encoder_pipeline_model_parallel_size': 0, 'pipeline_model_parallel_split_rank': None, 'decoder_first_pipeline_num_layers': None, 'decoder_last_pipeline_num_layers': None, 'pipeline_model_parallel_layout': None, 'num_layers_per_virtual_pipeline_stage': None, 'num_virtual_stages_per_pipeline_rank': None, 'microbatch_group_size_per_vp_stage': None, 'overlap_p2p_comm': False, 'overlap_p2p_comm_warmup_flush': False, 'distributed_backend': 'nccl', 'distributed_timeout_minutes': 10, 'overlap_grad_reduce': True, 'defer_embedding_wgrad_compute': False, 'wgrad_deferral_limit': 0, 'align_grad_reduce': True, 'ddp_num_buckets': None, 'ddp_bucket_size': None, 'ddp_pad_buckets_for_high_nccl_busbw': False, 'ddp_average_in_collective': False, 'overlap_param_gather': False, 'overlap_param_gather_with_optimizer_step': False, 'align_param_gather': False, 'scatter_gather_tensors_in_pipeline': True, 'use_ring_exchange_p2p': False, 'local_rank': 7, 'lazy_mpu_init': None, 'account_for_embedding_in_pipeline_split': False, 'account_for_loss_in_pipeline_split': False, 'use_distributed_optimizer': False, 'nccl_ub': False, 'use_sharp': False, 'use_custom_fsdp': False, 'init_model_with_meta_device': False, 'data_parallel_sharding_strategy': 'no_shard', 'gradient_reduce_div_fusion': True, 'fsdp_double_buffer': False, 'suggested_communication_unit_size': None, 'keep_fp8_transpose_cache_when_using_custom_fsdp': False, 'num_distributed_optimizer_instances': 1, 'use_torch_fsdp2': False, 'torch_fsdp2_reshard_after_forward': True, 'context_parallel_size': 1, 'cp_comm_type': ['p2p'], 'hierarchical_context_parallel_sizes': None, 'nccl_communicator_config_path': None, 'use_tp_pp_dp_mapping': False, 'replication': False, 'replication_jump': None, 'replication_factor': 2, 'eval_iters': 2, 'eval_interval': 1600, 'test_mode': False, 'skip_train': False, 'data_path': ['szypulka_tokenized_apt4_merged/apt4_merged_text_document'], 'split': '9999,1,0', 'train_data_path': None, 'valid_data_path': None, 
'test_data_path': None, 'data_args_path': None, 'per_split_data_args_path': None, 'data_cache_path': None, 'mmap_bin_files': True, 'mock_data': False, 'seq_length': 8192, 'encoder_seq_length': 8192, 'decoder_seq_length': None, 'retriever_seq_length': 256, 'sample_rate': 1.0, 'mask_prob': 0.15, 'short_seq_prob': 0.1, 'num_workers': 2, 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': True, 'create_attention_mask_in_dataloader': False, 'num_dataset_builder_threads': 1, 'object_storage_cache_path': None, 'mid_level_dataset_surplus': 0.005, 'vocab_size': 32000, 'vocab_file': None, 'merge_file': None, 'vocab_extra_ids': 0, 'tokenizer_type': 'HuggingFaceTokenizer', 'tokenizer_model': '/home/ubuntu/training/Ling-V2/examples/pretrain/../../resource/tokenizer/apt4', 'tiktoken_pattern': None, 'tiktoken_num_special_tokens': 1000, 'tiktoken_special_tokens': None, 'adlr_autoresume': False, 'adlr_autoresume_interval': 1000, 'ict_head_size': None, 'biencoder_projection_dim': 0, 'biencoder_shared_query_context_model': False, 'ict_load': None, 'bert_load': None, 'titles_data_path': None, 'query_in_block_prob': 0.1, 'use_one_sent_docs': False, 'evidence_data_path': None, 'retriever_report_topk_accuracies': [], 'retriever_score_scaling': False, 'block_data_path': None, 'embedding_path': None, 'indexer_batch_size': 128, 'indexer_log_interval': 1000, 'num_classes': 1000, 'img_h': 224, 'img_w': 224, 'num_channels': 3, 'patch_dim': 16, 'classes_fraction': 1.0, 'data_per_class_fraction': 1.0, 'data_sharding': True, 'head_lr_mult': 1.0, 'vision_pretraining': False, 'vision_pretraining_type': 'classify', 'vision_backbone_type': 'vit', 'swin_backbone_type': 'tiny', 'mask_type': 'random', 'mask_factor': 1.0, 'iter_per_epoch': 1250, 'dino_local_img_size': 96, 'dino_local_crops_number': 10, 'dino_head_hidden_size': 2048, 'dino_bottleneck_size': 256, 'dino_freeze_last_layer': 1, 'dino_norm_last_layer': False, 'dino_warmup_teacher_temp': 0.04, 'dino_teacher_temp': 0.07, 
'dino_warmup_teacher_temp_epochs': 30, 'qk_layernorm': True, 'qk_l2_norm': False, 'expert_model_parallel_size': 2, 'expert_tensor_parallel_size': 1, 'num_experts': 128, 'moe_layer_freq': [0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'moe_ffn_hidden_size': 320, 'moe_shared_expert_intermediate_size': 320, 'moe_shared_expert_overlap': True, 'moe_grouped_gemm': True, 'moe_use_legacy_grouped_gemm': False, 'moe_layer_recompute': False, 'moe_extended_tp': False, 'moe_use_upcycling': False, 'moe_router_load_balancing_type': 'aux_loss', 'moe_router_dtype': 'fp32', 'skip_casting_dtype_for_param_pattern': '["^expert_bias$|.+\\.expert_bias$"]', 'moe_router_score_function': 'sigmoid', 'moe_router_topk': 4, 'moe_router_pre_softmax': False, 'moe_router_num_groups': 8, 'moe_router_group_topk': 2, 'moe_router_topk_scaling_factor': 2.5, 'moe_router_enable_expert_bias': True, 'moe_router_bias_update_rate': 0.001, 'moe_router_force_load_balancing': False, 'moe_router_padding_for_fp8': False, 'moe_aux_loss_coeff': 0.0, 'moe_z_loss_coeff': 3.5e-06, 'moe_input_jitter_eps': None, 'moe_per_layer_logging': False, 'moe_token_dispatcher_type': 'alltoall', 'moe_enable_deepep': False, 'moe_deepep_num_sms': 20, 'moe_permute_fusion': False, 'moe_expert_capacity_factor': 1.25, 'moe_pad_expert_input_to_capacity': True, 'moe_token_drop_policy': 'probs', 'moe_apply_probs_on_input': False, 'delay_wgrad_compute': False, 'moe_upcycling_granularity': 1, 'q_lora_rank': None, 'kv_lora_rank': 32, 'qk_head_dim': 128, 'qk_pos_emb_head_dim': 64, 'v_head_dim': 128, 'rotary_scaling_factor': 40.0, 'mscale': 1.0, 'mscale_all_dim': 1.0, 'heterogeneous_layers_config_path': None, 'heterogeneous_layers_config_encoded_json': None, 'log_params_norm': False, 'log_num_zeros_in_grad': False, 'log_throughput': True, 'log_progress': False, 'timing_log_level': 0, 'log_energy': False, 'barrier_with_L1_time': True, 'timing_log_option': 'minmax', 'tensorboard_log_interval': 1, 'tensorboard_queue_size': 1000, 
'log_timers_to_tensorboard': True, 'log_loss_scale_to_tensorboard': True, 'log_validation_ppl_to_tensorboard': True, 'log_memory_to_tensorboard': True, 'log_world_size_to_tensorboard': True, 'wandb_project': 'poziomka', 'wandb_exp_name': 'poziomka_5', 'wandb_save_dir': '', 'logging_level': None, 'log_straggler': False, 'disable_straggler_on_startup': False, 'straggler_ctrlr_port': 65535, 'straggler_minmax_count': 1, 'run_workload_inspector_server': False, 'inference_batch_times_seqlen_threshold': -1, 'max_tokens_to_oom': 12000, 'output_bert_embeddings': False, 'bert_embedder_type': 'megatron', 'flash_decode': False, 'enable_cuda_graph': False, 'cuda_graph_warmup_steps': 3, 'external_cuda_graph': False, 'cuda_graph_scope': 'full', 'inference_max_batch_size': 8, 'inference_max_seq_length': 2560, 'inference_dynamic_batching': False, 'inference_dynamic_batching_buffer_size_gb': 40.0, 'inference_dynamic_batching_chunk_size': 256, 'inference_dynamic_batching_buffer_guaranteed_fraction': 0.2, 'inference_dynamic_batching_buffer_overflow_factor': None, 'inference_dynamic_batching_max_requests_override': None, 'inference_dynamic_batching_max_tokens_override': None, 'symmetric_ar_type': None, 'nccl_all_reduce_for_prefill': False, 'mlp_chunks_for_prefill': 1, 'fp8': None, 'fp8_recipe': 'delayed', 'fp8_margin': 0, 'fp8_interval': 1, 'fp8_amax_history_len': 1, 'fp8_amax_compute_algo': 'most_recent', 'fp8_wgrad': True, 'transformer_impl': 'transformer_engine', 'fp8_param_gather': False, 'first_last_layers_bf16': False, 'num_layers_at_start_in_bf16': 1, 'num_layers_at_end_in_bf16': 1, 'te_rng_tracker': False, 'inference_rng_tracker': False, 'retro_project_dir': None, 'retro_add_retriever': False, 'retro_cyclic_train_iters': None, 'retro_encoder_layers': 2, 'retro_encoder_hidden_dropout': 0.1, 'retro_encoder_attention_dropout': 0.1, 'retro_num_neighbors': 2, 'retro_num_retrieved_chunks': 2, 'retro_attention_gate': 1, 'retro_verify_neighbor_count': True, 'enable_experimental': 
False, 'spec': None, 'hybrid_attention_ratio': 0.0, 'hybrid_mlp_ratio': 0.0, 'hybrid_override_pattern': None, 'mamba_state_dim': 128, 'mamba_head_dim': 64, 'mamba_num_groups': 8, 'mamba_num_heads': None, 'is_hybrid_model': False, 'disable_mamba_mem_eff_path': False, 'yaml_cfg': None, 'use_precision_aware_optimizer': False, 'main_grads_dtype': torch.float32, 'main_params_dtype': torch.float32, 'exp_avg_dtype': torch.float32, 'exp_avg_sq_dtype': torch.float32, 'enable_one_logger': True, 'one_logger_project': 'megatron-lm', 'one_logger_run_name': None, 'one_logger_async': False, 'app_tag_run_name': None, 'app_tag_run_version': '0.0.0', 'inprocess_restart': False, 'inprocess_max_iterations': None, 'inprocess_monitor_thread_interval': 1.0, 'inprocess_monitor_process_interval': 1.0, 'inprocess_progress_watchdog_interval': 1.0, 'inprocess_heartbeat_interval': 30, 'inprocess_soft_timeout': 60, 'inprocess_hard_timeout': 90, 'inprocess_heartbeat_timeout': 60, 'inprocess_barrier_timeout': 120, 'inprocess_completion_timeout': 120, 'inprocess_last_call_wait': 1, 'inprocess_termination_grace_time': 1, 'inprocess_granularity': 'node', 'inprocess_active_world_size': 8, 'inprocess_empty_cuda_cache': False, 'enable_ft_package': False, 'calc_ft_timeouts': False, 'config_logger_dir': '', 'error_injection_rate': 0, 'error_injection_type': 'transient_error', 'rerun_mode': 'disabled', 'enable_msc': True, 'kitchen_config_file': None, 'kitchen_recipe_number': None, 'sft': False, 'sft_tokenizer_prompt_format': 'nemotron-h-aligned', 'rank': 7, 'world_size': 8, 'use_dist_ckpt': True, 'transformer_pipeline_model_parallel_size': 1, 'data_parallel_size': 2, 'virtual_pipeline_model_parallel_size': None, 'params_dtype': torch.bfloat16, 'consumed_train_samples': 0, 'skipped_train_samples': 0, 'consumed_valid_samples': 0, 'variable_seq_lengths': False, 'padded_vocab_size': 32256, '_wandb': {}}
11
+ 2025-09-22 22:04:05,929 INFO MainThread:40865 [wandb_init.py:init():861] starting backend
12
+ 2025-09-22 22:04:06,134 INFO MainThread:40865 [wandb_init.py:init():864] sending inform_init request
13
+ 2025-09-22 22:04:06,137 INFO MainThread:40865 [wandb_init.py:init():872] backend started and connected
14
+ 2025-09-22 22:04:06,140 INFO MainThread:40865 [wandb_init.py:init():942] updated telemetry
15
+ 2025-09-22 22:04:06,144 INFO MainThread:40865 [wandb_init.py:init():966] communicating run to backend with 90.0 second timeout
16
+ 2025-09-22 22:04:06,744 INFO MainThread:40865 [wandb_init.py:init():1017] starting run threads in backend
17
+ 2025-09-22 22:04:06,836 INFO MainThread:40865 [wandb_run.py:_console_start():2506] atexit reg
18
+ 2025-09-22 22:04:06,836 INFO MainThread:40865 [wandb_run.py:_redirect():2354] redirect: wrap_raw
19
+ 2025-09-22 22:04:06,836 INFO MainThread:40865 [wandb_run.py:_redirect():2423] Wrapping output streams.
20
+ 2025-09-22 22:04:06,836 INFO MainThread:40865 [wandb_run.py:_redirect():2446] Redirects installed.
21
+ 2025-09-22 22:04:06,838 INFO MainThread:40865 [wandb_init.py:init():1057] run started, returning control to user process
22
+ 2025-09-26 12:20:37,273 INFO wandb-AsyncioManager-main:40865 [service_client.py:_forward_responses():84] Reached EOF.
23
+ 2025-09-26 12:20:37,275 INFO wandb-AsyncioManager-main:40865 [mailbox.py:close():137] Closing mailbox, abandoning 1 handles.
wandb/wandb/run-20250922_220405-hrldy3bw/run-hrldy3bw.wandb ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b28ec29fb9c8346c89c54dad96553569524449851aa8e3717d5c6f3b593f0eeb
3
+ size 112983277