Add files using upload-large-folder tool
Browse files- .gitattributes +3 -0
- latest_checkpointed_iteration.txt +1 -0
- latest_wandb_artifact_path.txt +1 -0
- log_0.txt +3 -0
- pip_list.txt +154 -0
- run_pretrain_poziomka_5.sh +200 -0
- wandb/wandb/debug-internal.log +19 -0
- wandb/wandb/debug.log +23 -0
- wandb/wandb/run-20250922_220405-hrldy3bw/files/config.yaml +1288 -0
- wandb/wandb/run-20250922_220405-hrldy3bw/files/output.log +3 -0
- wandb/wandb/run-20250922_220405-hrldy3bw/files/requirements.txt +173 -0
- wandb/wandb/run-20250922_220405-hrldy3bw/files/wandb-metadata.json +248 -0
- wandb/wandb/run-20250922_220405-hrldy3bw/files/wandb-summary.json +1 -0
- wandb/wandb/run-20250922_220405-hrldy3bw/logs/debug-core.log +11 -0
- wandb/wandb/run-20250922_220405-hrldy3bw/logs/debug-internal.log +19 -0
- wandb/wandb/run-20250922_220405-hrldy3bw/logs/debug.log +23 -0
- wandb/wandb/run-20250922_220405-hrldy3bw/run-hrldy3bw.wandb +3 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
wandb/wandb/run-20250922_220405-hrldy3bw/files/output.log filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
log_0.txt filter=lfs diff=lfs merge=lfs -text
|
| 38 |
+
wandb/wandb/run-20250922_220405-hrldy3bw/run-hrldy3bw.wandb filter=lfs diff=lfs merge=lfs -text
|
latest_checkpointed_iteration.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
43200
|
latest_wandb_artifact_path.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
adamo1139-no/poziomka
|
log_0.txt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8549446fab3880bb4addaebfd74001956042f3d33c467cc48fa2821e2e63b9d5
|
| 3 |
+
size 16532379
|
pip_list.txt
ADDED
|
@@ -0,0 +1,154 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Package Version
|
| 2 |
+
------------------------ ----------------
|
| 3 |
+
aiohappyeyeballs 2.6.1
|
| 4 |
+
aiohttp 3.12.15
|
| 5 |
+
aiosignal 1.4.0
|
| 6 |
+
annotated-types 0.7.0
|
| 7 |
+
apex 0.1
|
| 8 |
+
async-timeout 5.0.1
|
| 9 |
+
attrs 21.2.0
|
| 10 |
+
Automat 20.2.0
|
| 11 |
+
Babel 2.8.0
|
| 12 |
+
bcrypt 3.2.0
|
| 13 |
+
bitsandbytes 0.47.0
|
| 14 |
+
blinker 1.4
|
| 15 |
+
certifi 2020.6.20
|
| 16 |
+
chardet 4.0.0
|
| 17 |
+
charset-normalizer 3.4.3
|
| 18 |
+
click 8.0.3
|
| 19 |
+
cloud-init 25.1.2
|
| 20 |
+
colorama 0.4.4
|
| 21 |
+
command-not-found 0.3
|
| 22 |
+
configobj 5.0.6
|
| 23 |
+
constantly 15.1.0
|
| 24 |
+
cryptography 3.4.8
|
| 25 |
+
datasets 4.1.1
|
| 26 |
+
dbus-python 1.2.18
|
| 27 |
+
dill 0.4.0
|
| 28 |
+
distro 1.7.0
|
| 29 |
+
distro-info 1.1+ubuntu0.2
|
| 30 |
+
einops 0.8.1
|
| 31 |
+
filelock 3.19.1
|
| 32 |
+
flash_attn_3 3.0.0b1
|
| 33 |
+
frozenlist 1.7.0
|
| 34 |
+
fsspec 2025.9.0
|
| 35 |
+
gitdb 4.0.12
|
| 36 |
+
GitPython 3.1.45
|
| 37 |
+
hf_transfer 0.1.9
|
| 38 |
+
hf-xet 1.1.10
|
| 39 |
+
httplib2 0.20.2
|
| 40 |
+
huggingface-hub 0.35.0
|
| 41 |
+
hyperlink 21.0.0
|
| 42 |
+
idna 3.3
|
| 43 |
+
importlib-metadata 4.6.4
|
| 44 |
+
incremental 21.3.0
|
| 45 |
+
jeepney 0.7.1
|
| 46 |
+
Jinja2 3.0.3
|
| 47 |
+
jsonpatch 1.32
|
| 48 |
+
jsonpointer 2.0
|
| 49 |
+
jsonschema 3.2.0
|
| 50 |
+
keyring 23.5.0
|
| 51 |
+
launchpadlib 1.10.16
|
| 52 |
+
lazr.restfulclient 0.14.4
|
| 53 |
+
lazr.uri 1.0.6
|
| 54 |
+
MarkupSafe 2.0.1
|
| 55 |
+
megatron-core 0.13.0
|
| 56 |
+
ml_dtypes 0.5.3
|
| 57 |
+
more-itertools 8.10.0
|
| 58 |
+
mpmath 1.3.0
|
| 59 |
+
multidict 6.6.4
|
| 60 |
+
multiprocess 0.70.16
|
| 61 |
+
netifaces 0.11.0
|
| 62 |
+
networkx 3.3
|
| 63 |
+
ninja 1.13.0
|
| 64 |
+
numpy 1.26.4
|
| 65 |
+
nvidia-cublas-cu12 12.9.1.4
|
| 66 |
+
nvidia-cuda-cupti-cu12 12.9.79
|
| 67 |
+
nvidia-cuda-nvrtc-cu12 12.9.86
|
| 68 |
+
nvidia-cuda-runtime-cu12 12.9.79
|
| 69 |
+
nvidia-cudnn-cu12 9.10.2.21
|
| 70 |
+
nvidia-cufft-cu12 11.4.1.4
|
| 71 |
+
nvidia-cufile-cu12 1.14.1.1
|
| 72 |
+
nvidia-curand-cu12 10.3.10.19
|
| 73 |
+
nvidia-cusolver-cu12 11.7.5.82
|
| 74 |
+
nvidia-cusparse-cu12 12.5.10.65
|
| 75 |
+
nvidia-cusparselt-cu12 0.7.1
|
| 76 |
+
nvidia-nccl-cu12 2.27.3
|
| 77 |
+
nvidia-nvjitlink-cu12 12.9.86
|
| 78 |
+
nvidia-nvtx-cu12 12.9.79
|
| 79 |
+
oauthlib 3.2.0
|
| 80 |
+
onnx 1.19.0
|
| 81 |
+
onnx-ir 0.1.9
|
| 82 |
+
onnxscript 0.3.1
|
| 83 |
+
packaging 24.2
|
| 84 |
+
pandas 2.3.2
|
| 85 |
+
pexpect 4.8.0
|
| 86 |
+
pillow 11.0.0
|
| 87 |
+
pip 25.2
|
| 88 |
+
platformdirs 4.4.0
|
| 89 |
+
propcache 0.3.2
|
| 90 |
+
protobuf 6.32.1
|
| 91 |
+
psutil 7.1.0
|
| 92 |
+
psutils 3.3.11
|
| 93 |
+
ptyprocess 0.7.0
|
| 94 |
+
puremagic 1.30
|
| 95 |
+
pyarrow 21.0.0
|
| 96 |
+
pyasn1 0.4.8
|
| 97 |
+
pyasn1-modules 0.2.1
|
| 98 |
+
pybind11 3.0.1
|
| 99 |
+
pydantic 2.11.9
|
| 100 |
+
pydantic_core 2.33.2
|
| 101 |
+
PyGObject 3.42.1
|
| 102 |
+
PyHamcrest 2.0.2
|
| 103 |
+
PyJWT 2.3.0
|
| 104 |
+
pyOpenSSL 21.0.0
|
| 105 |
+
pyparsing 2.4.7
|
| 106 |
+
pypdf 6.1.0
|
| 107 |
+
pyrsistent 0.18.1
|
| 108 |
+
pyserial 3.5
|
| 109 |
+
python-apt 2.4.0+ubuntu4
|
| 110 |
+
python-dateutil 2.9.0.post0
|
| 111 |
+
python-debian 0.1.43+ubuntu1.1
|
| 112 |
+
python-magic 0.4.24
|
| 113 |
+
pytz 2022.1
|
| 114 |
+
PyYAML 5.4.1
|
| 115 |
+
regex 2025.9.18
|
| 116 |
+
requests 2.32.5
|
| 117 |
+
safetensors 0.6.2
|
| 118 |
+
SecretStorage 3.3.1
|
| 119 |
+
sentry-sdk 2.38.0
|
| 120 |
+
service-identity 18.1.0
|
| 121 |
+
setuptools 80.9.0
|
| 122 |
+
six 1.16.0
|
| 123 |
+
smmap 5.0.2
|
| 124 |
+
sos 4.8.2
|
| 125 |
+
ssh-import-id 5.11
|
| 126 |
+
sympy 1.13.3
|
| 127 |
+
systemd-python 234
|
| 128 |
+
tiktoken 0.11.0
|
| 129 |
+
tokenizers 0.22.1
|
| 130 |
+
torch 2.8.0+cu129
|
| 131 |
+
torchvision 0.23.0+cu129
|
| 132 |
+
tqdm 4.67.1
|
| 133 |
+
transformer_engine 2.6.0.post1
|
| 134 |
+
transformer_engine_cu12 2.6.0.post1
|
| 135 |
+
transformer_engine_torch 2.6.0.post1
|
| 136 |
+
transformers 4.56.2
|
| 137 |
+
triton 3.4.0
|
| 138 |
+
Twisted 22.1.0
|
| 139 |
+
typing_extensions 4.15.0
|
| 140 |
+
typing-inspection 0.4.1
|
| 141 |
+
tzdata 2025.2
|
| 142 |
+
ubuntu-drivers-common 0.0.0
|
| 143 |
+
ubuntu-pro-client 8001
|
| 144 |
+
ufw 0.36.1
|
| 145 |
+
unattended-upgrades 0.1
|
| 146 |
+
urllib3 2.5.0
|
| 147 |
+
wadllib 1.3.6
|
| 148 |
+
wandb 0.22.0
|
| 149 |
+
wheel 0.45.1
|
| 150 |
+
xkit 0.0.0
|
| 151 |
+
xxhash 3.5.0
|
| 152 |
+
yarl 1.20.1
|
| 153 |
+
zipp 1.0.0
|
| 154 |
+
zope.interface 5.4.0
|
run_pretrain_poziomka_5.sh
ADDED
|
@@ -0,0 +1,200 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#!/bin/bash
# Launch from-scratch MoE pretraining with Megatron-LM (core v0.13.0) via torchrun.
# Expects the usual torch distributed env vars (RANK, WORLD_SIZE, MASTER_ADDR,
# MASTER_PORT) when run multi-node; falls back to sane single-node defaults.
set -ex

MODEL_PATH="" # no checkpoint needed for from-scratch training
JOB_DIR="poziomka_5"
DATA_PATH="szypulka_tokenized_apt4_merged/apt4_merged_text_document"
MEGATRON_PATH="Megatron-LM-core_v0.13.0"


mkdir -p "${JOB_DIR}"
CHECKPOINT_PATH=${JOB_DIR}
TENSORBOARD_LOGS_PATH=${JOB_DIR}/runs

# Snapshot the launch script and environment for reproducibility, rank 0 only.
# ${RANK:-0} keeps this from erroring when RANK is unset (single-node runs).
if [[ ${RANK:-0} -eq 0 ]]; then
    cp -r "${0}" "${JOB_DIR}"
    pip list > "${JOB_DIR}/pip_list.txt"
    python -m torch.utils.collect_env > "${JOB_DIR}/collect_env.txt"
fi


GPUS_PER_NODE=$(nvidia-smi -L | wc -l)
WORLD_SIZE=${WORLD_SIZE:-1}
NODE_RANK=${RANK:-0}
MASTER_ADDR=${MASTER_ADDR:-127.0.0.1}
# Use standard $(( ... )) arithmetic; the old $[ ... ] form is deprecated in bash.
RANDOM_PORT=$((RANDOM + 20000))
MASTER_PORT=${MASTER_PORT:-$RANDOM_PORT}
GPU_NUM=$((GPUS_PER_NODE * WORLD_SIZE))
echo "---> from pytorch runtime, WORLD_SIZE: ${WORLD_SIZE}, NODE_RANK: ${NODE_RANK}, MASTER_ADDR: ${MASTER_ADDR}, MASTER_PORT: ${MASTER_PORT}"
LAUNCHER=" \
 torchrun \
 --nproc_per_node ${GPUS_PER_NODE} \
 --nnodes ${WORLD_SIZE} \
 --node_rank ${NODE_RANK} \
 --master_addr ${MASTER_ADDR} \
 --master_port ${MASTER_PORT} \
 "

LOG_PATH="${JOB_DIR}/log_${NODE_RANK}.txt"

export OMP_NUM_THREADS=1
export CUDA_DEVICE_MAX_CONNECTIONS=1 # needed to keep at 1 as per https://github.com/NVIDIA/Megatron-LM/issues/533
export PYTORCH_CUDA_ALLOC_CONF="expandable_segments:True"
export NCCL_NVLS_ENABLE=0
export NCCL_CUMEM_ENABLE=0

export NVTE_FLASH_ATTN=1 # get that sweet FA3 boost
export NVTE_FUSED_ATTN=0
export NVTE_UNFUSED_ATTN=0

export NVTE_DEBUG=1
export NVTE_DEBUG_LEVEL=2 # 2 means DEBUG level

export NCCL_DEBUG=OFF

DEVICE_MODEL=$(nvidia-smi -i 0 -q | grep "Product Name" | awk -F: '{ print $2 }')
DEVICE_MODEL=$(echo "$DEVICE_MODEL" | xargs) # drop white space

# Normalize by stripping the vendor prefix and any leading spaces left behind.
if [[ $DEVICE_MODEL == NVIDIA* ]]; then
    DEVICE_MODEL=${DEVICE_MODEL#"NVIDIA"}
    DEVICE_MODEL=$(echo "$DEVICE_MODEL" | sed 's/^ *//')
fi

# BUG FIX: the "NVIDIA" prefix is stripped above, so the previous comparison
# against "NVIDIA GeForce RTX 3090 Ti" could never match; compare against the
# stripped product name instead.
if [ "$DEVICE_MODEL" = "GeForce RTX 3090 Ti" ] || [ "$DEVICE_MODEL" = "A100-SXM4-80GB" ]; then
    # Ampere GPUs do not support multicast. If `--tp-comm-overlap` is set on Ampere-arch GPUs, this env must be set.
    export UB_SKIPMC=1
fi

MOE_ARGS=(
    --expert-model-parallel-size 2
    --expert-tensor-parallel-size 1
    --moe-grouped-gemm
    --moe-token-dispatcher-type alltoall
    --moe-router-dtype fp32
    --num-experts 128
    --moe-ffn-hidden-size 320
    --moe-shared-expert-intermediate-size 320
    --moe-router-score-function sigmoid
    --moe-router-topk 4
    --moe-router-enable-expert-bias
    --moe-router-topk-scaling-factor 2.5
    --moe-router-num-groups 8
    --moe-router-group-topk 2
    --moe-z-loss-coeff 0.0000035
    --moe-router-bias-update-rate 1e-3
    --moe-layer-freq [0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
    --bias-zero-mean-update
    --moe-expert-capacity-factor 1.25
    --moe-pad-expert-input-to-capacity
    --moe-shared-expert-overlap
)

MPT_ARGS=(
    --mtp-num-layers 0
)

GPT_MODEL_ARGS=(
    --num-layers 16
    --hidden-size 2048
    --ffn-hidden-size 2048
    --num-attention-heads 16
    --num-query-groups 4
    --group-query-attention
    --qk-layernorm
    --use-flash-attn
    --max-position-embeddings 8192
    --vocab-size 32000
    --make-vocab-size-divisible-by 128
    --position-embedding-type "rope"
    --rotary-base 84000
    --rotary-percent 0.5
    --rotary-scaling-factor 40
    --swiglu
    --untie-embeddings-and-output-weights
    --normalization "RMSNorm"
    --norm-epsilon "1e-06"
    --disable-bias-linear
    --transformer-impl "transformer_engine"
    --attention-dropout 0
    --hidden-dropout 0
)

TRAINING_ARGS=(
    --micro-batch-size 8
    --global-batch-size 256
    --seq-length 8192
    --train-iters 50000
    --weight-decay 0.1
    --adam-beta1 0.9
    --adam-beta2 0.95
    --init-method-std 0.02
    --clip-grad 1.0

    --bf16

    --optimizer "adam"
    --lr "8.0e-4"
    --lr-decay-style cosine
    --min-lr "4.00e-5"
    --lr-warmup-iters 100
    --seed 50
)

MODEL_PARALLEL_ARGS=(
    --pipeline-model-parallel-size 1
    --tensor-model-parallel-size 4
    --sequence-parallel
    --overlap-grad-reduce
)

DATA_ARGS=(
    --data-path ${DATA_PATH}
    --tokenizer-type "HuggingFaceTokenizer"
    --tokenizer-model `dirname $(readlink -f "${BASH_SOURCE[0]}")`/../../resource/tokenizer/apt4
    --split 9999,1,0
    --dataloader-type "single"
    --no-create-attention-mask-in-dataloader
    --eod-mask-loss
)

EVAL_AND_LOGGING_ARGS=(
    --save-interval 1600
    --eval-interval 1600
    --eval-iters 2
    --save $CHECKPOINT_PATH
    --ckpt-format "torch_dist"
    --async-save
    --log-interval 1
    --log-throughput
    --tensorboard-dir $TENSORBOARD_LOGS_PATH
    --log-timers-to-tensorboard
    --log-memory-to-tensorboard
    --log-world-size-to-tensorboard
    --log-validation-ppl-to-tensorboard

    --wandb-project "poziomka"
    --wandb-exp-name ${JOB_DIR}

)

KERNEL_ARGS=(
    --attention-backend flash
    --no-masked-softmax-fusion
    --attention-softmax-in-fp32
    --cross-entropy-loss-fusion
)

# Profiling is disabled by default; define the array explicitly so the CMD
# expansion below never references an undefined name.
PROFILING_ARGS=()

CMD="${LAUNCHER} ${MEGATRON_PATH}/pretrain_gpt.py \
 ${MOE_ARGS[@]} \
 ${GPT_MODEL_ARGS[@]} \
 ${TRAINING_ARGS[@]} \
 ${MODEL_PARALLEL_ARGS[@]} \
 ${DATA_ARGS[@]} \
 ${EVAL_AND_LOGGING_ARGS[@]} \
 ${KERNEL_ARGS[@]} \
 ${MPT_ARGS[@]} \
 ${PROFILING_ARGS[@]} \
 "

echo ${CMD}
PYTHONPATH=${MEGATRON_PATH}:$PYTHONPATH ${CMD} 2>&1 | tee ${LOG_PATH}
|
wandb/wandb/debug-internal.log
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"time":"2025-09-22T22:04:06.143255097Z","level":"INFO","msg":"stream: starting","core version":"0.22.0"}
|
| 2 |
+
{"time":"2025-09-22T22:04:06.487372274Z","level":"INFO","msg":"stream: created new stream","id":"hrldy3bw"}
|
| 3 |
+
{"time":"2025-09-22T22:04:06.487645224Z","level":"INFO","msg":"stream: started","id":"hrldy3bw"}
|
| 4 |
+
{"time":"2025-09-22T22:04:06.487690464Z","level":"INFO","msg":"sender: started","stream_id":"hrldy3bw"}
|
| 5 |
+
{"time":"2025-09-22T22:04:06.487691275Z","level":"INFO","msg":"writer: started","stream_id":"hrldy3bw"}
|
| 6 |
+
{"time":"2025-09-22T22:04:06.487752261Z","level":"INFO","msg":"handler: started","stream_id":"hrldy3bw"}
|
| 7 |
+
{"time":"2025-09-23T20:24:34.768930029Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
|
| 8 |
+
{"time":"2025-09-24T15:33:04.643961764Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
|
| 9 |
+
{"time":"2025-09-24T15:48:05.092383968Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
|
| 10 |
+
{"time":"2025-09-24T16:41:14.895690245Z","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/graphql","body":"\n<html><head>\n<meta http-equiv=\"content-type\" content=\"text/html;charset=utf-8\">\n<title>502 Server Error</title>\n</head>\n<body text=#000000 bgcolor=#ffffff>\n<h1>Error: Server Error</h1>\n<h2>The server encountered a temporary error and could not complete your request.<p>Please try again in 30 seconds.</h2>\n<h2></h2>\n</body></html>\n"}
|
| 11 |
+
{"time":"2025-09-25T16:26:04.894084919Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded (Client.Timeout exceeded while awaiting headers)"}
|
| 12 |
+
{"time":"2025-09-25T17:26:05.511033911Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded (Client.Timeout exceeded while awaiting headers)"}
|
| 13 |
+
{"time":"2025-09-25T22:42:38.263384097Z","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/adamo1139-no/poziomka/hrldy3bw/file_stream","body":"\n<html><head>\n<meta http-equiv=\"content-type\" content=\"text/html;charset=utf-8\">\n<title>502 Server Error</title>\n</head>\n<body text=#000000 bgcolor=#ffffff>\n<h1>Error: Server Error</h1>\n<h2>The server encountered a temporary error and could not complete your request.<p>Please try again in 30 seconds.</h2>\n<h2></h2>\n</body></html>\n"}
|
| 14 |
+
{"time":"2025-09-25T23:02:46.550701182Z","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/adamo1139-no/poziomka/hrldy3bw/file_stream","body":"\n<html><head>\n<meta http-equiv=\"content-type\" content=\"text/html;charset=utf-8\">\n<title>502 Server Error</title>\n</head>\n<body text=#000000 bgcolor=#ffffff>\n<h1>Error: Server Error</h1>\n<h2>The server encountered a temporary error and could not complete your request.<p>Please try again in 30 seconds.</h2>\n<h2></h2>\n</body></html>\n"}
|
| 15 |
+
{"time":"2025-09-26T12:20:37.296973248Z","level":"INFO","msg":"stream: closing","id":"hrldy3bw"}
|
| 16 |
+
{"time":"2025-09-26T12:20:38.848817943Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
|
| 17 |
+
{"time":"2025-09-26T12:20:39.03451843Z","level":"INFO","msg":"handler: closed","stream_id":"hrldy3bw"}
|
| 18 |
+
{"time":"2025-09-26T12:20:39.034655715Z","level":"INFO","msg":"sender: closed","stream_id":"hrldy3bw"}
|
| 19 |
+
{"time":"2025-09-26T12:20:39.034688219Z","level":"INFO","msg":"stream: closed","id":"hrldy3bw"}
|
wandb/wandb/debug.log
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2025-09-22 22:04:05,929 INFO MainThread:40865 [wandb_setup.py:_flush():81] Current SDK version is 0.22.0
|
| 2 |
+
2025-09-22 22:04:05,929 INFO MainThread:40865 [wandb_setup.py:_flush():81] Configure stats pid to 40865
|
| 3 |
+
2025-09-22 22:04:05,929 INFO MainThread:40865 [wandb_setup.py:_flush():81] Loading settings from /root/.config/wandb/settings
|
| 4 |
+
2025-09-22 22:04:05,929 INFO MainThread:40865 [wandb_setup.py:_flush():81] Loading settings from /home/ubuntu/training/Ling-V2/wandb/settings
|
| 5 |
+
2025-09-22 22:04:05,929 INFO MainThread:40865 [wandb_setup.py:_flush():81] Loading settings from environment variables
|
| 6 |
+
2025-09-22 22:04:05,929 INFO MainThread:40865 [wandb_init.py:setup_run_log_directory():686] Logging user logs to poziomka_5/wandb/wandb/run-20250922_220405-hrldy3bw/logs/debug.log
|
| 7 |
+
2025-09-22 22:04:05,929 INFO MainThread:40865 [wandb_init.py:setup_run_log_directory():687] Logging internal logs to poziomka_5/wandb/wandb/run-20250922_220405-hrldy3bw/logs/debug-internal.log
|
| 8 |
+
2025-09-22 22:04:05,929 INFO MainThread:40865 [wandb_init.py:init():813] calling init triggers
|
| 9 |
+
2025-09-22 22:04:05,929 INFO MainThread:40865 [wandb_init.py:init():818] wandb.init called with sweep_config: {}
|
| 10 |
+
config: {'num_layers': 16, 'encoder_num_layers': 16, 'decoder_num_layers': None, 'hidden_size': 2048, 'ffn_hidden_size': 2048, 'num_attention_heads': 16, 'attention_backend': <AttnBackend.flash: 1>, 'kv_channels': 128, 'group_query_attention': True, 'num_query_groups': 4, 'max_position_embeddings': 8192, 'position_embedding_type': 'rope', 'relative_attention_num_buckets': 32, 'relative_attention_max_distance': 128, 'use_rotary_position_embeddings': False, 'rotary_base': 84000, 'rotary_percent': 0.5, 'rotary_interleaved': False, 'rotary_seq_len_interpolation_factor': None, 'use_rope_scaling': False, 'rope_scaling_factor': 8.0, 'no_rope_freq': None, 'add_position_embedding': True, 'mrope_section': None, 'make_vocab_size_divisible_by': 128, 'normalization': 'RMSNorm', 'norm_epsilon': 1e-06, 'apply_layernorm_1p': False, 'apply_residual_connection_post_layernorm': False, 'openai_gelu': False, 'squared_relu': False, 'swiglu': True, 'onnx_safe': None, 'bert_binary_head': True, 'untie_embeddings_and_output_weights': True, 'multi_latent_attention': False, 'mtp_num_layers': 0, 'mtp_loss_scaling_factor': 0.1, 'bias_zero_mean_update': True, 'attention_dropout': 0.0, 'hidden_dropout': 0.0, 'weight_decay': 0.1, 'start_weight_decay': 0.1, 'end_weight_decay': 0.1, 'weight_decay_incr_style': 'constant', 'clip_grad': 1.0, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_eps': 1e-08, 'sgd_momentum': 0.9, 'micro_batch_size': 8, 'global_batch_size': 256, 'rampup_batch_size': None, 'decrease_batch_size_if_needed': False, 'recompute_granularity': None, 'check_for_nan_in_loss_and_grad': True, 'check_for_spiky_loss': False, 'check_for_large_grads': False, 'distribute_saved_activations': False, 'recompute_method': None, 'recompute_num_layers': None, 'recompute_modules': None, 'clone_scatter_output_in_embedding': True, 'profile': False, 'profile_step_start': 10, 'profile_step_end': 12, 'iterations_to_skip': [], 'result_rejected_tracker_filename': None, 'enable_gloo_process_groups': True, 
'use_pytorch_profiler': False, 'profile_ranks': [0], 'record_memory_history': False, 'memory_snapshot_path': 'snapshot.pickle', 'tp_comm_overlap': False, 'tp_comm_overlap_cfg': None, 'tp_comm_overlap_ag': True, 'tp_comm_overlap_rs': True, 'tp_comm_overlap_rs_dgrad': False, 'tp_comm_bulk_dgrad': True, 'tp_comm_bulk_wgrad': True, 'tp_comm_bootstrap_backend': 'nccl', 'use_cpu_initialization': None, 'empty_unused_memory_level': 0, 'deterministic_mode': False, 'check_weight_hash_across_dp_replicas_interval': None, 'calculate_per_token_loss': False, 'train_sync_interval': None, 'train_iters': 50000, 'train_samples': None, 'log_interval': 1, 'exit_interval': None, 'exit_duration_in_mins': None, 'exit_signal_handler': False, 'tensorboard_dir': 'poziomka_5/runs', 'masked_softmax_fusion': False, 'bias_gelu_fusion': False, 'bias_swiglu_fusion': True, 'bias_dropout_fusion': True, 'apply_rope_fusion': True, 'cross_entropy_loss_fusion': True, 'cross_entropy_fusion_impl': 'native', 'use_flash_attn': True, 'add_bias_linear': False, 'add_qkv_bias': False, 'optimizer': 'adam', 'optimizer_cpu_offload': False, 'optimizer_offload_fraction': 1.0, 'use_torch_optimizer_for_cpu_offload': False, 'overlap_cpu_optimizer_d2h_h2d': False, 'pin_cpu_grads': True, 'pin_cpu_params': True, 'dataloader_type': 'single', 'async_tensor_model_parallel_allreduce': True, 'no_persist_layer_norm': False, 'sequence_parallel': True, 'gradient_accumulation_fusion': True, 'deprecated_use_mcore_models': False, 'use_legacy_models': False, 'manual_gc': False, 'manual_gc_interval': 0, 'manual_gc_eval': True, 'tp_comm_split_ag': True, 'tp_comm_split_rs': True, 'pipeline_model_parallel_comm_backend': None, 'high_priority_stream_groups': [], 'seed': 50, 'data_parallel_random_init': False, 'init_method_std': 0.02, 'init_method_xavier_uniform': False, 'lr': 0.0008, 'lr_decay_style': 'cosine', 'lr_wsd_decay_style': 'exponential', 'lr_decay_iters': None, 'lr_decay_samples': None, 'lr_wsd_decay_samples': None, 
'lr_wsd_decay_iters': None, 'lr_warmup_fraction': None, 'lr_warmup_iters': 100, 'lr_warmup_samples': 0, 'lr_warmup_init': 0.0, 'min_lr': 4e-05, 'override_opt_param_scheduler': False, 'use_checkpoint_opt_param_scheduler': False, 'decoupled_lr': None, 'decoupled_min_lr': None, 'save': 'poziomka_5', 'save_interval': 1600, 'no_save_optim': None, 'no_save_rng': None, 'load': None, 'no_load_optim': None, 'no_load_rng': None, 'non_persistent_save_interval': None, 'non_persistent_ckpt_type': None, 'non_persistent_global_ckpt_dir': None, 'non_persistent_local_ckpt_dir': None, 'non_persistent_local_ckpt_algo': 'fully_parallel', 'finetune': False, 'pretrained_checkpoint': None, 'ckpt_step': None, 'perform_initialization': True, 'use_checkpoint_args': False, 'use_mp_args_from_checkpoint_args': False, 'use_tokenizer_model_from_checkpoint_args': True, 'exit_on_missing_checkpoint': False, 'use_dist_ckpt_deprecated': False, 'use_persistent_ckpt_worker': False, 'auto_detect_ckpt_format': False, 'dist_ckpt_format_deprecated': None, 'ckpt_format': 'torch_dist', 'ckpt_convert_format': None, 'ckpt_convert_save': None, 'ckpt_convert_update_legacy_dist_opt_format': False, 'ckpt_fully_parallel_save_deprecated': False, 'ckpt_fully_parallel_save': True, 'async_save': True, 'ckpt_fully_parallel_load': False, 'ckpt_assume_constant_structure': False, 'dist_ckpt_strictness': 'assume_ok_unexpected', 'load_model_opt_format': False, 'fp16': False, 'bf16': True, 'grad_reduce_in_bf16': False, 'loss_scale': None, 'initial_loss_scale': 4294967296, 'min_loss_scale': 1.0, 'loss_scale_window': 1000, 'hysteresis': 2, 'fp32_residual_connection': False, 'apply_query_key_layer_scaling': False, 'attention_softmax_in_fp32': True, 'accumulate_allreduce_grads_in_fp32': True, 'fp16_lm_cross_entropy': False, 'disable_bf16_reduced_precision_matmul': False, 'reuse_grad_buf_for_mxfp8_param_ag': False, 'tensor_model_parallel_size': 4, 'encoder_tensor_model_parallel_size': 0, 'pipeline_model_parallel_size': 1, 
'encoder_pipeline_model_parallel_size': 0, 'pipeline_model_parallel_split_rank': None, 'decoder_first_pipeline_num_layers': None, 'decoder_last_pipeline_num_layers': None, 'pipeline_model_parallel_layout': None, 'num_layers_per_virtual_pipeline_stage': None, 'num_virtual_stages_per_pipeline_rank': None, 'microbatch_group_size_per_vp_stage': None, 'overlap_p2p_comm': False, 'overlap_p2p_comm_warmup_flush': False, 'distributed_backend': 'nccl', 'distributed_timeout_minutes': 10, 'overlap_grad_reduce': True, 'defer_embedding_wgrad_compute': False, 'wgrad_deferral_limit': 0, 'align_grad_reduce': True, 'ddp_num_buckets': None, 'ddp_bucket_size': None, 'ddp_pad_buckets_for_high_nccl_busbw': False, 'ddp_average_in_collective': False, 'overlap_param_gather': False, 'overlap_param_gather_with_optimizer_step': False, 'align_param_gather': False, 'scatter_gather_tensors_in_pipeline': True, 'use_ring_exchange_p2p': False, 'local_rank': 7, 'lazy_mpu_init': None, 'account_for_embedding_in_pipeline_split': False, 'account_for_loss_in_pipeline_split': False, 'use_distributed_optimizer': False, 'nccl_ub': False, 'use_sharp': False, 'use_custom_fsdp': False, 'init_model_with_meta_device': False, 'data_parallel_sharding_strategy': 'no_shard', 'gradient_reduce_div_fusion': True, 'fsdp_double_buffer': False, 'suggested_communication_unit_size': None, 'keep_fp8_transpose_cache_when_using_custom_fsdp': False, 'num_distributed_optimizer_instances': 1, 'use_torch_fsdp2': False, 'torch_fsdp2_reshard_after_forward': True, 'context_parallel_size': 1, 'cp_comm_type': ['p2p'], 'hierarchical_context_parallel_sizes': None, 'nccl_communicator_config_path': None, 'use_tp_pp_dp_mapping': False, 'replication': False, 'replication_jump': None, 'replication_factor': 2, 'eval_iters': 2, 'eval_interval': 1600, 'test_mode': False, 'skip_train': False, 'data_path': ['szypulka_tokenized_apt4_merged/apt4_merged_text_document'], 'split': '9999,1,0', 'train_data_path': None, 'valid_data_path': None, 
'test_data_path': None, 'data_args_path': None, 'per_split_data_args_path': None, 'data_cache_path': None, 'mmap_bin_files': True, 'mock_data': False, 'seq_length': 8192, 'encoder_seq_length': 8192, 'decoder_seq_length': None, 'retriever_seq_length': 256, 'sample_rate': 1.0, 'mask_prob': 0.15, 'short_seq_prob': 0.1, 'num_workers': 2, 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': True, 'create_attention_mask_in_dataloader': False, 'num_dataset_builder_threads': 1, 'object_storage_cache_path': None, 'mid_level_dataset_surplus': 0.005, 'vocab_size': 32000, 'vocab_file': None, 'merge_file': None, 'vocab_extra_ids': 0, 'tokenizer_type': 'HuggingFaceTokenizer', 'tokenizer_model': '/home/ubuntu/training/Ling-V2/examples/pretrain/../../resource/tokenizer/apt4', 'tiktoken_pattern': None, 'tiktoken_num_special_tokens': 1000, 'tiktoken_special_tokens': None, 'adlr_autoresume': False, 'adlr_autoresume_interval': 1000, 'ict_head_size': None, 'biencoder_projection_dim': 0, 'biencoder_shared_query_context_model': False, 'ict_load': None, 'bert_load': None, 'titles_data_path': None, 'query_in_block_prob': 0.1, 'use_one_sent_docs': False, 'evidence_data_path': None, 'retriever_report_topk_accuracies': [], 'retriever_score_scaling': False, 'block_data_path': None, 'embedding_path': None, 'indexer_batch_size': 128, 'indexer_log_interval': 1000, 'num_classes': 1000, 'img_h': 224, 'img_w': 224, 'num_channels': 3, 'patch_dim': 16, 'classes_fraction': 1.0, 'data_per_class_fraction': 1.0, 'data_sharding': True, 'head_lr_mult': 1.0, 'vision_pretraining': False, 'vision_pretraining_type': 'classify', 'vision_backbone_type': 'vit', 'swin_backbone_type': 'tiny', 'mask_type': 'random', 'mask_factor': 1.0, 'iter_per_epoch': 1250, 'dino_local_img_size': 96, 'dino_local_crops_number': 10, 'dino_head_hidden_size': 2048, 'dino_bottleneck_size': 256, 'dino_freeze_last_layer': 1, 'dino_norm_last_layer': False, 'dino_warmup_teacher_temp': 0.04, 'dino_teacher_temp': 0.07, 
'dino_warmup_teacher_temp_epochs': 30, 'qk_layernorm': True, 'qk_l2_norm': False, 'expert_model_parallel_size': 2, 'expert_tensor_parallel_size': 1, 'num_experts': 128, 'moe_layer_freq': [0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'moe_ffn_hidden_size': 320, 'moe_shared_expert_intermediate_size': 320, 'moe_shared_expert_overlap': True, 'moe_grouped_gemm': True, 'moe_use_legacy_grouped_gemm': False, 'moe_layer_recompute': False, 'moe_extended_tp': False, 'moe_use_upcycling': False, 'moe_router_load_balancing_type': 'aux_loss', 'moe_router_dtype': 'fp32', 'skip_casting_dtype_for_param_pattern': '["^expert_bias$|.+\\.expert_bias$"]', 'moe_router_score_function': 'sigmoid', 'moe_router_topk': 4, 'moe_router_pre_softmax': False, 'moe_router_num_groups': 8, 'moe_router_group_topk': 2, 'moe_router_topk_scaling_factor': 2.5, 'moe_router_enable_expert_bias': True, 'moe_router_bias_update_rate': 0.001, 'moe_router_force_load_balancing': False, 'moe_router_padding_for_fp8': False, 'moe_aux_loss_coeff': 0.0, 'moe_z_loss_coeff': 3.5e-06, 'moe_input_jitter_eps': None, 'moe_per_layer_logging': False, 'moe_token_dispatcher_type': 'alltoall', 'moe_enable_deepep': False, 'moe_deepep_num_sms': 20, 'moe_permute_fusion': False, 'moe_expert_capacity_factor': 1.25, 'moe_pad_expert_input_to_capacity': True, 'moe_token_drop_policy': 'probs', 'moe_apply_probs_on_input': False, 'delay_wgrad_compute': False, 'moe_upcycling_granularity': 1, 'q_lora_rank': None, 'kv_lora_rank': 32, 'qk_head_dim': 128, 'qk_pos_emb_head_dim': 64, 'v_head_dim': 128, 'rotary_scaling_factor': 40.0, 'mscale': 1.0, 'mscale_all_dim': 1.0, 'heterogeneous_layers_config_path': None, 'heterogeneous_layers_config_encoded_json': None, 'log_params_norm': False, 'log_num_zeros_in_grad': False, 'log_throughput': True, 'log_progress': False, 'timing_log_level': 0, 'log_energy': False, 'barrier_with_L1_time': True, 'timing_log_option': 'minmax', 'tensorboard_log_interval': 1, 'tensorboard_queue_size': 1000, 
'log_timers_to_tensorboard': True, 'log_loss_scale_to_tensorboard': True, 'log_validation_ppl_to_tensorboard': True, 'log_memory_to_tensorboard': True, 'log_world_size_to_tensorboard': True, 'wandb_project': 'poziomka', 'wandb_exp_name': 'poziomka_5', 'wandb_save_dir': '', 'logging_level': None, 'log_straggler': False, 'disable_straggler_on_startup': False, 'straggler_ctrlr_port': 65535, 'straggler_minmax_count': 1, 'run_workload_inspector_server': False, 'inference_batch_times_seqlen_threshold': -1, 'max_tokens_to_oom': 12000, 'output_bert_embeddings': False, 'bert_embedder_type': 'megatron', 'flash_decode': False, 'enable_cuda_graph': False, 'cuda_graph_warmup_steps': 3, 'external_cuda_graph': False, 'cuda_graph_scope': 'full', 'inference_max_batch_size': 8, 'inference_max_seq_length': 2560, 'inference_dynamic_batching': False, 'inference_dynamic_batching_buffer_size_gb': 40.0, 'inference_dynamic_batching_chunk_size': 256, 'inference_dynamic_batching_buffer_guaranteed_fraction': 0.2, 'inference_dynamic_batching_buffer_overflow_factor': None, 'inference_dynamic_batching_max_requests_override': None, 'inference_dynamic_batching_max_tokens_override': None, 'symmetric_ar_type': None, 'nccl_all_reduce_for_prefill': False, 'mlp_chunks_for_prefill': 1, 'fp8': None, 'fp8_recipe': 'delayed', 'fp8_margin': 0, 'fp8_interval': 1, 'fp8_amax_history_len': 1, 'fp8_amax_compute_algo': 'most_recent', 'fp8_wgrad': True, 'transformer_impl': 'transformer_engine', 'fp8_param_gather': False, 'first_last_layers_bf16': False, 'num_layers_at_start_in_bf16': 1, 'num_layers_at_end_in_bf16': 1, 'te_rng_tracker': False, 'inference_rng_tracker': False, 'retro_project_dir': None, 'retro_add_retriever': False, 'retro_cyclic_train_iters': None, 'retro_encoder_layers': 2, 'retro_encoder_hidden_dropout': 0.1, 'retro_encoder_attention_dropout': 0.1, 'retro_num_neighbors': 2, 'retro_num_retrieved_chunks': 2, 'retro_attention_gate': 1, 'retro_verify_neighbor_count': True, 'enable_experimental': 
False, 'spec': None, 'hybrid_attention_ratio': 0.0, 'hybrid_mlp_ratio': 0.0, 'hybrid_override_pattern': None, 'mamba_state_dim': 128, 'mamba_head_dim': 64, 'mamba_num_groups': 8, 'mamba_num_heads': None, 'is_hybrid_model': False, 'disable_mamba_mem_eff_path': False, 'yaml_cfg': None, 'use_precision_aware_optimizer': False, 'main_grads_dtype': torch.float32, 'main_params_dtype': torch.float32, 'exp_avg_dtype': torch.float32, 'exp_avg_sq_dtype': torch.float32, 'enable_one_logger': True, 'one_logger_project': 'megatron-lm', 'one_logger_run_name': None, 'one_logger_async': False, 'app_tag_run_name': None, 'app_tag_run_version': '0.0.0', 'inprocess_restart': False, 'inprocess_max_iterations': None, 'inprocess_monitor_thread_interval': 1.0, 'inprocess_monitor_process_interval': 1.0, 'inprocess_progress_watchdog_interval': 1.0, 'inprocess_heartbeat_interval': 30, 'inprocess_soft_timeout': 60, 'inprocess_hard_timeout': 90, 'inprocess_heartbeat_timeout': 60, 'inprocess_barrier_timeout': 120, 'inprocess_completion_timeout': 120, 'inprocess_last_call_wait': 1, 'inprocess_termination_grace_time': 1, 'inprocess_granularity': 'node', 'inprocess_active_world_size': 8, 'inprocess_empty_cuda_cache': False, 'enable_ft_package': False, 'calc_ft_timeouts': False, 'config_logger_dir': '', 'error_injection_rate': 0, 'error_injection_type': 'transient_error', 'rerun_mode': 'disabled', 'enable_msc': True, 'kitchen_config_file': None, 'kitchen_recipe_number': None, 'sft': False, 'sft_tokenizer_prompt_format': 'nemotron-h-aligned', 'rank': 7, 'world_size': 8, 'use_dist_ckpt': True, 'transformer_pipeline_model_parallel_size': 1, 'data_parallel_size': 2, 'virtual_pipeline_model_parallel_size': None, 'params_dtype': torch.bfloat16, 'consumed_train_samples': 0, 'skipped_train_samples': 0, 'consumed_valid_samples': 0, 'variable_seq_lengths': False, 'padded_vocab_size': 32256, '_wandb': {}}
|
| 11 |
+
2025-09-22 22:04:05,929 INFO MainThread:40865 [wandb_init.py:init():861] starting backend
|
| 12 |
+
2025-09-22 22:04:06,134 INFO MainThread:40865 [wandb_init.py:init():864] sending inform_init request
|
| 13 |
+
2025-09-22 22:04:06,137 INFO MainThread:40865 [wandb_init.py:init():872] backend started and connected
|
| 14 |
+
2025-09-22 22:04:06,140 INFO MainThread:40865 [wandb_init.py:init():942] updated telemetry
|
| 15 |
+
2025-09-22 22:04:06,144 INFO MainThread:40865 [wandb_init.py:init():966] communicating run to backend with 90.0 second timeout
|
| 16 |
+
2025-09-22 22:04:06,744 INFO MainThread:40865 [wandb_init.py:init():1017] starting run threads in backend
|
| 17 |
+
2025-09-22 22:04:06,836 INFO MainThread:40865 [wandb_run.py:_console_start():2506] atexit reg
|
| 18 |
+
2025-09-22 22:04:06,836 INFO MainThread:40865 [wandb_run.py:_redirect():2354] redirect: wrap_raw
|
| 19 |
+
2025-09-22 22:04:06,836 INFO MainThread:40865 [wandb_run.py:_redirect():2423] Wrapping output streams.
|
| 20 |
+
2025-09-22 22:04:06,836 INFO MainThread:40865 [wandb_run.py:_redirect():2446] Redirects installed.
|
| 21 |
+
2025-09-22 22:04:06,838 INFO MainThread:40865 [wandb_init.py:init():1057] run started, returning control to user process
|
| 22 |
+
2025-09-26 12:20:37,273 INFO wandb-AsyncioManager-main:40865 [service_client.py:_forward_responses():84] Reached EOF.
|
| 23 |
+
2025-09-26 12:20:37,275 INFO wandb-AsyncioManager-main:40865 [mailbox.py:close():137] Closing mailbox, abandoning 1 handles.
|
wandb/wandb/run-20250922_220405-hrldy3bw/files/config.yaml
ADDED
|
@@ -0,0 +1,1288 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
_wandb:
|
| 2 |
+
value:
|
| 3 |
+
cli_version: 0.22.0
|
| 4 |
+
e:
|
| 5 |
+
9vte3cwjfuxykvlnatinaorhdm7hrpxl:
|
| 6 |
+
args:
|
| 7 |
+
- --expert-model-parallel-size
|
| 8 |
+
- "2"
|
| 9 |
+
- --expert-tensor-parallel-size
|
| 10 |
+
- "1"
|
| 11 |
+
- --moe-grouped-gemm
|
| 12 |
+
- --moe-token-dispatcher-type
|
| 13 |
+
- alltoall
|
| 14 |
+
- --moe-router-dtype
|
| 15 |
+
- fp32
|
| 16 |
+
- --num-experts
|
| 17 |
+
- "128"
|
| 18 |
+
- --moe-ffn-hidden-size
|
| 19 |
+
- "320"
|
| 20 |
+
- --moe-shared-expert-intermediate-size
|
| 21 |
+
- "320"
|
| 22 |
+
- --moe-router-score-function
|
| 23 |
+
- sigmoid
|
| 24 |
+
- --moe-router-topk
|
| 25 |
+
- "4"
|
| 26 |
+
- --moe-router-enable-expert-bias
|
| 27 |
+
- --moe-router-topk-scaling-factor
|
| 28 |
+
- "2.5"
|
| 29 |
+
- --moe-router-num-groups
|
| 30 |
+
- "8"
|
| 31 |
+
- --moe-router-group-topk
|
| 32 |
+
- "2"
|
| 33 |
+
- --moe-z-loss-coeff
|
| 34 |
+
- "0.0000035"
|
| 35 |
+
- --moe-router-bias-update-rate
|
| 36 |
+
- "1e-3"
|
| 37 |
+
- --moe-layer-freq
|
| 38 |
+
- '[0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]'
|
| 39 |
+
- --bias-zero-mean-update
|
| 40 |
+
- --moe-expert-capacity-factor
|
| 41 |
+
- "1.25"
|
| 42 |
+
- --moe-pad-expert-input-to-capacity
|
| 43 |
+
- --moe-shared-expert-overlap
|
| 44 |
+
- --num-layers
|
| 45 |
+
- "16"
|
| 46 |
+
- --hidden-size
|
| 47 |
+
- "2048"
|
| 48 |
+
- --ffn-hidden-size
|
| 49 |
+
- "2048"
|
| 50 |
+
- --num-attention-heads
|
| 51 |
+
- "16"
|
| 52 |
+
- --num-query-groups
|
| 53 |
+
- "4"
|
| 54 |
+
- --group-query-attention
|
| 55 |
+
- --qk-layernorm
|
| 56 |
+
- --use-flash-attn
|
| 57 |
+
- --max-position-embeddings
|
| 58 |
+
- "8192"
|
| 59 |
+
- --vocab-size
|
| 60 |
+
- "32000"
|
| 61 |
+
- --make-vocab-size-divisible-by
|
| 62 |
+
- "128"
|
| 63 |
+
- --position-embedding-type
|
| 64 |
+
- rope
|
| 65 |
+
- --rotary-base
|
| 66 |
+
- "84000"
|
| 67 |
+
- --rotary-percent
|
| 68 |
+
- "0.5"
|
| 69 |
+
- --rotary-scaling-factor
|
| 70 |
+
- "40"
|
| 71 |
+
- --swiglu
|
| 72 |
+
- --untie-embeddings-and-output-weights
|
| 73 |
+
- --normalization
|
| 74 |
+
- RMSNorm
|
| 75 |
+
- --norm-epsilon
|
| 76 |
+
- "1e-06"
|
| 77 |
+
- --disable-bias-linear
|
| 78 |
+
- --transformer-impl
|
| 79 |
+
- transformer_engine
|
| 80 |
+
- --attention-dropout
|
| 81 |
+
- "0"
|
| 82 |
+
- --hidden-dropout
|
| 83 |
+
- "0"
|
| 84 |
+
- --micro-batch-size
|
| 85 |
+
- "8"
|
| 86 |
+
- --global-batch-size
|
| 87 |
+
- "256"
|
| 88 |
+
- --seq-length
|
| 89 |
+
- "8192"
|
| 90 |
+
- --train-iters
|
| 91 |
+
- "50000"
|
| 92 |
+
- --weight-decay
|
| 93 |
+
- "0.1"
|
| 94 |
+
- --adam-beta1
|
| 95 |
+
- "0.9"
|
| 96 |
+
- --adam-beta2
|
| 97 |
+
- "0.95"
|
| 98 |
+
- --init-method-std
|
| 99 |
+
- "0.02"
|
| 100 |
+
- --clip-grad
|
| 101 |
+
- "1.0"
|
| 102 |
+
- --bf16
|
| 103 |
+
- --optimizer
|
| 104 |
+
- adam
|
| 105 |
+
- --lr
|
| 106 |
+
- "8.0e-4"
|
| 107 |
+
- --lr-decay-style
|
| 108 |
+
- cosine
|
| 109 |
+
- --min-lr
|
| 110 |
+
- "4.00e-5"
|
| 111 |
+
- --lr-warmup-iters
|
| 112 |
+
- "100"
|
| 113 |
+
- --seed
|
| 114 |
+
- "50"
|
| 115 |
+
- --pipeline-model-parallel-size
|
| 116 |
+
- "1"
|
| 117 |
+
- --tensor-model-parallel-size
|
| 118 |
+
- "4"
|
| 119 |
+
- --sequence-parallel
|
| 120 |
+
- --overlap-grad-reduce
|
| 121 |
+
- --data-path
|
| 122 |
+
- szypulka_tokenized_apt4_merged/apt4_merged_text_document
|
| 123 |
+
- --tokenizer-type
|
| 124 |
+
- HuggingFaceTokenizer
|
| 125 |
+
- --tokenizer-model
|
| 126 |
+
- /home/ubuntu/training/Ling-V2/examples/pretrain/../../resource/tokenizer/apt4
|
| 127 |
+
- --split
|
| 128 |
+
- 9999,1,0
|
| 129 |
+
- --dataloader-type
|
| 130 |
+
- single
|
| 131 |
+
- --no-create-attention-mask-in-dataloader
|
| 132 |
+
- --eod-mask-loss
|
| 133 |
+
- --save-interval
|
| 134 |
+
- "1600"
|
| 135 |
+
- --eval-interval
|
| 136 |
+
- "1600"
|
| 137 |
+
- --eval-iters
|
| 138 |
+
- "2"
|
| 139 |
+
- --save
|
| 140 |
+
- poziomka_5
|
| 141 |
+
- --ckpt-format
|
| 142 |
+
- torch_dist
|
| 143 |
+
- --async-save
|
| 144 |
+
- --log-interval
|
| 145 |
+
- "1"
|
| 146 |
+
- --log-throughput
|
| 147 |
+
- --tensorboard-dir
|
| 148 |
+
- poziomka_5/runs
|
| 149 |
+
- --log-timers-to-tensorboard
|
| 150 |
+
- --log-memory-to-tensorboard
|
| 151 |
+
- --log-world-size-to-tensorboard
|
| 152 |
+
- --log-validation-ppl-to-tensorboard
|
| 153 |
+
- --wandb-project
|
| 154 |
+
- poziomka
|
| 155 |
+
- --wandb-exp-name
|
| 156 |
+
- poziomka_5
|
| 157 |
+
- --attention-backend
|
| 158 |
+
- flash
|
| 159 |
+
- --no-masked-softmax-fusion
|
| 160 |
+
- --attention-softmax-in-fp32
|
| 161 |
+
- --cross-entropy-loss-fusion
|
| 162 |
+
- --mtp-num-layers
|
| 163 |
+
- "0"
|
| 164 |
+
codePath: Megatron-LM-core_v0.13.0/pretrain_gpt.py
|
| 165 |
+
codePathLocal: Megatron-LM-core_v0.13.0/pretrain_gpt.py
|
| 166 |
+
cpu_count: 128
|
| 167 |
+
cpu_count_logical: 128
|
| 168 |
+
cudaVersion: "12.8"
|
| 169 |
+
disk:
|
| 170 |
+
/:
|
| 171 |
+
total: "2907329073152"
|
| 172 |
+
used: "390033698816"
|
| 173 |
+
email: adamo1139@gmail.com
|
| 174 |
+
executable: /usr/bin/python3
|
| 175 |
+
git:
|
| 176 |
+
commit: e3867293ebf444f614164c2b84180cd75e7de07c
|
| 177 |
+
remote: https://github.com/adamo1139/Ling-V2.git
|
| 178 |
+
gpu: NVIDIA H100 80GB HBM3
|
| 179 |
+
gpu_count: 8
|
| 180 |
+
gpu_nvidia:
|
| 181 |
+
- architecture: Hopper
|
| 182 |
+
cudaCores: 16896
|
| 183 |
+
memoryTotal: "85520809984"
|
| 184 |
+
name: NVIDIA H100 80GB HBM3
|
| 185 |
+
uuid: GPU-b4eb54a1-d73f-9179-04f2-b231b6a39a34
|
| 186 |
+
- architecture: Hopper
|
| 187 |
+
cudaCores: 16896
|
| 188 |
+
memoryTotal: "85520809984"
|
| 189 |
+
name: NVIDIA H100 80GB HBM3
|
| 190 |
+
uuid: GPU-742b8534-6865-3da0-d864-a822a5d5d629
|
| 191 |
+
- architecture: Hopper
|
| 192 |
+
cudaCores: 16896
|
| 193 |
+
memoryTotal: "85520809984"
|
| 194 |
+
name: NVIDIA H100 80GB HBM3
|
| 195 |
+
uuid: GPU-19d31f38-1be1-eced-5b78-b0d3f4deae56
|
| 196 |
+
- architecture: Hopper
|
| 197 |
+
cudaCores: 16896
|
| 198 |
+
memoryTotal: "85520809984"
|
| 199 |
+
name: NVIDIA H100 80GB HBM3
|
| 200 |
+
uuid: GPU-1b41c967-636a-e3f1-5d74-2da616c06a3e
|
| 201 |
+
- architecture: Hopper
|
| 202 |
+
cudaCores: 16896
|
| 203 |
+
memoryTotal: "85520809984"
|
| 204 |
+
name: NVIDIA H100 80GB HBM3
|
| 205 |
+
uuid: GPU-51d2a37c-3157-7b61-73b3-4a7914884549
|
| 206 |
+
- architecture: Hopper
|
| 207 |
+
cudaCores: 16896
|
| 208 |
+
memoryTotal: "85520809984"
|
| 209 |
+
name: NVIDIA H100 80GB HBM3
|
| 210 |
+
uuid: GPU-499c302d-fc3c-c679-61e7-a2c4d46b8449
|
| 211 |
+
- architecture: Hopper
|
| 212 |
+
cudaCores: 16896
|
| 213 |
+
memoryTotal: "85520809984"
|
| 214 |
+
name: NVIDIA H100 80GB HBM3
|
| 215 |
+
uuid: GPU-8eac440c-b327-20e6-1809-ef3e549d6c6d
|
| 216 |
+
- architecture: Hopper
|
| 217 |
+
cudaCores: 16896
|
| 218 |
+
memoryTotal: "85520809984"
|
| 219 |
+
name: NVIDIA H100 80GB HBM3
|
| 220 |
+
uuid: GPU-5eba09b7-f1fc-555e-d07c-7b6227264759
|
| 221 |
+
host: megatron6
|
| 222 |
+
memory:
|
| 223 |
+
total: "1014522519552"
|
| 224 |
+
os: Linux-5.15.0-143-generic-x86_64-with-glibc2.35
|
| 225 |
+
program: /home/ubuntu/training/Ling-V2/Megatron-LM-core_v0.13.0/pretrain_gpt.py
|
| 226 |
+
python: CPython 3.10.12
|
| 227 |
+
root: poziomka_5/wandb
|
| 228 |
+
startedAt: "2025-09-22T22:04:05.928052Z"
|
| 229 |
+
writerId: 9vte3cwjfuxykvlnatinaorhdm7hrpxl
|
| 230 |
+
m: []
|
| 231 |
+
python_version: 3.10.12
|
| 232 |
+
t:
|
| 233 |
+
"1":
|
| 234 |
+
- 1
|
| 235 |
+
- 11
|
| 236 |
+
- 49
|
| 237 |
+
"2":
|
| 238 |
+
- 1
|
| 239 |
+
- 11
|
| 240 |
+
- 49
|
| 241 |
+
"3":
|
| 242 |
+
- 13
|
| 243 |
+
- 16
|
| 244 |
+
- 61
|
| 245 |
+
"4": 3.10.12
|
| 246 |
+
"5": 0.22.0
|
| 247 |
+
"6": 4.56.2
|
| 248 |
+
"12": 0.22.0
|
| 249 |
+
"13": linux-x86_64
|
| 250 |
+
account_for_embedding_in_pipeline_split:
|
| 251 |
+
value: false
|
| 252 |
+
account_for_loss_in_pipeline_split:
|
| 253 |
+
value: false
|
| 254 |
+
accumulate_allreduce_grads_in_fp32:
|
| 255 |
+
value: true
|
| 256 |
+
adam_beta1:
|
| 257 |
+
value: 0.9
|
| 258 |
+
adam_beta2:
|
| 259 |
+
value: 0.95
|
| 260 |
+
adam_eps:
|
| 261 |
+
value: 1e-08
|
| 262 |
+
add_bias_linear:
|
| 263 |
+
value: false
|
| 264 |
+
add_position_embedding:
|
| 265 |
+
value: true
|
| 266 |
+
add_qkv_bias:
|
| 267 |
+
value: false
|
| 268 |
+
adlr_autoresume:
|
| 269 |
+
value: false
|
| 270 |
+
adlr_autoresume_interval:
|
| 271 |
+
value: 1000
|
| 272 |
+
align_grad_reduce:
|
| 273 |
+
value: true
|
| 274 |
+
align_param_gather:
|
| 275 |
+
value: false
|
| 276 |
+
app_tag_run_name:
|
| 277 |
+
value: null
|
| 278 |
+
app_tag_run_version:
|
| 279 |
+
value: 0.0.0
|
| 280 |
+
apply_layernorm_1p:
|
| 281 |
+
value: false
|
| 282 |
+
apply_query_key_layer_scaling:
|
| 283 |
+
value: false
|
| 284 |
+
apply_residual_connection_post_layernorm:
|
| 285 |
+
value: false
|
| 286 |
+
apply_rope_fusion:
|
| 287 |
+
value: true
|
| 288 |
+
async_save:
|
| 289 |
+
value: true
|
| 290 |
+
async_tensor_model_parallel_allreduce:
|
| 291 |
+
value: true
|
| 292 |
+
attention_backend:
|
| 293 |
+
value: flash
|
| 294 |
+
attention_dropout:
|
| 295 |
+
value: 0
|
| 296 |
+
attention_softmax_in_fp32:
|
| 297 |
+
value: true
|
| 298 |
+
auto_detect_ckpt_format:
|
| 299 |
+
value: false
|
| 300 |
+
barrier_with_L1_time:
|
| 301 |
+
value: true
|
| 302 |
+
bert_binary_head:
|
| 303 |
+
value: true
|
| 304 |
+
bert_embedder_type:
|
| 305 |
+
value: megatron
|
| 306 |
+
bert_load:
|
| 307 |
+
value: null
|
| 308 |
+
bf16:
|
| 309 |
+
value: true
|
| 310 |
+
bias_dropout_fusion:
|
| 311 |
+
value: true
|
| 312 |
+
bias_gelu_fusion:
|
| 313 |
+
value: false
|
| 314 |
+
bias_swiglu_fusion:
|
| 315 |
+
value: true
|
| 316 |
+
bias_zero_mean_update:
|
| 317 |
+
value: true
|
| 318 |
+
biencoder_projection_dim:
|
| 319 |
+
value: 0
|
| 320 |
+
biencoder_shared_query_context_model:
|
| 321 |
+
value: false
|
| 322 |
+
block_data_path:
|
| 323 |
+
value: null
|
| 324 |
+
calc_ft_timeouts:
|
| 325 |
+
value: false
|
| 326 |
+
calculate_per_token_loss:
|
| 327 |
+
value: false
|
| 328 |
+
check_for_large_grads:
|
| 329 |
+
value: false
|
| 330 |
+
check_for_nan_in_loss_and_grad:
|
| 331 |
+
value: true
|
| 332 |
+
check_for_spiky_loss:
|
| 333 |
+
value: false
|
| 334 |
+
check_weight_hash_across_dp_replicas_interval:
|
| 335 |
+
value: null
|
| 336 |
+
ckpt_assume_constant_structure:
|
| 337 |
+
value: false
|
| 338 |
+
ckpt_convert_format:
|
| 339 |
+
value: null
|
| 340 |
+
ckpt_convert_save:
|
| 341 |
+
value: null
|
| 342 |
+
ckpt_convert_update_legacy_dist_opt_format:
|
| 343 |
+
value: false
|
| 344 |
+
ckpt_format:
|
| 345 |
+
value: torch_dist
|
| 346 |
+
ckpt_fully_parallel_load:
|
| 347 |
+
value: false
|
| 348 |
+
ckpt_fully_parallel_save:
|
| 349 |
+
value: true
|
| 350 |
+
ckpt_fully_parallel_save_deprecated:
|
| 351 |
+
value: false
|
| 352 |
+
ckpt_step:
|
| 353 |
+
value: null
|
| 354 |
+
classes_fraction:
|
| 355 |
+
value: 1
|
| 356 |
+
clip_grad:
|
| 357 |
+
value: 1
|
| 358 |
+
clone_scatter_output_in_embedding:
|
| 359 |
+
value: true
|
| 360 |
+
config_logger_dir:
|
| 361 |
+
value: ""
|
| 362 |
+
consumed_train_samples:
|
| 363 |
+
value: 0
|
| 364 |
+
consumed_valid_samples:
|
| 365 |
+
value: 0
|
| 366 |
+
context_parallel_size:
|
| 367 |
+
value: 1
|
| 368 |
+
cp_comm_type:
|
| 369 |
+
value:
|
| 370 |
+
- p2p
|
| 371 |
+
create_attention_mask_in_dataloader:
|
| 372 |
+
value: false
|
| 373 |
+
cross_entropy_fusion_impl:
|
| 374 |
+
value: native
|
| 375 |
+
cross_entropy_loss_fusion:
|
| 376 |
+
value: true
|
| 377 |
+
cuda_graph_scope:
|
| 378 |
+
value: full
|
| 379 |
+
cuda_graph_warmup_steps:
|
| 380 |
+
value: 3
|
| 381 |
+
data_args_path:
|
| 382 |
+
value: null
|
| 383 |
+
data_cache_path:
|
| 384 |
+
value: null
|
| 385 |
+
data_parallel_random_init:
|
| 386 |
+
value: false
|
| 387 |
+
data_parallel_sharding_strategy:
|
| 388 |
+
value: no_shard
|
| 389 |
+
data_parallel_size:
|
| 390 |
+
value: 2
|
| 391 |
+
data_path:
|
| 392 |
+
value:
|
| 393 |
+
- szypulka_tokenized_apt4_merged/apt4_merged_text_document
|
| 394 |
+
data_per_class_fraction:
|
| 395 |
+
value: 1
|
| 396 |
+
data_sharding:
|
| 397 |
+
value: true
|
| 398 |
+
dataloader_type:
|
| 399 |
+
value: single
|
| 400 |
+
ddp_average_in_collective:
|
| 401 |
+
value: false
|
| 402 |
+
ddp_bucket_size:
|
| 403 |
+
value: null
|
| 404 |
+
ddp_num_buckets:
|
| 405 |
+
value: null
|
| 406 |
+
ddp_pad_buckets_for_high_nccl_busbw:
|
| 407 |
+
value: false
|
| 408 |
+
decoder_first_pipeline_num_layers:
|
| 409 |
+
value: null
|
| 410 |
+
decoder_last_pipeline_num_layers:
|
| 411 |
+
value: null
|
| 412 |
+
decoder_num_layers:
|
| 413 |
+
value: null
|
| 414 |
+
decoder_seq_length:
|
| 415 |
+
value: null
|
| 416 |
+
decoupled_lr:
|
| 417 |
+
value: null
|
| 418 |
+
decoupled_min_lr:
|
| 419 |
+
value: null
|
| 420 |
+
decrease_batch_size_if_needed:
|
| 421 |
+
value: false
|
| 422 |
+
defer_embedding_wgrad_compute:
|
| 423 |
+
value: false
|
| 424 |
+
delay_wgrad_compute:
|
| 425 |
+
value: false
|
| 426 |
+
deprecated_use_mcore_models:
|
| 427 |
+
value: false
|
| 428 |
+
deterministic_mode:
|
| 429 |
+
value: false
|
| 430 |
+
dino_bottleneck_size:
|
| 431 |
+
value: 256
|
| 432 |
+
dino_freeze_last_layer:
|
| 433 |
+
value: 1
|
| 434 |
+
dino_head_hidden_size:
|
| 435 |
+
value: 2048
|
| 436 |
+
dino_local_crops_number:
|
| 437 |
+
value: 10
|
| 438 |
+
dino_local_img_size:
|
| 439 |
+
value: 96
|
| 440 |
+
dino_norm_last_layer:
|
| 441 |
+
value: false
|
| 442 |
+
dino_teacher_temp:
|
| 443 |
+
value: 0.07
|
| 444 |
+
dino_warmup_teacher_temp:
|
| 445 |
+
value: 0.04
|
| 446 |
+
dino_warmup_teacher_temp_epochs:
|
| 447 |
+
value: 30
|
| 448 |
+
disable_bf16_reduced_precision_matmul:
|
| 449 |
+
value: false
|
| 450 |
+
disable_mamba_mem_eff_path:
|
| 451 |
+
value: false
|
| 452 |
+
disable_straggler_on_startup:
|
| 453 |
+
value: false
|
| 454 |
+
dist_ckpt_format_deprecated:
|
| 455 |
+
value: null
|
| 456 |
+
dist_ckpt_strictness:
|
| 457 |
+
value: assume_ok_unexpected
|
| 458 |
+
distribute_saved_activations:
|
| 459 |
+
value: false
|
| 460 |
+
distributed_backend:
|
| 461 |
+
value: nccl
|
| 462 |
+
distributed_timeout_minutes:
|
| 463 |
+
value: 10
|
| 464 |
+
embedding_path:
|
| 465 |
+
value: null
|
| 466 |
+
empty_unused_memory_level:
|
| 467 |
+
value: 0
|
| 468 |
+
enable_cuda_graph:
|
| 469 |
+
value: false
|
| 470 |
+
enable_experimental:
|
| 471 |
+
value: false
|
| 472 |
+
enable_ft_package:
|
| 473 |
+
value: false
|
| 474 |
+
enable_gloo_process_groups:
|
| 475 |
+
value: true
|
| 476 |
+
enable_msc:
|
| 477 |
+
value: true
|
| 478 |
+
enable_one_logger:
|
| 479 |
+
value: true
|
| 480 |
+
encoder_num_layers:
|
| 481 |
+
value: 16
|
| 482 |
+
encoder_pipeline_model_parallel_size:
|
| 483 |
+
value: 0
|
| 484 |
+
encoder_seq_length:
|
| 485 |
+
value: 8192
|
| 486 |
+
encoder_tensor_model_parallel_size:
|
| 487 |
+
value: 0
|
| 488 |
+
end_weight_decay:
|
| 489 |
+
value: 0.1
|
| 490 |
+
eod_mask_loss:
|
| 491 |
+
value: true
|
| 492 |
+
error_injection_rate:
|
| 493 |
+
value: 0
|
| 494 |
+
error_injection_type:
|
| 495 |
+
value: transient_error
|
| 496 |
+
eval_interval:
|
| 497 |
+
value: 1600
|
| 498 |
+
eval_iters:
|
| 499 |
+
value: 2
|
| 500 |
+
evidence_data_path:
|
| 501 |
+
value: null
|
| 502 |
+
exit_duration_in_mins:
|
| 503 |
+
value: null
|
| 504 |
+
exit_interval:
|
| 505 |
+
value: null
|
| 506 |
+
exit_on_missing_checkpoint:
|
| 507 |
+
value: false
|
| 508 |
+
exit_signal_handler:
|
| 509 |
+
value: false
|
| 510 |
+
exp_avg_dtype:
|
| 511 |
+
value: torch.float32
|
| 512 |
+
exp_avg_sq_dtype:
|
| 513 |
+
value: torch.float32
|
| 514 |
+
expert_model_parallel_size:
|
| 515 |
+
value: 2
|
| 516 |
+
expert_tensor_parallel_size:
|
| 517 |
+
value: 1
|
| 518 |
+
external_cuda_graph:
|
| 519 |
+
value: false
|
| 520 |
+
ffn_hidden_size:
|
| 521 |
+
value: 2048
|
| 522 |
+
finetune:
|
| 523 |
+
value: false
|
| 524 |
+
first_last_layers_bf16:
|
| 525 |
+
value: false
|
| 526 |
+
flash_decode:
|
| 527 |
+
value: false
|
| 528 |
+
fp8:
|
| 529 |
+
value: null
|
| 530 |
+
fp8_amax_compute_algo:
|
| 531 |
+
value: most_recent
|
| 532 |
+
fp8_amax_history_len:
|
| 533 |
+
value: 1
|
| 534 |
+
fp8_interval:
|
| 535 |
+
value: 1
|
| 536 |
+
fp8_margin:
|
| 537 |
+
value: 0
|
| 538 |
+
fp8_param_gather:
|
| 539 |
+
value: false
|
| 540 |
+
fp8_recipe:
|
| 541 |
+
value: delayed
|
| 542 |
+
fp8_wgrad:
|
| 543 |
+
value: true
|
| 544 |
+
fp16:
|
| 545 |
+
value: false
|
| 546 |
+
fp16_lm_cross_entropy:
|
| 547 |
+
value: false
|
| 548 |
+
fp32_residual_connection:
|
| 549 |
+
value: false
|
| 550 |
+
fsdp_double_buffer:
|
| 551 |
+
value: false
|
| 552 |
+
global_batch_size:
|
| 553 |
+
value: 256
|
| 554 |
+
grad_reduce_in_bf16:
|
| 555 |
+
value: false
|
| 556 |
+
gradient_accumulation_fusion:
|
| 557 |
+
value: true
|
| 558 |
+
gradient_reduce_div_fusion:
|
| 559 |
+
value: true
|
| 560 |
+
group_query_attention:
|
| 561 |
+
value: true
|
| 562 |
+
head_lr_mult:
|
| 563 |
+
value: 1
|
| 564 |
+
heterogeneous_layers_config_encoded_json:
|
| 565 |
+
value: null
|
| 566 |
+
heterogeneous_layers_config_path:
|
| 567 |
+
value: null
|
| 568 |
+
hidden_dropout:
|
| 569 |
+
value: 0
|
| 570 |
+
hidden_size:
|
| 571 |
+
value: 2048
|
| 572 |
+
hierarchical_context_parallel_sizes:
|
| 573 |
+
value: null
|
| 574 |
+
high_priority_stream_groups:
|
| 575 |
+
value: []
|
| 576 |
+
hybrid_attention_ratio:
|
| 577 |
+
value: 0
|
| 578 |
+
hybrid_mlp_ratio:
|
| 579 |
+
value: 0
|
| 580 |
+
hybrid_override_pattern:
|
| 581 |
+
value: null
|
| 582 |
+
hysteresis:
|
| 583 |
+
value: 2
|
| 584 |
+
ict_head_size:
|
| 585 |
+
value: null
|
| 586 |
+
ict_load:
|
| 587 |
+
value: null
|
| 588 |
+
img_h:
|
| 589 |
+
value: 224
|
| 590 |
+
img_w:
|
| 591 |
+
value: 224
|
| 592 |
+
indexer_batch_size:
|
| 593 |
+
value: 128
|
| 594 |
+
indexer_log_interval:
|
| 595 |
+
value: 1000
|
| 596 |
+
inference_batch_times_seqlen_threshold:
|
| 597 |
+
value: -1
|
| 598 |
+
inference_dynamic_batching:
|
| 599 |
+
value: false
|
| 600 |
+
inference_dynamic_batching_buffer_guaranteed_fraction:
|
| 601 |
+
value: 0.2
|
| 602 |
+
inference_dynamic_batching_buffer_overflow_factor:
|
| 603 |
+
value: null
|
| 604 |
+
inference_dynamic_batching_buffer_size_gb:
|
| 605 |
+
value: 40
|
| 606 |
+
inference_dynamic_batching_chunk_size:
|
| 607 |
+
value: 256
|
| 608 |
+
inference_dynamic_batching_max_requests_override:
|
| 609 |
+
value: null
|
| 610 |
+
inference_dynamic_batching_max_tokens_override:
|
| 611 |
+
value: null
|
| 612 |
+
inference_max_batch_size:
|
| 613 |
+
value: 8
|
| 614 |
+
inference_max_seq_length:
|
| 615 |
+
value: 2560
|
| 616 |
+
inference_rng_tracker:
|
| 617 |
+
value: false
|
| 618 |
+
init_method_std:
|
| 619 |
+
value: 0.02
|
| 620 |
+
init_method_xavier_uniform:
|
| 621 |
+
value: false
|
| 622 |
+
init_model_with_meta_device:
|
| 623 |
+
value: false
|
| 624 |
+
initial_loss_scale:
|
| 625 |
+
value: 4294967296
|
| 626 |
+
inprocess_active_world_size:
|
| 627 |
+
value: 8
|
| 628 |
+
inprocess_barrier_timeout:
|
| 629 |
+
value: 120
|
| 630 |
+
inprocess_completion_timeout:
|
| 631 |
+
value: 120
|
| 632 |
+
inprocess_empty_cuda_cache:
|
| 633 |
+
value: false
|
| 634 |
+
inprocess_granularity:
|
| 635 |
+
value: node
|
| 636 |
+
inprocess_hard_timeout:
|
| 637 |
+
value: 90
|
| 638 |
+
inprocess_heartbeat_interval:
|
| 639 |
+
value: 30
|
| 640 |
+
inprocess_heartbeat_timeout:
|
| 641 |
+
value: 60
|
| 642 |
+
inprocess_last_call_wait:
|
| 643 |
+
value: 1
|
| 644 |
+
inprocess_max_iterations:
|
| 645 |
+
value: null
|
| 646 |
+
inprocess_monitor_process_interval:
|
| 647 |
+
value: 1
|
| 648 |
+
inprocess_monitor_thread_interval:
|
| 649 |
+
value: 1
|
| 650 |
+
inprocess_progress_watchdog_interval:
|
| 651 |
+
value: 1
|
| 652 |
+
inprocess_restart:
|
| 653 |
+
value: false
|
| 654 |
+
inprocess_soft_timeout:
|
| 655 |
+
value: 60
|
| 656 |
+
inprocess_termination_grace_time:
|
| 657 |
+
value: 1
|
| 658 |
+
is_hybrid_model:
|
| 659 |
+
value: false
|
| 660 |
+
iter_per_epoch:
|
| 661 |
+
value: 1250
|
| 662 |
+
iterations_to_skip:
|
| 663 |
+
value: []
|
| 664 |
+
keep_fp8_transpose_cache_when_using_custom_fsdp:
|
| 665 |
+
value: false
|
| 666 |
+
kitchen_config_file:
|
| 667 |
+
value: null
|
| 668 |
+
kitchen_recipe_number:
|
| 669 |
+
value: null
|
| 670 |
+
kv_channels:
|
| 671 |
+
value: 128
|
| 672 |
+
kv_lora_rank:
|
| 673 |
+
value: 32
|
| 674 |
+
lazy_mpu_init:
|
| 675 |
+
value: null
|
| 676 |
+
load:
|
| 677 |
+
value: null
|
| 678 |
+
load_model_opt_format:
|
| 679 |
+
value: false
|
| 680 |
+
local_rank:
|
| 681 |
+
value: 7
|
| 682 |
+
log_energy:
|
| 683 |
+
value: false
|
| 684 |
+
log_interval:
|
| 685 |
+
value: 1
|
| 686 |
+
log_loss_scale_to_tensorboard:
|
| 687 |
+
value: true
|
| 688 |
+
log_memory_to_tensorboard:
|
| 689 |
+
value: true
|
| 690 |
+
log_num_zeros_in_grad:
|
| 691 |
+
value: false
|
| 692 |
+
log_params_norm:
|
| 693 |
+
value: false
|
| 694 |
+
log_progress:
|
| 695 |
+
value: false
|
| 696 |
+
log_straggler:
|
| 697 |
+
value: false
|
| 698 |
+
log_throughput:
|
| 699 |
+
value: true
|
| 700 |
+
log_timers_to_tensorboard:
|
| 701 |
+
value: true
|
| 702 |
+
log_validation_ppl_to_tensorboard:
|
| 703 |
+
value: true
|
| 704 |
+
log_world_size_to_tensorboard:
|
| 705 |
+
value: true
|
| 706 |
+
logging_level:
|
| 707 |
+
value: null
|
| 708 |
+
loss_scale:
|
| 709 |
+
value: null
|
| 710 |
+
loss_scale_window:
|
| 711 |
+
value: 1000
|
| 712 |
+
lr:
|
| 713 |
+
value: 0.0008
|
| 714 |
+
lr_decay_iters:
|
| 715 |
+
value: null
|
| 716 |
+
lr_decay_samples:
|
| 717 |
+
value: null
|
| 718 |
+
lr_decay_style:
|
| 719 |
+
value: cosine
|
| 720 |
+
lr_warmup_fraction:
|
| 721 |
+
value: null
|
| 722 |
+
lr_warmup_init:
|
| 723 |
+
value: 0
|
| 724 |
+
lr_warmup_iters:
|
| 725 |
+
value: 100
|
| 726 |
+
lr_warmup_samples:
|
| 727 |
+
value: 0
|
| 728 |
+
lr_wsd_decay_iters:
|
| 729 |
+
value: null
|
| 730 |
+
lr_wsd_decay_samples:
|
| 731 |
+
value: null
|
| 732 |
+
lr_wsd_decay_style:
|
| 733 |
+
value: exponential
|
| 734 |
+
main_grads_dtype:
|
| 735 |
+
value: torch.float32
|
| 736 |
+
main_params_dtype:
|
| 737 |
+
value: torch.float32
|
| 738 |
+
make_vocab_size_divisible_by:
|
| 739 |
+
value: 128
|
| 740 |
+
mamba_head_dim:
|
| 741 |
+
value: 64
|
| 742 |
+
mamba_num_groups:
|
| 743 |
+
value: 8
|
| 744 |
+
mamba_num_heads:
|
| 745 |
+
value: null
|
| 746 |
+
mamba_state_dim:
|
| 747 |
+
value: 128
|
| 748 |
+
manual_gc:
|
| 749 |
+
value: false
|
| 750 |
+
manual_gc_eval:
|
| 751 |
+
value: true
|
| 752 |
+
manual_gc_interval:
|
| 753 |
+
value: 0
|
| 754 |
+
mask_factor:
|
| 755 |
+
value: 1
|
| 756 |
+
mask_prob:
|
| 757 |
+
value: 0.15
|
| 758 |
+
mask_type:
|
| 759 |
+
value: random
|
| 760 |
+
masked_softmax_fusion:
|
| 761 |
+
value: false
|
| 762 |
+
max_position_embeddings:
|
| 763 |
+
value: 8192
|
| 764 |
+
max_tokens_to_oom:
|
| 765 |
+
value: 12000
|
| 766 |
+
memory_snapshot_path:
|
| 767 |
+
value: snapshot.pickle
|
| 768 |
+
merge_file:
|
| 769 |
+
value: null
|
| 770 |
+
micro_batch_size:
|
| 771 |
+
value: 8
|
| 772 |
+
microbatch_group_size_per_vp_stage:
|
| 773 |
+
value: null
|
| 774 |
+
mid_level_dataset_surplus:
|
| 775 |
+
value: 0.005
|
| 776 |
+
min_loss_scale:
|
| 777 |
+
value: 1
|
| 778 |
+
min_lr:
|
| 779 |
+
value: 4e-05
|
| 780 |
+
mlp_chunks_for_prefill:
|
| 781 |
+
value: 1
|
| 782 |
+
mmap_bin_files:
|
| 783 |
+
value: true
|
| 784 |
+
mock_data:
|
| 785 |
+
value: false
|
| 786 |
+
moe_apply_probs_on_input:
|
| 787 |
+
value: false
|
| 788 |
+
moe_aux_loss_coeff:
|
| 789 |
+
value: 0
|
| 790 |
+
moe_deepep_num_sms:
|
| 791 |
+
value: 20
|
| 792 |
+
moe_enable_deepep:
|
| 793 |
+
value: false
|
| 794 |
+
moe_expert_capacity_factor:
|
| 795 |
+
value: 1.25
|
| 796 |
+
moe_extended_tp:
|
| 797 |
+
value: false
|
| 798 |
+
moe_ffn_hidden_size:
|
| 799 |
+
value: 320
|
| 800 |
+
moe_grouped_gemm:
|
| 801 |
+
value: true
|
| 802 |
+
moe_input_jitter_eps:
|
| 803 |
+
value: null
|
| 804 |
+
moe_layer_freq:
|
| 805 |
+
value:
|
| 806 |
+
- 0
|
| 807 |
+
- 1
|
| 808 |
+
- 1
|
| 809 |
+
- 1
|
| 810 |
+
- 1
|
| 811 |
+
- 1
|
| 812 |
+
- 1
|
| 813 |
+
- 1
|
| 814 |
+
- 1
|
| 815 |
+
- 1
|
| 816 |
+
- 1
|
| 817 |
+
- 1
|
| 818 |
+
- 1
|
| 819 |
+
- 1
|
| 820 |
+
- 1
|
| 821 |
+
- 1
|
| 822 |
+
moe_layer_recompute:
|
| 823 |
+
value: false
|
| 824 |
+
moe_pad_expert_input_to_capacity:
|
| 825 |
+
value: true
|
| 826 |
+
moe_per_layer_logging:
|
| 827 |
+
value: false
|
| 828 |
+
moe_permute_fusion:
|
| 829 |
+
value: false
|
| 830 |
+
moe_router_bias_update_rate:
|
| 831 |
+
value: 0.001
|
| 832 |
+
moe_router_dtype:
|
| 833 |
+
value: fp32
|
| 834 |
+
moe_router_enable_expert_bias:
|
| 835 |
+
value: true
|
| 836 |
+
moe_router_force_load_balancing:
|
| 837 |
+
value: false
|
| 838 |
+
moe_router_group_topk:
|
| 839 |
+
value: 2
|
| 840 |
+
moe_router_load_balancing_type:
|
| 841 |
+
value: aux_loss
|
| 842 |
+
moe_router_num_groups:
|
| 843 |
+
value: 8
|
| 844 |
+
moe_router_padding_for_fp8:
|
| 845 |
+
value: false
|
| 846 |
+
moe_router_pre_softmax:
|
| 847 |
+
value: false
|
| 848 |
+
moe_router_score_function:
|
| 849 |
+
value: sigmoid
|
| 850 |
+
moe_router_topk:
|
| 851 |
+
value: 4
|
| 852 |
+
moe_router_topk_scaling_factor:
|
| 853 |
+
value: 2.5
|
| 854 |
+
moe_shared_expert_intermediate_size:
|
| 855 |
+
value: 320
|
| 856 |
+
moe_shared_expert_overlap:
|
| 857 |
+
value: true
|
| 858 |
+
moe_token_dispatcher_type:
|
| 859 |
+
value: alltoall
|
| 860 |
+
moe_token_drop_policy:
|
| 861 |
+
value: probs
|
| 862 |
+
moe_upcycling_granularity:
|
| 863 |
+
value: 1
|
| 864 |
+
moe_use_legacy_grouped_gemm:
|
| 865 |
+
value: false
|
| 866 |
+
moe_use_upcycling:
|
| 867 |
+
value: false
|
| 868 |
+
moe_z_loss_coeff:
|
| 869 |
+
value: 3.5e-06
|
| 870 |
+
mrope_section:
|
| 871 |
+
value: null
|
| 872 |
+
mscale:
|
| 873 |
+
value: 1
|
| 874 |
+
mscale_all_dim:
|
| 875 |
+
value: 1
|
| 876 |
+
mtp_loss_scaling_factor:
|
| 877 |
+
value: 0.1
|
| 878 |
+
mtp_num_layers:
|
| 879 |
+
value: 0
|
| 880 |
+
multi_latent_attention:
|
| 881 |
+
value: false
|
| 882 |
+
nccl_all_reduce_for_prefill:
|
| 883 |
+
value: false
|
| 884 |
+
nccl_communicator_config_path:
|
| 885 |
+
value: null
|
| 886 |
+
nccl_ub:
|
| 887 |
+
value: false
|
| 888 |
+
no_load_optim:
|
| 889 |
+
value: null
|
| 890 |
+
no_load_rng:
|
| 891 |
+
value: null
|
| 892 |
+
no_persist_layer_norm:
|
| 893 |
+
value: false
|
| 894 |
+
no_rope_freq:
|
| 895 |
+
value: null
|
| 896 |
+
no_save_optim:
|
| 897 |
+
value: null
|
| 898 |
+
no_save_rng:
|
| 899 |
+
value: null
|
| 900 |
+
non_persistent_ckpt_type:
|
| 901 |
+
value: null
|
| 902 |
+
non_persistent_global_ckpt_dir:
|
| 903 |
+
value: null
|
| 904 |
+
non_persistent_local_ckpt_algo:
|
| 905 |
+
value: fully_parallel
|
| 906 |
+
non_persistent_local_ckpt_dir:
|
| 907 |
+
value: null
|
| 908 |
+
non_persistent_save_interval:
|
| 909 |
+
value: null
|
| 910 |
+
norm_epsilon:
|
| 911 |
+
value: 1e-06
|
| 912 |
+
normalization:
|
| 913 |
+
value: RMSNorm
|
| 914 |
+
num_attention_heads:
|
| 915 |
+
value: 16
|
| 916 |
+
num_channels:
|
| 917 |
+
value: 3
|
| 918 |
+
num_classes:
|
| 919 |
+
value: 1000
|
| 920 |
+
num_dataset_builder_threads:
|
| 921 |
+
value: 1
|
| 922 |
+
num_distributed_optimizer_instances:
|
| 923 |
+
value: 1
|
| 924 |
+
num_experts:
|
| 925 |
+
value: 128
|
| 926 |
+
num_layers:
|
| 927 |
+
value: 16
|
| 928 |
+
num_layers_at_end_in_bf16:
|
| 929 |
+
value: 1
|
| 930 |
+
num_layers_at_start_in_bf16:
|
| 931 |
+
value: 1
|
| 932 |
+
num_layers_per_virtual_pipeline_stage:
|
| 933 |
+
value: null
|
| 934 |
+
num_query_groups:
|
| 935 |
+
value: 4
|
| 936 |
+
num_virtual_stages_per_pipeline_rank:
|
| 937 |
+
value: null
|
| 938 |
+
num_workers:
|
| 939 |
+
value: 2
|
| 940 |
+
object_storage_cache_path:
|
| 941 |
+
value: null
|
| 942 |
+
one_logger_async:
|
| 943 |
+
value: false
|
| 944 |
+
one_logger_project:
|
| 945 |
+
value: megatron-lm
|
| 946 |
+
one_logger_run_name:
|
| 947 |
+
value: null
|
| 948 |
+
onnx_safe:
|
| 949 |
+
value: null
|
| 950 |
+
openai_gelu:
|
| 951 |
+
value: false
|
| 952 |
+
optimizer:
|
| 953 |
+
value: adam
|
| 954 |
+
optimizer_cpu_offload:
|
| 955 |
+
value: false
|
| 956 |
+
optimizer_offload_fraction:
|
| 957 |
+
value: 1
|
| 958 |
+
output_bert_embeddings:
|
| 959 |
+
value: false
|
| 960 |
+
overlap_cpu_optimizer_d2h_h2d:
|
| 961 |
+
value: false
|
| 962 |
+
overlap_grad_reduce:
|
| 963 |
+
value: true
|
| 964 |
+
overlap_p2p_comm:
|
| 965 |
+
value: false
|
| 966 |
+
overlap_p2p_comm_warmup_flush:
|
| 967 |
+
value: false
|
| 968 |
+
overlap_param_gather:
|
| 969 |
+
value: false
|
| 970 |
+
overlap_param_gather_with_optimizer_step:
|
| 971 |
+
value: false
|
| 972 |
+
override_opt_param_scheduler:
|
| 973 |
+
value: false
|
| 974 |
+
padded_vocab_size:
|
| 975 |
+
value: 32256
|
| 976 |
+
params_dtype:
|
| 977 |
+
value: torch.bfloat16
|
| 978 |
+
patch_dim:
|
| 979 |
+
value: 16
|
| 980 |
+
per_split_data_args_path:
|
| 981 |
+
value: null
|
| 982 |
+
perform_initialization:
|
| 983 |
+
value: true
|
| 984 |
+
pin_cpu_grads:
|
| 985 |
+
value: true
|
| 986 |
+
pin_cpu_params:
|
| 987 |
+
value: true
|
| 988 |
+
pipeline_model_parallel_comm_backend:
|
| 989 |
+
value: null
|
| 990 |
+
pipeline_model_parallel_layout:
|
| 991 |
+
value: null
|
| 992 |
+
pipeline_model_parallel_size:
|
| 993 |
+
value: 1
|
| 994 |
+
pipeline_model_parallel_split_rank:
|
| 995 |
+
value: null
|
| 996 |
+
position_embedding_type:
|
| 997 |
+
value: rope
|
| 998 |
+
pretrained_checkpoint:
|
| 999 |
+
value: null
|
| 1000 |
+
profile:
|
| 1001 |
+
value: false
|
| 1002 |
+
profile_ranks:
|
| 1003 |
+
value:
|
| 1004 |
+
- 0
|
| 1005 |
+
profile_step_end:
|
| 1006 |
+
value: 12
|
| 1007 |
+
profile_step_start:
|
| 1008 |
+
value: 10
|
| 1009 |
+
q_lora_rank:
|
| 1010 |
+
value: null
|
| 1011 |
+
qk_head_dim:
|
| 1012 |
+
value: 128
|
| 1013 |
+
qk_l2_norm:
|
| 1014 |
+
value: false
|
| 1015 |
+
qk_layernorm:
|
| 1016 |
+
value: true
|
| 1017 |
+
qk_pos_emb_head_dim:
|
| 1018 |
+
value: 64
|
| 1019 |
+
query_in_block_prob:
|
| 1020 |
+
value: 0.1
|
| 1021 |
+
rampup_batch_size:
|
| 1022 |
+
value: null
|
| 1023 |
+
rank:
|
| 1024 |
+
value: 7
|
| 1025 |
+
recompute_granularity:
|
| 1026 |
+
value: null
|
| 1027 |
+
recompute_method:
|
| 1028 |
+
value: null
|
| 1029 |
+
recompute_modules:
|
| 1030 |
+
value: null
|
| 1031 |
+
recompute_num_layers:
|
| 1032 |
+
value: null
|
| 1033 |
+
record_memory_history:
|
| 1034 |
+
value: false
|
| 1035 |
+
relative_attention_max_distance:
|
| 1036 |
+
value: 128
|
| 1037 |
+
relative_attention_num_buckets:
|
| 1038 |
+
value: 32
|
| 1039 |
+
replication:
|
| 1040 |
+
value: false
|
| 1041 |
+
replication_factor:
|
| 1042 |
+
value: 2
|
| 1043 |
+
replication_jump:
|
| 1044 |
+
value: null
|
| 1045 |
+
rerun_mode:
|
| 1046 |
+
value: disabled
|
| 1047 |
+
reset_attention_mask:
|
| 1048 |
+
value: false
|
| 1049 |
+
reset_position_ids:
|
| 1050 |
+
value: false
|
| 1051 |
+
result_rejected_tracker_filename:
|
| 1052 |
+
value: null
|
| 1053 |
+
retriever_report_topk_accuracies:
|
| 1054 |
+
value: []
|
| 1055 |
+
retriever_score_scaling:
|
| 1056 |
+
value: false
|
| 1057 |
+
retriever_seq_length:
|
| 1058 |
+
value: 256
|
| 1059 |
+
retro_add_retriever:
|
| 1060 |
+
value: false
|
| 1061 |
+
retro_attention_gate:
|
| 1062 |
+
value: 1
|
| 1063 |
+
retro_cyclic_train_iters:
|
| 1064 |
+
value: null
|
| 1065 |
+
retro_encoder_attention_dropout:
|
| 1066 |
+
value: 0.1
|
| 1067 |
+
retro_encoder_hidden_dropout:
|
| 1068 |
+
value: 0.1
|
| 1069 |
+
retro_encoder_layers:
|
| 1070 |
+
value: 2
|
| 1071 |
+
retro_num_neighbors:
|
| 1072 |
+
value: 2
|
| 1073 |
+
retro_num_retrieved_chunks:
|
| 1074 |
+
value: 2
|
| 1075 |
+
retro_project_dir:
|
| 1076 |
+
value: null
|
| 1077 |
+
retro_verify_neighbor_count:
|
| 1078 |
+
value: true
|
| 1079 |
+
reuse_grad_buf_for_mxfp8_param_ag:
|
| 1080 |
+
value: false
|
| 1081 |
+
rope_scaling_factor:
|
| 1082 |
+
value: 8
|
| 1083 |
+
rotary_base:
|
| 1084 |
+
value: 84000
|
| 1085 |
+
rotary_interleaved:
|
| 1086 |
+
value: false
|
| 1087 |
+
rotary_percent:
|
| 1088 |
+
value: 0.5
|
| 1089 |
+
rotary_scaling_factor:
|
| 1090 |
+
value: 40
|
| 1091 |
+
rotary_seq_len_interpolation_factor:
|
| 1092 |
+
value: null
|
| 1093 |
+
run_workload_inspector_server:
|
| 1094 |
+
value: false
|
| 1095 |
+
sample_rate:
|
| 1096 |
+
value: 1
|
| 1097 |
+
save:
|
| 1098 |
+
value: poziomka_5
|
| 1099 |
+
save_interval:
|
| 1100 |
+
value: 1600
|
| 1101 |
+
scatter_gather_tensors_in_pipeline:
|
| 1102 |
+
value: true
|
| 1103 |
+
seed:
|
| 1104 |
+
value: 50
|
| 1105 |
+
seq_length:
|
| 1106 |
+
value: 8192
|
| 1107 |
+
sequence_parallel:
|
| 1108 |
+
value: true
|
| 1109 |
+
sft:
|
| 1110 |
+
value: false
|
| 1111 |
+
sft_tokenizer_prompt_format:
|
| 1112 |
+
value: nemotron-h-aligned
|
| 1113 |
+
sgd_momentum:
|
| 1114 |
+
value: 0.9
|
| 1115 |
+
short_seq_prob:
|
| 1116 |
+
value: 0.1
|
| 1117 |
+
skip_casting_dtype_for_param_pattern:
|
| 1118 |
+
value: '["^expert_bias$|.+\.expert_bias$"]'
|
| 1119 |
+
skip_train:
|
| 1120 |
+
value: false
|
| 1121 |
+
skipped_train_samples:
|
| 1122 |
+
value: 0
|
| 1123 |
+
spec:
|
| 1124 |
+
value: null
|
| 1125 |
+
split:
|
| 1126 |
+
value: 9999,1,0
|
| 1127 |
+
squared_relu:
|
| 1128 |
+
value: false
|
| 1129 |
+
start_weight_decay:
|
| 1130 |
+
value: 0.1
|
| 1131 |
+
straggler_ctrlr_port:
|
| 1132 |
+
value: 65535
|
| 1133 |
+
straggler_minmax_count:
|
| 1134 |
+
value: 1
|
| 1135 |
+
suggested_communication_unit_size:
|
| 1136 |
+
value: null
|
| 1137 |
+
swiglu:
|
| 1138 |
+
value: true
|
| 1139 |
+
swin_backbone_type:
|
| 1140 |
+
value: tiny
|
| 1141 |
+
symmetric_ar_type:
|
| 1142 |
+
value: null
|
| 1143 |
+
te_rng_tracker:
|
| 1144 |
+
value: false
|
| 1145 |
+
tensor_model_parallel_size:
|
| 1146 |
+
value: 4
|
| 1147 |
+
tensorboard_dir:
|
| 1148 |
+
value: poziomka_5/runs
|
| 1149 |
+
tensorboard_log_interval:
|
| 1150 |
+
value: 1
|
| 1151 |
+
tensorboard_queue_size:
|
| 1152 |
+
value: 1000
|
| 1153 |
+
test_data_path:
|
| 1154 |
+
value: null
|
| 1155 |
+
test_mode:
|
| 1156 |
+
value: false
|
| 1157 |
+
tiktoken_num_special_tokens:
|
| 1158 |
+
value: 1000
|
| 1159 |
+
tiktoken_pattern:
|
| 1160 |
+
value: null
|
| 1161 |
+
tiktoken_special_tokens:
|
| 1162 |
+
value: null
|
| 1163 |
+
timing_log_level:
|
| 1164 |
+
value: 0
|
| 1165 |
+
timing_log_option:
|
| 1166 |
+
value: minmax
|
| 1167 |
+
titles_data_path:
|
| 1168 |
+
value: null
|
| 1169 |
+
tokenizer_model:
|
| 1170 |
+
value: /home/ubuntu/training/Ling-V2/examples/pretrain/../../resource/tokenizer/apt4
|
| 1171 |
+
tokenizer_type:
|
| 1172 |
+
value: HuggingFaceTokenizer
|
| 1173 |
+
torch_fsdp2_reshard_after_forward:
|
| 1174 |
+
value: true
|
| 1175 |
+
tp_comm_bootstrap_backend:
|
| 1176 |
+
value: nccl
|
| 1177 |
+
tp_comm_bulk_dgrad:
|
| 1178 |
+
value: true
|
| 1179 |
+
tp_comm_bulk_wgrad:
|
| 1180 |
+
value: true
|
| 1181 |
+
tp_comm_overlap:
|
| 1182 |
+
value: false
|
| 1183 |
+
tp_comm_overlap_ag:
|
| 1184 |
+
value: true
|
| 1185 |
+
tp_comm_overlap_cfg:
|
| 1186 |
+
value: null
|
| 1187 |
+
tp_comm_overlap_rs:
|
| 1188 |
+
value: true
|
| 1189 |
+
tp_comm_overlap_rs_dgrad:
|
| 1190 |
+
value: false
|
| 1191 |
+
tp_comm_split_ag:
|
| 1192 |
+
value: true
|
| 1193 |
+
tp_comm_split_rs:
|
| 1194 |
+
value: true
|
| 1195 |
+
train_data_path:
|
| 1196 |
+
value: null
|
| 1197 |
+
train_iters:
|
| 1198 |
+
value: 50000
|
| 1199 |
+
train_samples:
|
| 1200 |
+
value: null
|
| 1201 |
+
train_sync_interval:
|
| 1202 |
+
value: null
|
| 1203 |
+
transformer_impl:
|
| 1204 |
+
value: transformer_engine
|
| 1205 |
+
transformer_pipeline_model_parallel_size:
|
| 1206 |
+
value: 1
|
| 1207 |
+
untie_embeddings_and_output_weights:
|
| 1208 |
+
value: true
|
| 1209 |
+
use_checkpoint_args:
|
| 1210 |
+
value: false
|
| 1211 |
+
use_checkpoint_opt_param_scheduler:
|
| 1212 |
+
value: false
|
| 1213 |
+
use_cpu_initialization:
|
| 1214 |
+
value: null
|
| 1215 |
+
use_custom_fsdp:
|
| 1216 |
+
value: false
|
| 1217 |
+
use_dist_ckpt:
|
| 1218 |
+
value: true
|
| 1219 |
+
use_dist_ckpt_deprecated:
|
| 1220 |
+
value: false
|
| 1221 |
+
use_distributed_optimizer:
|
| 1222 |
+
value: false
|
| 1223 |
+
use_flash_attn:
|
| 1224 |
+
value: true
|
| 1225 |
+
use_legacy_models:
|
| 1226 |
+
value: false
|
| 1227 |
+
use_mp_args_from_checkpoint_args:
|
| 1228 |
+
value: false
|
| 1229 |
+
use_one_sent_docs:
|
| 1230 |
+
value: false
|
| 1231 |
+
use_persistent_ckpt_worker:
|
| 1232 |
+
value: false
|
| 1233 |
+
use_precision_aware_optimizer:
|
| 1234 |
+
value: false
|
| 1235 |
+
use_pytorch_profiler:
|
| 1236 |
+
value: false
|
| 1237 |
+
use_ring_exchange_p2p:
|
| 1238 |
+
value: false
|
| 1239 |
+
use_rope_scaling:
|
| 1240 |
+
value: false
|
| 1241 |
+
use_rotary_position_embeddings:
|
| 1242 |
+
value: false
|
| 1243 |
+
use_sharp:
|
| 1244 |
+
value: false
|
| 1245 |
+
use_tokenizer_model_from_checkpoint_args:
|
| 1246 |
+
value: true
|
| 1247 |
+
use_torch_fsdp2:
|
| 1248 |
+
value: false
|
| 1249 |
+
use_torch_optimizer_for_cpu_offload:
|
| 1250 |
+
value: false
|
| 1251 |
+
use_tp_pp_dp_mapping:
|
| 1252 |
+
value: false
|
| 1253 |
+
v_head_dim:
|
| 1254 |
+
value: 128
|
| 1255 |
+
valid_data_path:
|
| 1256 |
+
value: null
|
| 1257 |
+
variable_seq_lengths:
|
| 1258 |
+
value: false
|
| 1259 |
+
virtual_pipeline_model_parallel_size:
|
| 1260 |
+
value: null
|
| 1261 |
+
vision_backbone_type:
|
| 1262 |
+
value: vit
|
| 1263 |
+
vision_pretraining:
|
| 1264 |
+
value: false
|
| 1265 |
+
vision_pretraining_type:
|
| 1266 |
+
value: classify
|
| 1267 |
+
vocab_extra_ids:
|
| 1268 |
+
value: 0
|
| 1269 |
+
vocab_file:
|
| 1270 |
+
value: null
|
| 1271 |
+
vocab_size:
|
| 1272 |
+
value: 32000
|
| 1273 |
+
wandb_exp_name:
|
| 1274 |
+
value: poziomka_5
|
| 1275 |
+
wandb_project:
|
| 1276 |
+
value: poziomka
|
| 1277 |
+
wandb_save_dir:
|
| 1278 |
+
value: ""
|
| 1279 |
+
weight_decay:
|
| 1280 |
+
value: 0.1
|
| 1281 |
+
weight_decay_incr_style:
|
| 1282 |
+
value: constant
|
| 1283 |
+
wgrad_deferral_limit:
|
| 1284 |
+
value: 0
|
| 1285 |
+
world_size:
|
| 1286 |
+
value: 8
|
| 1287 |
+
yaml_cfg:
|
| 1288 |
+
value: null
|
wandb/wandb/run-20250922_220405-hrldy3bw/files/output.log
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:184173ebbe892d0082a25f8f9c003dff5ca6df791cee3e7dfa216517dc2a9dbd
|
| 3 |
+
size 15656694
|
wandb/wandb/run-20250922_220405-hrldy3bw/files/requirements.txt
ADDED
|
@@ -0,0 +1,173 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
GitPython==3.1.45
|
| 2 |
+
psutils==3.3.11
|
| 3 |
+
networkx==3.3
|
| 4 |
+
dill==0.4.0
|
| 5 |
+
requests==2.32.5
|
| 6 |
+
nvidia-cusparselt-cu12==0.7.1
|
| 7 |
+
ml_dtypes==0.5.3
|
| 8 |
+
pyarrow==21.0.0
|
| 9 |
+
gitdb==4.0.12
|
| 10 |
+
packaging==24.2
|
| 11 |
+
pydantic_core==2.33.2
|
| 12 |
+
torchvision==0.23.0+cu129
|
| 13 |
+
mpmath==1.3.0
|
| 14 |
+
nvidia-cusolver-cu12==11.7.5.82
|
| 15 |
+
nvidia-cuda-runtime-cu12==12.9.79
|
| 16 |
+
propcache==0.3.2
|
| 17 |
+
psutil==7.1.0
|
| 18 |
+
onnx-ir==0.1.9
|
| 19 |
+
nvidia-cusparse-cu12==12.5.10.65
|
| 20 |
+
aiohttp==3.12.15
|
| 21 |
+
aiosignal==1.4.0
|
| 22 |
+
protobuf==6.32.1
|
| 23 |
+
apex==0.1
|
| 24 |
+
torch==2.8.0+cu129
|
| 25 |
+
nvidia-cublas-cu12==12.9.1.4
|
| 26 |
+
frozenlist==1.7.0
|
| 27 |
+
nvidia-cufile-cu12==1.14.1.1
|
| 28 |
+
onnxscript==0.3.1
|
| 29 |
+
smmap==5.0.2
|
| 30 |
+
nvidia-cuda-nvrtc-cu12==12.9.86
|
| 31 |
+
pandas==2.3.2
|
| 32 |
+
platformdirs==4.4.0
|
| 33 |
+
nvidia-nvjitlink-cu12==12.9.86
|
| 34 |
+
pypdf==6.1.0
|
| 35 |
+
puremagic==1.30
|
| 36 |
+
regex==2025.9.18
|
| 37 |
+
triton==3.4.0
|
| 38 |
+
pip==25.2
|
| 39 |
+
pydantic==2.11.9
|
| 40 |
+
charset-normalizer==3.4.3
|
| 41 |
+
nvidia-cufft-cu12==11.4.1.4
|
| 42 |
+
urllib3==2.5.0
|
| 43 |
+
nvidia-cudnn-cu12==9.10.2.21
|
| 44 |
+
tzdata==2025.2
|
| 45 |
+
wandb==0.22.0
|
| 46 |
+
datasets==4.1.1
|
| 47 |
+
huggingface-hub==0.35.0
|
| 48 |
+
transformers==4.56.2
|
| 49 |
+
tqdm==4.67.1
|
| 50 |
+
megatron-core==0.13.0
|
| 51 |
+
tiktoken==0.11.0
|
| 52 |
+
hf_transfer==0.1.9
|
| 53 |
+
multiprocess==0.70.16
|
| 54 |
+
python-dateutil==2.9.0.post0
|
| 55 |
+
multidict==6.6.4
|
| 56 |
+
sentry-sdk==2.38.0
|
| 57 |
+
aiohappyeyeballs==2.6.1
|
| 58 |
+
onnx==1.19.0
|
| 59 |
+
einops==0.8.1
|
| 60 |
+
sympy==1.13.3
|
| 61 |
+
setuptools==80.9.0
|
| 62 |
+
pillow==11.0.0
|
| 63 |
+
filelock==3.19.1
|
| 64 |
+
hf-xet==1.1.10
|
| 65 |
+
flash_attn_3==3.0.0b1
|
| 66 |
+
ninja==1.13.0
|
| 67 |
+
fsspec==2025.9.0
|
| 68 |
+
nvidia-curand-cu12==10.3.10.19
|
| 69 |
+
bitsandbytes==0.47.0
|
| 70 |
+
nvidia-nccl-cu12==2.27.3
|
| 71 |
+
typing-inspection==0.4.1
|
| 72 |
+
xxhash==3.5.0
|
| 73 |
+
numpy==1.26.4
|
| 74 |
+
tokenizers==0.22.1
|
| 75 |
+
typing_extensions==4.15.0
|
| 76 |
+
safetensors==0.6.2
|
| 77 |
+
annotated-types==0.7.0
|
| 78 |
+
transformer_engine==2.6.0.post1
|
| 79 |
+
nvidia-nvtx-cu12==12.9.79
|
| 80 |
+
async-timeout==5.0.1
|
| 81 |
+
transformer_engine_cu12==2.6.0.post1
|
| 82 |
+
transformer_engine_torch==2.6.0.post1
|
| 83 |
+
nvidia-cuda-cupti-cu12==12.9.79
|
| 84 |
+
wheel==0.45.1
|
| 85 |
+
yarl==1.20.1
|
| 86 |
+
pybind11==3.0.1
|
| 87 |
+
python-debian==0.1.43+ubuntu1.1
|
| 88 |
+
SecretStorage==3.3.1
|
| 89 |
+
lazr.restfulclient==0.14.4
|
| 90 |
+
pytz==2022.1
|
| 91 |
+
attrs==21.2.0
|
| 92 |
+
zope.interface==5.4.0
|
| 93 |
+
chardet==4.0.0
|
| 94 |
+
pyasn1-modules==0.2.1
|
| 95 |
+
setuptools==59.6.0
|
| 96 |
+
Jinja2==3.0.3
|
| 97 |
+
pyasn1==0.4.8
|
| 98 |
+
netifaces==0.11.0
|
| 99 |
+
ubuntu-drivers-common==0.0.0
|
| 100 |
+
click==8.0.3
|
| 101 |
+
dbus-python==1.2.18
|
| 102 |
+
pyserial==3.5
|
| 103 |
+
python-apt==2.4.0+ubuntu4
|
| 104 |
+
PyJWT==2.3.0
|
| 105 |
+
oauthlib==3.2.0
|
| 106 |
+
bcrypt==3.2.0
|
| 107 |
+
python-magic==0.4.24
|
| 108 |
+
xkit==0.0.0
|
| 109 |
+
constantly==15.1.0
|
| 110 |
+
blinker==1.4
|
| 111 |
+
PyYAML==5.4.1
|
| 112 |
+
distro-info==1.1+ubuntu0.2
|
| 113 |
+
lazr.uri==1.0.6
|
| 114 |
+
distro==1.7.0
|
| 115 |
+
pexpect==4.8.0
|
| 116 |
+
PyGObject==3.42.1
|
| 117 |
+
ssh-import-id==5.11
|
| 118 |
+
cryptography==3.4.8
|
| 119 |
+
certifi==2020.6.20
|
| 120 |
+
service-identity==18.1.0
|
| 121 |
+
cloud-init==25.1.2
|
| 122 |
+
keyring==23.5.0
|
| 123 |
+
jeepney==0.7.1
|
| 124 |
+
colorama==0.4.4
|
| 125 |
+
idna==3.3
|
| 126 |
+
MarkupSafe==2.0.1
|
| 127 |
+
pip==22.0.2
|
| 128 |
+
ptyprocess==0.7.0
|
| 129 |
+
configobj==5.0.6
|
| 130 |
+
hyperlink==21.0.0
|
| 131 |
+
pyparsing==2.4.7
|
| 132 |
+
ufw==0.36.1
|
| 133 |
+
pyrsistent==0.18.1
|
| 134 |
+
httplib2==0.20.2
|
| 135 |
+
sos==4.8.2
|
| 136 |
+
unattended-upgrades==0.1
|
| 137 |
+
requests==2.25.1
|
| 138 |
+
ubuntu-pro-client==8001
|
| 139 |
+
launchpadlib==1.10.16
|
| 140 |
+
six==1.16.0
|
| 141 |
+
urllib3==1.26.5
|
| 142 |
+
systemd-python==234
|
| 143 |
+
importlib-metadata==4.6.4
|
| 144 |
+
command-not-found==0.3
|
| 145 |
+
jsonschema==3.2.0
|
| 146 |
+
Automat==20.2.0
|
| 147 |
+
more-itertools==8.10.0
|
| 148 |
+
PyHamcrest==2.0.2
|
| 149 |
+
incremental==21.3.0
|
| 150 |
+
zipp==1.0.0
|
| 151 |
+
jsonpointer==2.0
|
| 152 |
+
Twisted==22.1.0
|
| 153 |
+
pyOpenSSL==21.0.0
|
| 154 |
+
wadllib==1.3.6
|
| 155 |
+
Babel==2.8.0
|
| 156 |
+
jsonpatch==1.32
|
| 157 |
+
wheel==0.37.1
|
| 158 |
+
platformdirs==4.2.2
|
| 159 |
+
typing_extensions==4.12.2
|
| 160 |
+
packaging==24.2
|
| 161 |
+
tomli==2.0.1
|
| 162 |
+
inflect==7.3.1
|
| 163 |
+
jaraco.context==5.3.0
|
| 164 |
+
backports.tarfile==1.2.0
|
| 165 |
+
autocommand==2.2.2
|
| 166 |
+
importlib_metadata==8.0.0
|
| 167 |
+
more-itertools==10.3.0
|
| 168 |
+
jaraco.functools==4.0.1
|
| 169 |
+
typeguard==4.3.0
|
| 170 |
+
zipp==3.19.2
|
| 171 |
+
jaraco.collections==5.1.0
|
| 172 |
+
jaraco.text==3.12.1
|
| 173 |
+
wheel==0.45.1
|
wandb/wandb/run-20250922_220405-hrldy3bw/files/wandb-metadata.json
ADDED
|
@@ -0,0 +1,248 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"os": "Linux-5.15.0-143-generic-x86_64-with-glibc2.35",
|
| 3 |
+
"python": "CPython 3.10.12",
|
| 4 |
+
"startedAt": "2025-09-22T22:04:05.928052Z",
|
| 5 |
+
"args": [
|
| 6 |
+
"--expert-model-parallel-size",
|
| 7 |
+
"2",
|
| 8 |
+
"--expert-tensor-parallel-size",
|
| 9 |
+
"1",
|
| 10 |
+
"--moe-grouped-gemm",
|
| 11 |
+
"--moe-token-dispatcher-type",
|
| 12 |
+
"alltoall",
|
| 13 |
+
"--moe-router-dtype",
|
| 14 |
+
"fp32",
|
| 15 |
+
"--num-experts",
|
| 16 |
+
"128",
|
| 17 |
+
"--moe-ffn-hidden-size",
|
| 18 |
+
"320",
|
| 19 |
+
"--moe-shared-expert-intermediate-size",
|
| 20 |
+
"320",
|
| 21 |
+
"--moe-router-score-function",
|
| 22 |
+
"sigmoid",
|
| 23 |
+
"--moe-router-topk",
|
| 24 |
+
"4",
|
| 25 |
+
"--moe-router-enable-expert-bias",
|
| 26 |
+
"--moe-router-topk-scaling-factor",
|
| 27 |
+
"2.5",
|
| 28 |
+
"--moe-router-num-groups",
|
| 29 |
+
"8",
|
| 30 |
+
"--moe-router-group-topk",
|
| 31 |
+
"2",
|
| 32 |
+
"--moe-z-loss-coeff",
|
| 33 |
+
"0.0000035",
|
| 34 |
+
"--moe-router-bias-update-rate",
|
| 35 |
+
"1e-3",
|
| 36 |
+
"--moe-layer-freq",
|
| 37 |
+
"[0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]",
|
| 38 |
+
"--bias-zero-mean-update",
|
| 39 |
+
"--moe-expert-capacity-factor",
|
| 40 |
+
"1.25",
|
| 41 |
+
"--moe-pad-expert-input-to-capacity",
|
| 42 |
+
"--moe-shared-expert-overlap",
|
| 43 |
+
"--num-layers",
|
| 44 |
+
"16",
|
| 45 |
+
"--hidden-size",
|
| 46 |
+
"2048",
|
| 47 |
+
"--ffn-hidden-size",
|
| 48 |
+
"2048",
|
| 49 |
+
"--num-attention-heads",
|
| 50 |
+
"16",
|
| 51 |
+
"--num-query-groups",
|
| 52 |
+
"4",
|
| 53 |
+
"--group-query-attention",
|
| 54 |
+
"--qk-layernorm",
|
| 55 |
+
"--use-flash-attn",
|
| 56 |
+
"--max-position-embeddings",
|
| 57 |
+
"8192",
|
| 58 |
+
"--vocab-size",
|
| 59 |
+
"32000",
|
| 60 |
+
"--make-vocab-size-divisible-by",
|
| 61 |
+
"128",
|
| 62 |
+
"--position-embedding-type",
|
| 63 |
+
"rope",
|
| 64 |
+
"--rotary-base",
|
| 65 |
+
"84000",
|
| 66 |
+
"--rotary-percent",
|
| 67 |
+
"0.5",
|
| 68 |
+
"--rotary-scaling-factor",
|
| 69 |
+
"40",
|
| 70 |
+
"--swiglu",
|
| 71 |
+
"--untie-embeddings-and-output-weights",
|
| 72 |
+
"--normalization",
|
| 73 |
+
"RMSNorm",
|
| 74 |
+
"--norm-epsilon",
|
| 75 |
+
"1e-06",
|
| 76 |
+
"--disable-bias-linear",
|
| 77 |
+
"--transformer-impl",
|
| 78 |
+
"transformer_engine",
|
| 79 |
+
"--attention-dropout",
|
| 80 |
+
"0",
|
| 81 |
+
"--hidden-dropout",
|
| 82 |
+
"0",
|
| 83 |
+
"--micro-batch-size",
|
| 84 |
+
"8",
|
| 85 |
+
"--global-batch-size",
|
| 86 |
+
"256",
|
| 87 |
+
"--seq-length",
|
| 88 |
+
"8192",
|
| 89 |
+
"--train-iters",
|
| 90 |
+
"50000",
|
| 91 |
+
"--weight-decay",
|
| 92 |
+
"0.1",
|
| 93 |
+
"--adam-beta1",
|
| 94 |
+
"0.9",
|
| 95 |
+
"--adam-beta2",
|
| 96 |
+
"0.95",
|
| 97 |
+
"--init-method-std",
|
| 98 |
+
"0.02",
|
| 99 |
+
"--clip-grad",
|
| 100 |
+
"1.0",
|
| 101 |
+
"--bf16",
|
| 102 |
+
"--optimizer",
|
| 103 |
+
"adam",
|
| 104 |
+
"--lr",
|
| 105 |
+
"8.0e-4",
|
| 106 |
+
"--lr-decay-style",
|
| 107 |
+
"cosine",
|
| 108 |
+
"--min-lr",
|
| 109 |
+
"4.00e-5",
|
| 110 |
+
"--lr-warmup-iters",
|
| 111 |
+
"100",
|
| 112 |
+
"--seed",
|
| 113 |
+
"50",
|
| 114 |
+
"--pipeline-model-parallel-size",
|
| 115 |
+
"1",
|
| 116 |
+
"--tensor-model-parallel-size",
|
| 117 |
+
"4",
|
| 118 |
+
"--sequence-parallel",
|
| 119 |
+
"--overlap-grad-reduce",
|
| 120 |
+
"--data-path",
|
| 121 |
+
"szypulka_tokenized_apt4_merged/apt4_merged_text_document",
|
| 122 |
+
"--tokenizer-type",
|
| 123 |
+
"HuggingFaceTokenizer",
|
| 124 |
+
"--tokenizer-model",
|
| 125 |
+
"/home/ubuntu/training/Ling-V2/examples/pretrain/../../resource/tokenizer/apt4",
|
| 126 |
+
"--split",
|
| 127 |
+
"9999,1,0",
|
| 128 |
+
"--dataloader-type",
|
| 129 |
+
"single",
|
| 130 |
+
"--no-create-attention-mask-in-dataloader",
|
| 131 |
+
"--eod-mask-loss",
|
| 132 |
+
"--save-interval",
|
| 133 |
+
"1600",
|
| 134 |
+
"--eval-interval",
|
| 135 |
+
"1600",
|
| 136 |
+
"--eval-iters",
|
| 137 |
+
"2",
|
| 138 |
+
"--save",
|
| 139 |
+
"poziomka_5",
|
| 140 |
+
"--ckpt-format",
|
| 141 |
+
"torch_dist",
|
| 142 |
+
"--async-save",
|
| 143 |
+
"--log-interval",
|
| 144 |
+
"1",
|
| 145 |
+
"--log-throughput",
|
| 146 |
+
"--tensorboard-dir",
|
| 147 |
+
"poziomka_5/runs",
|
| 148 |
+
"--log-timers-to-tensorboard",
|
| 149 |
+
"--log-memory-to-tensorboard",
|
| 150 |
+
"--log-world-size-to-tensorboard",
|
| 151 |
+
"--log-validation-ppl-to-tensorboard",
|
| 152 |
+
"--wandb-project",
|
| 153 |
+
"poziomka",
|
| 154 |
+
"--wandb-exp-name",
|
| 155 |
+
"poziomka_5",
|
| 156 |
+
"--attention-backend",
|
| 157 |
+
"flash",
|
| 158 |
+
"--no-masked-softmax-fusion",
|
| 159 |
+
"--attention-softmax-in-fp32",
|
| 160 |
+
"--cross-entropy-loss-fusion",
|
| 161 |
+
"--mtp-num-layers",
|
| 162 |
+
"0"
|
| 163 |
+
],
|
| 164 |
+
"program": "/home/ubuntu/training/Ling-V2/Megatron-LM-core_v0.13.0/pretrain_gpt.py",
|
| 165 |
+
"codePath": "Megatron-LM-core_v0.13.0/pretrain_gpt.py",
|
| 166 |
+
"codePathLocal": "Megatron-LM-core_v0.13.0/pretrain_gpt.py",
|
| 167 |
+
"git": {
|
| 168 |
+
"remote": "https://github.com/adamo1139/Ling-V2.git",
|
| 169 |
+
"commit": "e3867293ebf444f614164c2b84180cd75e7de07c"
|
| 170 |
+
},
|
| 171 |
+
"email": "adamo1139@gmail.com",
|
| 172 |
+
"root": "poziomka_5/wandb",
|
| 173 |
+
"host": "megatron6",
|
| 174 |
+
"executable": "/usr/bin/python3",
|
| 175 |
+
"cpu_count": 128,
|
| 176 |
+
"cpu_count_logical": 128,
|
| 177 |
+
"gpu": "NVIDIA H100 80GB HBM3",
|
| 178 |
+
"gpu_count": 8,
|
| 179 |
+
"disk": {
|
| 180 |
+
"/": {
|
| 181 |
+
"total": "2907329073152",
|
| 182 |
+
"used": "390033698816"
|
| 183 |
+
}
|
| 184 |
+
},
|
| 185 |
+
"memory": {
|
| 186 |
+
"total": "1014522519552"
|
| 187 |
+
},
|
| 188 |
+
"gpu_nvidia": [
|
| 189 |
+
{
|
| 190 |
+
"name": "NVIDIA H100 80GB HBM3",
|
| 191 |
+
"memoryTotal": "85520809984",
|
| 192 |
+
"cudaCores": 16896,
|
| 193 |
+
"architecture": "Hopper",
|
| 194 |
+
"uuid": "GPU-b4eb54a1-d73f-9179-04f2-b231b6a39a34"
|
| 195 |
+
},
|
| 196 |
+
{
|
| 197 |
+
"name": "NVIDIA H100 80GB HBM3",
|
| 198 |
+
"memoryTotal": "85520809984",
|
| 199 |
+
"cudaCores": 16896,
|
| 200 |
+
"architecture": "Hopper",
|
| 201 |
+
"uuid": "GPU-742b8534-6865-3da0-d864-a822a5d5d629"
|
| 202 |
+
},
|
| 203 |
+
{
|
| 204 |
+
"name": "NVIDIA H100 80GB HBM3",
|
| 205 |
+
"memoryTotal": "85520809984",
|
| 206 |
+
"cudaCores": 16896,
|
| 207 |
+
"architecture": "Hopper",
|
| 208 |
+
"uuid": "GPU-19d31f38-1be1-eced-5b78-b0d3f4deae56"
|
| 209 |
+
},
|
| 210 |
+
{
|
| 211 |
+
"name": "NVIDIA H100 80GB HBM3",
|
| 212 |
+
"memoryTotal": "85520809984",
|
| 213 |
+
"cudaCores": 16896,
|
| 214 |
+
"architecture": "Hopper",
|
| 215 |
+
"uuid": "GPU-1b41c967-636a-e3f1-5d74-2da616c06a3e"
|
| 216 |
+
},
|
| 217 |
+
{
|
| 218 |
+
"name": "NVIDIA H100 80GB HBM3",
|
| 219 |
+
"memoryTotal": "85520809984",
|
| 220 |
+
"cudaCores": 16896,
|
| 221 |
+
"architecture": "Hopper",
|
| 222 |
+
"uuid": "GPU-51d2a37c-3157-7b61-73b3-4a7914884549"
|
| 223 |
+
},
|
| 224 |
+
{
|
| 225 |
+
"name": "NVIDIA H100 80GB HBM3",
|
| 226 |
+
"memoryTotal": "85520809984",
|
| 227 |
+
"cudaCores": 16896,
|
| 228 |
+
"architecture": "Hopper",
|
| 229 |
+
"uuid": "GPU-499c302d-fc3c-c679-61e7-a2c4d46b8449"
|
| 230 |
+
},
|
| 231 |
+
{
|
| 232 |
+
"name": "NVIDIA H100 80GB HBM3",
|
| 233 |
+
"memoryTotal": "85520809984",
|
| 234 |
+
"cudaCores": 16896,
|
| 235 |
+
"architecture": "Hopper",
|
| 236 |
+
"uuid": "GPU-8eac440c-b327-20e6-1809-ef3e549d6c6d"
|
| 237 |
+
},
|
| 238 |
+
{
|
| 239 |
+
"name": "NVIDIA H100 80GB HBM3",
|
| 240 |
+
"memoryTotal": "85520809984",
|
| 241 |
+
"cudaCores": 16896,
|
| 242 |
+
"architecture": "Hopper",
|
| 243 |
+
"uuid": "GPU-5eba09b7-f1fc-555e-d07c-7b6227264759"
|
| 244 |
+
}
|
| 245 |
+
],
|
| 246 |
+
"cudaVersion": "12.8",
|
| 247 |
+
"writerId": "9vte3cwjfuxykvlnatinaorhdm7hrpxl"
|
| 248 |
+
}
|
wandb/wandb/run-20250922_220405-hrldy3bw/files/wandb-summary.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"_timestamp":1.7588892349026182e+09,"throughput":142.40106310791458,"_wandb":{"runtime":310590},"_runtime":310590.549572079,"_step":43729,"iteration-time":7.31538462638855}
|
wandb/wandb/run-20250922_220405-hrldy3bw/logs/debug-core.log
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"time":"2025-09-22T22:04:05.949917481Z","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmpvwq5pw5s/port-40865.txt","pid":40865,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false}
|
| 2 |
+
{"time":"2025-09-22T22:04:05.951064803Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":40865}
|
| 3 |
+
{"time":"2025-09-22T22:04:05.951060304Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/tmp/wandb-40865-41784-466403699/socket","Net":"unix"}}
|
| 4 |
+
{"time":"2025-09-22T22:04:06.13426601Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"}
|
| 5 |
+
{"time":"2025-09-22T22:04:06.143173217Z","level":"INFO","msg":"handleInformInit: received","streamId":"hrldy3bw","id":"1(@)"}
|
| 6 |
+
{"time":"2025-09-22T22:04:06.487650443Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"hrldy3bw","id":"1(@)"}
|
| 7 |
+
{"time":"2025-09-26T12:20:37.296353193Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"}
|
| 8 |
+
{"time":"2025-09-26T12:20:37.296955718Z","level":"INFO","msg":"connection: closing","id":"1(@)"}
|
| 9 |
+
{"time":"2025-09-26T12:20:37.297053401Z","level":"INFO","msg":"connection: closed successfully","id":"1(@)"}
|
| 10 |
+
{"time":"2025-09-26T12:20:37.297072505Z","level":"INFO","msg":"server is shutting down"}
|
| 11 |
+
{"time":"2025-09-26T12:20:37.297624126Z","level":"INFO","msg":"server: listener closed","addr":{"Name":"/tmp/wandb-40865-41784-466403699/socket","Net":"unix"}}
|
wandb/wandb/run-20250922_220405-hrldy3bw/logs/debug-internal.log
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"time":"2025-09-22T22:04:06.143255097Z","level":"INFO","msg":"stream: starting","core version":"0.22.0"}
|
| 2 |
+
{"time":"2025-09-22T22:04:06.487372274Z","level":"INFO","msg":"stream: created new stream","id":"hrldy3bw"}
|
| 3 |
+
{"time":"2025-09-22T22:04:06.487645224Z","level":"INFO","msg":"stream: started","id":"hrldy3bw"}
|
| 4 |
+
{"time":"2025-09-22T22:04:06.487690464Z","level":"INFO","msg":"sender: started","stream_id":"hrldy3bw"}
|
| 5 |
+
{"time":"2025-09-22T22:04:06.487691275Z","level":"INFO","msg":"writer: started","stream_id":"hrldy3bw"}
|
| 6 |
+
{"time":"2025-09-22T22:04:06.487752261Z","level":"INFO","msg":"handler: started","stream_id":"hrldy3bw"}
|
| 7 |
+
{"time":"2025-09-23T20:24:34.768930029Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
|
| 8 |
+
{"time":"2025-09-24T15:33:04.643961764Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
|
| 9 |
+
{"time":"2025-09-24T15:48:05.092383968Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
|
| 10 |
+
{"time":"2025-09-24T16:41:14.895690245Z","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/graphql","body":"\n<html><head>\n<meta http-equiv=\"content-type\" content=\"text/html;charset=utf-8\">\n<title>502 Server Error</title>\n</head>\n<body text=#000000 bgcolor=#ffffff>\n<h1>Error: Server Error</h1>\n<h2>The server encountered a temporary error and could not complete your request.<p>Please try again in 30 seconds.</h2>\n<h2></h2>\n</body></html>\n"}
|
| 11 |
+
{"time":"2025-09-25T16:26:04.894084919Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded (Client.Timeout exceeded while awaiting headers)"}
|
| 12 |
+
{"time":"2025-09-25T17:26:05.511033911Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded (Client.Timeout exceeded while awaiting headers)"}
|
| 13 |
+
{"time":"2025-09-25T22:42:38.263384097Z","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/adamo1139-no/poziomka/hrldy3bw/file_stream","body":"\n<html><head>\n<meta http-equiv=\"content-type\" content=\"text/html;charset=utf-8\">\n<title>502 Server Error</title>\n</head>\n<body text=#000000 bgcolor=#ffffff>\n<h1>Error: Server Error</h1>\n<h2>The server encountered a temporary error and could not complete your request.<p>Please try again in 30 seconds.</h2>\n<h2></h2>\n</body></html>\n"}
|
| 14 |
+
{"time":"2025-09-25T23:02:46.550701182Z","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/adamo1139-no/poziomka/hrldy3bw/file_stream","body":"\n<html><head>\n<meta http-equiv=\"content-type\" content=\"text/html;charset=utf-8\">\n<title>502 Server Error</title>\n</head>\n<body text=#000000 bgcolor=#ffffff>\n<h1>Error: Server Error</h1>\n<h2>The server encountered a temporary error and could not complete your request.<p>Please try again in 30 seconds.</h2>\n<h2></h2>\n</body></html>\n"}
|
| 15 |
+
{"time":"2025-09-26T12:20:37.296973248Z","level":"INFO","msg":"stream: closing","id":"hrldy3bw"}
|
| 16 |
+
{"time":"2025-09-26T12:20:38.848817943Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
|
| 17 |
+
{"time":"2025-09-26T12:20:39.03451843Z","level":"INFO","msg":"handler: closed","stream_id":"hrldy3bw"}
|
| 18 |
+
{"time":"2025-09-26T12:20:39.034655715Z","level":"INFO","msg":"sender: closed","stream_id":"hrldy3bw"}
|
| 19 |
+
{"time":"2025-09-26T12:20:39.034688219Z","level":"INFO","msg":"stream: closed","id":"hrldy3bw"}
|
wandb/wandb/run-20250922_220405-hrldy3bw/logs/debug.log
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2025-09-22 22:04:05,929 INFO MainThread:40865 [wandb_setup.py:_flush():81] Current SDK version is 0.22.0
|
| 2 |
+
2025-09-22 22:04:05,929 INFO MainThread:40865 [wandb_setup.py:_flush():81] Configure stats pid to 40865
|
| 3 |
+
2025-09-22 22:04:05,929 INFO MainThread:40865 [wandb_setup.py:_flush():81] Loading settings from /root/.config/wandb/settings
|
| 4 |
+
2025-09-22 22:04:05,929 INFO MainThread:40865 [wandb_setup.py:_flush():81] Loading settings from /home/ubuntu/training/Ling-V2/wandb/settings
|
| 5 |
+
2025-09-22 22:04:05,929 INFO MainThread:40865 [wandb_setup.py:_flush():81] Loading settings from environment variables
|
| 6 |
+
2025-09-22 22:04:05,929 INFO MainThread:40865 [wandb_init.py:setup_run_log_directory():686] Logging user logs to poziomka_5/wandb/wandb/run-20250922_220405-hrldy3bw/logs/debug.log
|
| 7 |
+
2025-09-22 22:04:05,929 INFO MainThread:40865 [wandb_init.py:setup_run_log_directory():687] Logging internal logs to poziomka_5/wandb/wandb/run-20250922_220405-hrldy3bw/logs/debug-internal.log
|
| 8 |
+
2025-09-22 22:04:05,929 INFO MainThread:40865 [wandb_init.py:init():813] calling init triggers
|
| 9 |
+
2025-09-22 22:04:05,929 INFO MainThread:40865 [wandb_init.py:init():818] wandb.init called with sweep_config: {}
|
| 10 |
+
config: {'num_layers': 16, 'encoder_num_layers': 16, 'decoder_num_layers': None, 'hidden_size': 2048, 'ffn_hidden_size': 2048, 'num_attention_heads': 16, 'attention_backend': <AttnBackend.flash: 1>, 'kv_channels': 128, 'group_query_attention': True, 'num_query_groups': 4, 'max_position_embeddings': 8192, 'position_embedding_type': 'rope', 'relative_attention_num_buckets': 32, 'relative_attention_max_distance': 128, 'use_rotary_position_embeddings': False, 'rotary_base': 84000, 'rotary_percent': 0.5, 'rotary_interleaved': False, 'rotary_seq_len_interpolation_factor': None, 'use_rope_scaling': False, 'rope_scaling_factor': 8.0, 'no_rope_freq': None, 'add_position_embedding': True, 'mrope_section': None, 'make_vocab_size_divisible_by': 128, 'normalization': 'RMSNorm', 'norm_epsilon': 1e-06, 'apply_layernorm_1p': False, 'apply_residual_connection_post_layernorm': False, 'openai_gelu': False, 'squared_relu': False, 'swiglu': True, 'onnx_safe': None, 'bert_binary_head': True, 'untie_embeddings_and_output_weights': True, 'multi_latent_attention': False, 'mtp_num_layers': 0, 'mtp_loss_scaling_factor': 0.1, 'bias_zero_mean_update': True, 'attention_dropout': 0.0, 'hidden_dropout': 0.0, 'weight_decay': 0.1, 'start_weight_decay': 0.1, 'end_weight_decay': 0.1, 'weight_decay_incr_style': 'constant', 'clip_grad': 1.0, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_eps': 1e-08, 'sgd_momentum': 0.9, 'micro_batch_size': 8, 'global_batch_size': 256, 'rampup_batch_size': None, 'decrease_batch_size_if_needed': False, 'recompute_granularity': None, 'check_for_nan_in_loss_and_grad': True, 'check_for_spiky_loss': False, 'check_for_large_grads': False, 'distribute_saved_activations': False, 'recompute_method': None, 'recompute_num_layers': None, 'recompute_modules': None, 'clone_scatter_output_in_embedding': True, 'profile': False, 'profile_step_start': 10, 'profile_step_end': 12, 'iterations_to_skip': [], 'result_rejected_tracker_filename': None, 'enable_gloo_process_groups': True, 
'use_pytorch_profiler': False, 'profile_ranks': [0], 'record_memory_history': False, 'memory_snapshot_path': 'snapshot.pickle', 'tp_comm_overlap': False, 'tp_comm_overlap_cfg': None, 'tp_comm_overlap_ag': True, 'tp_comm_overlap_rs': True, 'tp_comm_overlap_rs_dgrad': False, 'tp_comm_bulk_dgrad': True, 'tp_comm_bulk_wgrad': True, 'tp_comm_bootstrap_backend': 'nccl', 'use_cpu_initialization': None, 'empty_unused_memory_level': 0, 'deterministic_mode': False, 'check_weight_hash_across_dp_replicas_interval': None, 'calculate_per_token_loss': False, 'train_sync_interval': None, 'train_iters': 50000, 'train_samples': None, 'log_interval': 1, 'exit_interval': None, 'exit_duration_in_mins': None, 'exit_signal_handler': False, 'tensorboard_dir': 'poziomka_5/runs', 'masked_softmax_fusion': False, 'bias_gelu_fusion': False, 'bias_swiglu_fusion': True, 'bias_dropout_fusion': True, 'apply_rope_fusion': True, 'cross_entropy_loss_fusion': True, 'cross_entropy_fusion_impl': 'native', 'use_flash_attn': True, 'add_bias_linear': False, 'add_qkv_bias': False, 'optimizer': 'adam', 'optimizer_cpu_offload': False, 'optimizer_offload_fraction': 1.0, 'use_torch_optimizer_for_cpu_offload': False, 'overlap_cpu_optimizer_d2h_h2d': False, 'pin_cpu_grads': True, 'pin_cpu_params': True, 'dataloader_type': 'single', 'async_tensor_model_parallel_allreduce': True, 'no_persist_layer_norm': False, 'sequence_parallel': True, 'gradient_accumulation_fusion': True, 'deprecated_use_mcore_models': False, 'use_legacy_models': False, 'manual_gc': False, 'manual_gc_interval': 0, 'manual_gc_eval': True, 'tp_comm_split_ag': True, 'tp_comm_split_rs': True, 'pipeline_model_parallel_comm_backend': None, 'high_priority_stream_groups': [], 'seed': 50, 'data_parallel_random_init': False, 'init_method_std': 0.02, 'init_method_xavier_uniform': False, 'lr': 0.0008, 'lr_decay_style': 'cosine', 'lr_wsd_decay_style': 'exponential', 'lr_decay_iters': None, 'lr_decay_samples': None, 'lr_wsd_decay_samples': None, 
'lr_wsd_decay_iters': None, 'lr_warmup_fraction': None, 'lr_warmup_iters': 100, 'lr_warmup_samples': 0, 'lr_warmup_init': 0.0, 'min_lr': 4e-05, 'override_opt_param_scheduler': False, 'use_checkpoint_opt_param_scheduler': False, 'decoupled_lr': None, 'decoupled_min_lr': None, 'save': 'poziomka_5', 'save_interval': 1600, 'no_save_optim': None, 'no_save_rng': None, 'load': None, 'no_load_optim': None, 'no_load_rng': None, 'non_persistent_save_interval': None, 'non_persistent_ckpt_type': None, 'non_persistent_global_ckpt_dir': None, 'non_persistent_local_ckpt_dir': None, 'non_persistent_local_ckpt_algo': 'fully_parallel', 'finetune': False, 'pretrained_checkpoint': None, 'ckpt_step': None, 'perform_initialization': True, 'use_checkpoint_args': False, 'use_mp_args_from_checkpoint_args': False, 'use_tokenizer_model_from_checkpoint_args': True, 'exit_on_missing_checkpoint': False, 'use_dist_ckpt_deprecated': False, 'use_persistent_ckpt_worker': False, 'auto_detect_ckpt_format': False, 'dist_ckpt_format_deprecated': None, 'ckpt_format': 'torch_dist', 'ckpt_convert_format': None, 'ckpt_convert_save': None, 'ckpt_convert_update_legacy_dist_opt_format': False, 'ckpt_fully_parallel_save_deprecated': False, 'ckpt_fully_parallel_save': True, 'async_save': True, 'ckpt_fully_parallel_load': False, 'ckpt_assume_constant_structure': False, 'dist_ckpt_strictness': 'assume_ok_unexpected', 'load_model_opt_format': False, 'fp16': False, 'bf16': True, 'grad_reduce_in_bf16': False, 'loss_scale': None, 'initial_loss_scale': 4294967296, 'min_loss_scale': 1.0, 'loss_scale_window': 1000, 'hysteresis': 2, 'fp32_residual_connection': False, 'apply_query_key_layer_scaling': False, 'attention_softmax_in_fp32': True, 'accumulate_allreduce_grads_in_fp32': True, 'fp16_lm_cross_entropy': False, 'disable_bf16_reduced_precision_matmul': False, 'reuse_grad_buf_for_mxfp8_param_ag': False, 'tensor_model_parallel_size': 4, 'encoder_tensor_model_parallel_size': 0, 'pipeline_model_parallel_size': 1, 
'encoder_pipeline_model_parallel_size': 0, 'pipeline_model_parallel_split_rank': None, 'decoder_first_pipeline_num_layers': None, 'decoder_last_pipeline_num_layers': None, 'pipeline_model_parallel_layout': None, 'num_layers_per_virtual_pipeline_stage': None, 'num_virtual_stages_per_pipeline_rank': None, 'microbatch_group_size_per_vp_stage': None, 'overlap_p2p_comm': False, 'overlap_p2p_comm_warmup_flush': False, 'distributed_backend': 'nccl', 'distributed_timeout_minutes': 10, 'overlap_grad_reduce': True, 'defer_embedding_wgrad_compute': False, 'wgrad_deferral_limit': 0, 'align_grad_reduce': True, 'ddp_num_buckets': None, 'ddp_bucket_size': None, 'ddp_pad_buckets_for_high_nccl_busbw': False, 'ddp_average_in_collective': False, 'overlap_param_gather': False, 'overlap_param_gather_with_optimizer_step': False, 'align_param_gather': False, 'scatter_gather_tensors_in_pipeline': True, 'use_ring_exchange_p2p': False, 'local_rank': 7, 'lazy_mpu_init': None, 'account_for_embedding_in_pipeline_split': False, 'account_for_loss_in_pipeline_split': False, 'use_distributed_optimizer': False, 'nccl_ub': False, 'use_sharp': False, 'use_custom_fsdp': False, 'init_model_with_meta_device': False, 'data_parallel_sharding_strategy': 'no_shard', 'gradient_reduce_div_fusion': True, 'fsdp_double_buffer': False, 'suggested_communication_unit_size': None, 'keep_fp8_transpose_cache_when_using_custom_fsdp': False, 'num_distributed_optimizer_instances': 1, 'use_torch_fsdp2': False, 'torch_fsdp2_reshard_after_forward': True, 'context_parallel_size': 1, 'cp_comm_type': ['p2p'], 'hierarchical_context_parallel_sizes': None, 'nccl_communicator_config_path': None, 'use_tp_pp_dp_mapping': False, 'replication': False, 'replication_jump': None, 'replication_factor': 2, 'eval_iters': 2, 'eval_interval': 1600, 'test_mode': False, 'skip_train': False, 'data_path': ['szypulka_tokenized_apt4_merged/apt4_merged_text_document'], 'split': '9999,1,0', 'train_data_path': None, 'valid_data_path': None, 
'test_data_path': None, 'data_args_path': None, 'per_split_data_args_path': None, 'data_cache_path': None, 'mmap_bin_files': True, 'mock_data': False, 'seq_length': 8192, 'encoder_seq_length': 8192, 'decoder_seq_length': None, 'retriever_seq_length': 256, 'sample_rate': 1.0, 'mask_prob': 0.15, 'short_seq_prob': 0.1, 'num_workers': 2, 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': True, 'create_attention_mask_in_dataloader': False, 'num_dataset_builder_threads': 1, 'object_storage_cache_path': None, 'mid_level_dataset_surplus': 0.005, 'vocab_size': 32000, 'vocab_file': None, 'merge_file': None, 'vocab_extra_ids': 0, 'tokenizer_type': 'HuggingFaceTokenizer', 'tokenizer_model': '/home/ubuntu/training/Ling-V2/examples/pretrain/../../resource/tokenizer/apt4', 'tiktoken_pattern': None, 'tiktoken_num_special_tokens': 1000, 'tiktoken_special_tokens': None, 'adlr_autoresume': False, 'adlr_autoresume_interval': 1000, 'ict_head_size': None, 'biencoder_projection_dim': 0, 'biencoder_shared_query_context_model': False, 'ict_load': None, 'bert_load': None, 'titles_data_path': None, 'query_in_block_prob': 0.1, 'use_one_sent_docs': False, 'evidence_data_path': None, 'retriever_report_topk_accuracies': [], 'retriever_score_scaling': False, 'block_data_path': None, 'embedding_path': None, 'indexer_batch_size': 128, 'indexer_log_interval': 1000, 'num_classes': 1000, 'img_h': 224, 'img_w': 224, 'num_channels': 3, 'patch_dim': 16, 'classes_fraction': 1.0, 'data_per_class_fraction': 1.0, 'data_sharding': True, 'head_lr_mult': 1.0, 'vision_pretraining': False, 'vision_pretraining_type': 'classify', 'vision_backbone_type': 'vit', 'swin_backbone_type': 'tiny', 'mask_type': 'random', 'mask_factor': 1.0, 'iter_per_epoch': 1250, 'dino_local_img_size': 96, 'dino_local_crops_number': 10, 'dino_head_hidden_size': 2048, 'dino_bottleneck_size': 256, 'dino_freeze_last_layer': 1, 'dino_norm_last_layer': False, 'dino_warmup_teacher_temp': 0.04, 'dino_teacher_temp': 0.07, 
'dino_warmup_teacher_temp_epochs': 30, 'qk_layernorm': True, 'qk_l2_norm': False, 'expert_model_parallel_size': 2, 'expert_tensor_parallel_size': 1, 'num_experts': 128, 'moe_layer_freq': [0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'moe_ffn_hidden_size': 320, 'moe_shared_expert_intermediate_size': 320, 'moe_shared_expert_overlap': True, 'moe_grouped_gemm': True, 'moe_use_legacy_grouped_gemm': False, 'moe_layer_recompute': False, 'moe_extended_tp': False, 'moe_use_upcycling': False, 'moe_router_load_balancing_type': 'aux_loss', 'moe_router_dtype': 'fp32', 'skip_casting_dtype_for_param_pattern': '["^expert_bias$|.+\\.expert_bias$"]', 'moe_router_score_function': 'sigmoid', 'moe_router_topk': 4, 'moe_router_pre_softmax': False, 'moe_router_num_groups': 8, 'moe_router_group_topk': 2, 'moe_router_topk_scaling_factor': 2.5, 'moe_router_enable_expert_bias': True, 'moe_router_bias_update_rate': 0.001, 'moe_router_force_load_balancing': False, 'moe_router_padding_for_fp8': False, 'moe_aux_loss_coeff': 0.0, 'moe_z_loss_coeff': 3.5e-06, 'moe_input_jitter_eps': None, 'moe_per_layer_logging': False, 'moe_token_dispatcher_type': 'alltoall', 'moe_enable_deepep': False, 'moe_deepep_num_sms': 20, 'moe_permute_fusion': False, 'moe_expert_capacity_factor': 1.25, 'moe_pad_expert_input_to_capacity': True, 'moe_token_drop_policy': 'probs', 'moe_apply_probs_on_input': False, 'delay_wgrad_compute': False, 'moe_upcycling_granularity': 1, 'q_lora_rank': None, 'kv_lora_rank': 32, 'qk_head_dim': 128, 'qk_pos_emb_head_dim': 64, 'v_head_dim': 128, 'rotary_scaling_factor': 40.0, 'mscale': 1.0, 'mscale_all_dim': 1.0, 'heterogeneous_layers_config_path': None, 'heterogeneous_layers_config_encoded_json': None, 'log_params_norm': False, 'log_num_zeros_in_grad': False, 'log_throughput': True, 'log_progress': False, 'timing_log_level': 0, 'log_energy': False, 'barrier_with_L1_time': True, 'timing_log_option': 'minmax', 'tensorboard_log_interval': 1, 'tensorboard_queue_size': 1000, 
'log_timers_to_tensorboard': True, 'log_loss_scale_to_tensorboard': True, 'log_validation_ppl_to_tensorboard': True, 'log_memory_to_tensorboard': True, 'log_world_size_to_tensorboard': True, 'wandb_project': 'poziomka', 'wandb_exp_name': 'poziomka_5', 'wandb_save_dir': '', 'logging_level': None, 'log_straggler': False, 'disable_straggler_on_startup': False, 'straggler_ctrlr_port': 65535, 'straggler_minmax_count': 1, 'run_workload_inspector_server': False, 'inference_batch_times_seqlen_threshold': -1, 'max_tokens_to_oom': 12000, 'output_bert_embeddings': False, 'bert_embedder_type': 'megatron', 'flash_decode': False, 'enable_cuda_graph': False, 'cuda_graph_warmup_steps': 3, 'external_cuda_graph': False, 'cuda_graph_scope': 'full', 'inference_max_batch_size': 8, 'inference_max_seq_length': 2560, 'inference_dynamic_batching': False, 'inference_dynamic_batching_buffer_size_gb': 40.0, 'inference_dynamic_batching_chunk_size': 256, 'inference_dynamic_batching_buffer_guaranteed_fraction': 0.2, 'inference_dynamic_batching_buffer_overflow_factor': None, 'inference_dynamic_batching_max_requests_override': None, 'inference_dynamic_batching_max_tokens_override': None, 'symmetric_ar_type': None, 'nccl_all_reduce_for_prefill': False, 'mlp_chunks_for_prefill': 1, 'fp8': None, 'fp8_recipe': 'delayed', 'fp8_margin': 0, 'fp8_interval': 1, 'fp8_amax_history_len': 1, 'fp8_amax_compute_algo': 'most_recent', 'fp8_wgrad': True, 'transformer_impl': 'transformer_engine', 'fp8_param_gather': False, 'first_last_layers_bf16': False, 'num_layers_at_start_in_bf16': 1, 'num_layers_at_end_in_bf16': 1, 'te_rng_tracker': False, 'inference_rng_tracker': False, 'retro_project_dir': None, 'retro_add_retriever': False, 'retro_cyclic_train_iters': None, 'retro_encoder_layers': 2, 'retro_encoder_hidden_dropout': 0.1, 'retro_encoder_attention_dropout': 0.1, 'retro_num_neighbors': 2, 'retro_num_retrieved_chunks': 2, 'retro_attention_gate': 1, 'retro_verify_neighbor_count': True, 'enable_experimental': 
False, 'spec': None, 'hybrid_attention_ratio': 0.0, 'hybrid_mlp_ratio': 0.0, 'hybrid_override_pattern': None, 'mamba_state_dim': 128, 'mamba_head_dim': 64, 'mamba_num_groups': 8, 'mamba_num_heads': None, 'is_hybrid_model': False, 'disable_mamba_mem_eff_path': False, 'yaml_cfg': None, 'use_precision_aware_optimizer': False, 'main_grads_dtype': torch.float32, 'main_params_dtype': torch.float32, 'exp_avg_dtype': torch.float32, 'exp_avg_sq_dtype': torch.float32, 'enable_one_logger': True, 'one_logger_project': 'megatron-lm', 'one_logger_run_name': None, 'one_logger_async': False, 'app_tag_run_name': None, 'app_tag_run_version': '0.0.0', 'inprocess_restart': False, 'inprocess_max_iterations': None, 'inprocess_monitor_thread_interval': 1.0, 'inprocess_monitor_process_interval': 1.0, 'inprocess_progress_watchdog_interval': 1.0, 'inprocess_heartbeat_interval': 30, 'inprocess_soft_timeout': 60, 'inprocess_hard_timeout': 90, 'inprocess_heartbeat_timeout': 60, 'inprocess_barrier_timeout': 120, 'inprocess_completion_timeout': 120, 'inprocess_last_call_wait': 1, 'inprocess_termination_grace_time': 1, 'inprocess_granularity': 'node', 'inprocess_active_world_size': 8, 'inprocess_empty_cuda_cache': False, 'enable_ft_package': False, 'calc_ft_timeouts': False, 'config_logger_dir': '', 'error_injection_rate': 0, 'error_injection_type': 'transient_error', 'rerun_mode': 'disabled', 'enable_msc': True, 'kitchen_config_file': None, 'kitchen_recipe_number': None, 'sft': False, 'sft_tokenizer_prompt_format': 'nemotron-h-aligned', 'rank': 7, 'world_size': 8, 'use_dist_ckpt': True, 'transformer_pipeline_model_parallel_size': 1, 'data_parallel_size': 2, 'virtual_pipeline_model_parallel_size': None, 'params_dtype': torch.bfloat16, 'consumed_train_samples': 0, 'skipped_train_samples': 0, 'consumed_valid_samples': 0, 'variable_seq_lengths': False, 'padded_vocab_size': 32256, '_wandb': {}}
|
| 11 |
+
2025-09-22 22:04:05,929 INFO MainThread:40865 [wandb_init.py:init():861] starting backend
|
| 12 |
+
2025-09-22 22:04:06,134 INFO MainThread:40865 [wandb_init.py:init():864] sending inform_init request
|
| 13 |
+
2025-09-22 22:04:06,137 INFO MainThread:40865 [wandb_init.py:init():872] backend started and connected
|
| 14 |
+
2025-09-22 22:04:06,140 INFO MainThread:40865 [wandb_init.py:init():942] updated telemetry
|
| 15 |
+
2025-09-22 22:04:06,144 INFO MainThread:40865 [wandb_init.py:init():966] communicating run to backend with 90.0 second timeout
|
| 16 |
+
2025-09-22 22:04:06,744 INFO MainThread:40865 [wandb_init.py:init():1017] starting run threads in backend
|
| 17 |
+
2025-09-22 22:04:06,836 INFO MainThread:40865 [wandb_run.py:_console_start():2506] atexit reg
|
| 18 |
+
2025-09-22 22:04:06,836 INFO MainThread:40865 [wandb_run.py:_redirect():2354] redirect: wrap_raw
|
| 19 |
+
2025-09-22 22:04:06,836 INFO MainThread:40865 [wandb_run.py:_redirect():2423] Wrapping output streams.
|
| 20 |
+
2025-09-22 22:04:06,836 INFO MainThread:40865 [wandb_run.py:_redirect():2446] Redirects installed.
|
| 21 |
+
2025-09-22 22:04:06,838 INFO MainThread:40865 [wandb_init.py:init():1057] run started, returning control to user process
|
| 22 |
+
2025-09-26 12:20:37,273 INFO wandb-AsyncioManager-main:40865 [service_client.py:_forward_responses():84] Reached EOF.
|
| 23 |
+
2025-09-26 12:20:37,275 INFO wandb-AsyncioManager-main:40865 [mailbox.py:close():137] Closing mailbox, abandoning 1 handles.
|
wandb/wandb/run-20250922_220405-hrldy3bw/run-hrldy3bw.wandb
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b28ec29fb9c8346c89c54dad96553569524449851aa8e3717d5c6f3b593f0eeb
|
| 3 |
+
size 112983277
|