switchoff commited on
Commit
aa2de13
·
verified ·
1 Parent(s): 63b9e03

Add files using upload-large-folder tool

Browse files
.gitattributes CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ wandb/wandb/run-20250922_220405-hrldy3bw/files/output.log filter=lfs diff=lfs merge=lfs -text
37
+ log_0.txt filter=lfs diff=lfs merge=lfs -text
38
+ wandb/wandb/run-20250922_220405-hrldy3bw/run-hrldy3bw.wandb filter=lfs diff=lfs merge=lfs -text
latest_checkpointed_iteration.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ 43200
latest_wandb_artifact_path.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ adamo1139-no/poziomka
log_0.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8549446fab3880bb4addaebfd74001956042f3d33c467cc48fa2821e2e63b9d5
3
+ size 16532379
pip_list.txt ADDED
@@ -0,0 +1,154 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Package Version
2
+ ------------------------ ----------------
3
+ aiohappyeyeballs 2.6.1
4
+ aiohttp 3.12.15
5
+ aiosignal 1.4.0
6
+ annotated-types 0.7.0
7
+ apex 0.1
8
+ async-timeout 5.0.1
9
+ attrs 21.2.0
10
+ Automat 20.2.0
11
+ Babel 2.8.0
12
+ bcrypt 3.2.0
13
+ bitsandbytes 0.47.0
14
+ blinker 1.4
15
+ certifi 2020.6.20
16
+ chardet 4.0.0
17
+ charset-normalizer 3.4.3
18
+ click 8.0.3
19
+ cloud-init 25.1.2
20
+ colorama 0.4.4
21
+ command-not-found 0.3
22
+ configobj 5.0.6
23
+ constantly 15.1.0
24
+ cryptography 3.4.8
25
+ datasets 4.1.1
26
+ dbus-python 1.2.18
27
+ dill 0.4.0
28
+ distro 1.7.0
29
+ distro-info 1.1+ubuntu0.2
30
+ einops 0.8.1
31
+ filelock 3.19.1
32
+ flash_attn_3 3.0.0b1
33
+ frozenlist 1.7.0
34
+ fsspec 2025.9.0
35
+ gitdb 4.0.12
36
+ GitPython 3.1.45
37
+ hf_transfer 0.1.9
38
+ hf-xet 1.1.10
39
+ httplib2 0.20.2
40
+ huggingface-hub 0.35.0
41
+ hyperlink 21.0.0
42
+ idna 3.3
43
+ importlib-metadata 4.6.4
44
+ incremental 21.3.0
45
+ jeepney 0.7.1
46
+ Jinja2 3.0.3
47
+ jsonpatch 1.32
48
+ jsonpointer 2.0
49
+ jsonschema 3.2.0
50
+ keyring 23.5.0
51
+ launchpadlib 1.10.16
52
+ lazr.restfulclient 0.14.4
53
+ lazr.uri 1.0.6
54
+ MarkupSafe 2.0.1
55
+ megatron-core 0.13.0
56
+ ml_dtypes 0.5.3
57
+ more-itertools 8.10.0
58
+ mpmath 1.3.0
59
+ multidict 6.6.4
60
+ multiprocess 0.70.16
61
+ netifaces 0.11.0
62
+ networkx 3.3
63
+ ninja 1.13.0
64
+ numpy 1.26.4
65
+ nvidia-cublas-cu12 12.9.1.4
66
+ nvidia-cuda-cupti-cu12 12.9.79
67
+ nvidia-cuda-nvrtc-cu12 12.9.86
68
+ nvidia-cuda-runtime-cu12 12.9.79
69
+ nvidia-cudnn-cu12 9.10.2.21
70
+ nvidia-cufft-cu12 11.4.1.4
71
+ nvidia-cufile-cu12 1.14.1.1
72
+ nvidia-curand-cu12 10.3.10.19
73
+ nvidia-cusolver-cu12 11.7.5.82
74
+ nvidia-cusparse-cu12 12.5.10.65
75
+ nvidia-cusparselt-cu12 0.7.1
76
+ nvidia-nccl-cu12 2.27.3
77
+ nvidia-nvjitlink-cu12 12.9.86
78
+ nvidia-nvtx-cu12 12.9.79
79
+ oauthlib 3.2.0
80
+ onnx 1.19.0
81
+ onnx-ir 0.1.9
82
+ onnxscript 0.3.1
83
+ packaging 24.2
84
+ pandas 2.3.2
85
+ pexpect 4.8.0
86
+ pillow 11.0.0
87
+ pip 25.2
88
+ platformdirs 4.4.0
89
+ propcache 0.3.2
90
+ protobuf 6.32.1
91
+ psutil 7.1.0
92
+ psutils 3.3.11
93
+ ptyprocess 0.7.0
94
+ puremagic 1.30
95
+ pyarrow 21.0.0
96
+ pyasn1 0.4.8
97
+ pyasn1-modules 0.2.1
98
+ pybind11 3.0.1
99
+ pydantic 2.11.9
100
+ pydantic_core 2.33.2
101
+ PyGObject 3.42.1
102
+ PyHamcrest 2.0.2
103
+ PyJWT 2.3.0
104
+ pyOpenSSL 21.0.0
105
+ pyparsing 2.4.7
106
+ pypdf 6.1.0
107
+ pyrsistent 0.18.1
108
+ pyserial 3.5
109
+ python-apt 2.4.0+ubuntu4
110
+ python-dateutil 2.9.0.post0
111
+ python-debian 0.1.43+ubuntu1.1
112
+ python-magic 0.4.24
113
+ pytz 2022.1
114
+ PyYAML 5.4.1
115
+ regex 2025.9.18
116
+ requests 2.32.5
117
+ safetensors 0.6.2
118
+ SecretStorage 3.3.1
119
+ sentry-sdk 2.38.0
120
+ service-identity 18.1.0
121
+ setuptools 80.9.0
122
+ six 1.16.0
123
+ smmap 5.0.2
124
+ sos 4.8.2
125
+ ssh-import-id 5.11
126
+ sympy 1.13.3
127
+ systemd-python 234
128
+ tiktoken 0.11.0
129
+ tokenizers 0.22.1
130
+ torch 2.8.0+cu129
131
+ torchvision 0.23.0+cu129
132
+ tqdm 4.67.1
133
+ transformer_engine 2.6.0.post1
134
+ transformer_engine_cu12 2.6.0.post1
135
+ transformer_engine_torch 2.6.0.post1
136
+ transformers 4.56.2
137
+ triton 3.4.0
138
+ Twisted 22.1.0
139
+ typing_extensions 4.15.0
140
+ typing-inspection 0.4.1
141
+ tzdata 2025.2
142
+ ubuntu-drivers-common 0.0.0
143
+ ubuntu-pro-client 8001
144
+ ufw 0.36.1
145
+ unattended-upgrades 0.1
146
+ urllib3 2.5.0
147
+ wadllib 1.3.6
148
+ wandb 0.22.0
149
+ wheel 0.45.1
150
+ xkit 0.0.0
151
+ xxhash 3.5.0
152
+ yarl 1.20.1
153
+ zipp 1.0.0
154
+ zope.interface 5.4.0
run_pretrain_poziomka_5.sh ADDED
@@ -0,0 +1,200 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
#!/bin/bash
# Pretraining launcher for the "poziomka_5" MoE GPT run on Megatron-LM core v0.13.0.
# Expects torchrun-style env vars (RANK, WORLD_SIZE, MASTER_ADDR, MASTER_PORT) when
# launched multi-node; falls back to single-node defaults when they are unset.
set -ex

MODEL_PATH="" # no checkpoint needed for from-scratch training (unused; kept for documentation)
JOB_DIR="poziomka_5"
DATA_PATH="szypulka_tokenized_apt4_merged/apt4_merged_text_document"
MEGATRON_PATH="Megatron-LM-core_v0.13.0"


mkdir -p "${JOB_DIR}"
CHECKPOINT_PATH=${JOB_DIR}
TENSORBOARD_LOGS_PATH=${JOB_DIR}/runs

# Rank 0 snapshots the launch script and the environment for reproducibility.
if [[ ${RANK:-0} -eq 0 ]]; then
  cp -r -- "$0" "${JOB_DIR}"
  pip list > "${JOB_DIR}/pip_list.txt"
  python -m torch.utils.collect_env > "${JOB_DIR}/collect_env.txt"
fi


GPUS_PER_NODE=$(nvidia-smi -L | wc -l)
WORLD_SIZE=${WORLD_SIZE:-1}
NODE_RANK=${RANK:-0}
MASTER_ADDR=${MASTER_ADDR:-127.0.0.1}
# BUGFIX: was `RANDOM_PORT=$[$RANDOM + 20000]` — `$[ ]` is deprecated bash
# arithmetic syntax; `$(( ))` is the supported equivalent.
RANDOM_PORT=$(( RANDOM + 20000 ))
MASTER_PORT=${MASTER_PORT:-$RANDOM_PORT}
GPU_NUM=$(( GPUS_PER_NODE * WORLD_SIZE ))  # total GPUs across all nodes (informational)
echo "---> from pytorch runtime, WORLD_SIZE: ${WORLD_SIZE}, NODE_RANK: ${NODE_RANK}, MASTER_ADDR: ${MASTER_ADDR}, MASTER_PORT: ${MASTER_PORT}"
LAUNCHER=" \
torchrun \
--nproc_per_node ${GPUS_PER_NODE} \
--nnodes ${WORLD_SIZE} \
--node_rank ${NODE_RANK} \
--master_addr ${MASTER_ADDR} \
--master_port ${MASTER_PORT} \
"

LOG_PATH="${JOB_DIR}/log_${NODE_RANK}.txt"

export OMP_NUM_THREADS=1
export CUDA_DEVICE_MAX_CONNECTIONS=1 # needed to keep at 1 as per https://github.com/NVIDIA/Megatron-LM/issues/533
export PYTORCH_CUDA_ALLOC_CONF="expandable_segments:True"
export NCCL_NVLS_ENABLE=0
export NCCL_CUMEM_ENABLE=0

export NVTE_FLASH_ATTN=1 # get that sweet FA3 boost
export NVTE_FUSED_ATTN=0
export NVTE_UNFUSED_ATTN=0

export NVTE_DEBUG=1
export NVTE_DEBUG_LEVEL=2 # 2 means DEBUG level

export NCCL_DEBUG=OFF

DEVICE_MODEL=$(nvidia-smi -i 0 -q | grep "Product Name" | awk -F: '{ print $2 }')
DEVICE_MODEL=$(echo "$DEVICE_MODEL" | xargs) # drop white space

# Strip the vendor prefix so the comparisons below see e.g. "GeForce RTX 3090 Ti".
if [[ $DEVICE_MODEL == NVIDIA* ]]; then
  DEVICE_MODEL=${DEVICE_MODEL#"NVIDIA"}
  DEVICE_MODEL=$(echo "$DEVICE_MODEL" | sed 's/^ *//')
fi

# BUGFIX: the original compared against "NVIDIA GeForce RTX 3090 Ti" AFTER the
# "NVIDIA" prefix had been stripped above, so that branch could never match and
# UB_SKIPMC was never set on a 3090 Ti. Compare against the stripped names.
if [ "$DEVICE_MODEL" = "GeForce RTX 3090 Ti" ] || [ "$DEVICE_MODEL" = "A100-SXM4-80GB" ]; then
  # Ampere GPUs do not support multicast. If `--tp-comm-overlap` is set on Ampere-arch GPUs, this env must be set.
  export UB_SKIPMC=1
fi

MOE_ARGS=(
  --expert-model-parallel-size 2
  --expert-tensor-parallel-size 1
  --moe-grouped-gemm
  --moe-token-dispatcher-type alltoall
  --moe-router-dtype fp32
  --num-experts 128
  --moe-ffn-hidden-size 320
  --moe-shared-expert-intermediate-size 320
  --moe-router-score-function sigmoid
  --moe-router-topk 4
  --moe-router-enable-expert-bias
  --moe-router-topk-scaling-factor 2.5
  --moe-router-num-groups 8
  --moe-router-group-topk 2
  --moe-z-loss-coeff 0.0000035
  --moe-router-bias-update-rate 1e-3
  --moe-layer-freq [0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
  --bias-zero-mean-update
  --moe-expert-capacity-factor 1.25
  --moe-pad-expert-input-to-capacity
  --moe-shared-expert-overlap
)

MPT_ARGS=(
  --mtp-num-layers 0
)

GPT_MODEL_ARGS=(
  --num-layers 16
  --hidden-size 2048
  --ffn-hidden-size 2048
  --num-attention-heads 16
  --num-query-groups 4
  --group-query-attention
  --qk-layernorm
  --use-flash-attn
  --max-position-embeddings 8192
  --vocab-size 32000
  --make-vocab-size-divisible-by 128
  --position-embedding-type "rope"
  --rotary-base 84000
  --rotary-percent 0.5
  --rotary-scaling-factor 40
  --swiglu
  --untie-embeddings-and-output-weights
  --normalization "RMSNorm"
  --norm-epsilon "1e-06"
  --disable-bias-linear
  --transformer-impl "transformer_engine"
  --attention-dropout 0
  --hidden-dropout 0
)

TRAINING_ARGS=(
  --micro-batch-size 8
  --global-batch-size 256
  --seq-length 8192
  --train-iters 50000
  --weight-decay 0.1
  --adam-beta1 0.9
  --adam-beta2 0.95
  --init-method-std 0.02
  --clip-grad 1.0

  --bf16

  --optimizer "adam"
  --lr "8.0e-4"
  --lr-decay-style cosine
  --min-lr "4.00e-5"
  --lr-warmup-iters 100
  --seed 50
)

MODEL_PARALLEL_ARGS=(
  --pipeline-model-parallel-size 1
  --tensor-model-parallel-size 4
  --sequence-parallel
  --overlap-grad-reduce
)

DATA_ARGS=(
  --data-path ${DATA_PATH}
  --tokenizer-type "HuggingFaceTokenizer"
  --tokenizer-model $(dirname "$(readlink -f "${BASH_SOURCE[0]}")")/../../resource/tokenizer/apt4
  --split 9999,1,0
  --dataloader-type "single"
  --no-create-attention-mask-in-dataloader
  --eod-mask-loss
)

EVAL_AND_LOGGING_ARGS=(
  --save-interval 1600
  --eval-interval 1600
  --eval-iters 2
  --save $CHECKPOINT_PATH
  --ckpt-format "torch_dist"
  --async-save
  --log-interval 1
  --log-throughput
  --tensorboard-dir $TENSORBOARD_LOGS_PATH
  --log-timers-to-tensorboard
  --log-memory-to-tensorboard
  --log-world-size-to-tensorboard
  --log-validation-ppl-to-tensorboard

  --wandb-project "poziomka"
  --wandb-exp-name ${JOB_DIR}

)

KERNEL_ARGS=(
  --attention-backend flash
  --no-masked-softmax-fusion
  --attention-softmax-in-fp32
  --cross-entropy-loss-fusion
)

# Declared empty so the expansion below is well-defined even with no profiling flags.
PROFILING_ARGS=()

# NOTE: CMD is deliberately a flat string that relies on word-splitting when
# executed; none of the argument values above contain whitespace.
CMD="${LAUNCHER} ${MEGATRON_PATH}/pretrain_gpt.py \
  ${MOE_ARGS[@]} \
  ${GPT_MODEL_ARGS[@]} \
  ${TRAINING_ARGS[@]} \
  ${MODEL_PARALLEL_ARGS[@]} \
  ${DATA_ARGS[@]} \
  ${EVAL_AND_LOGGING_ARGS[@]} \
  ${KERNEL_ARGS[@]} \
  ${MPT_ARGS[@]} \
  ${PROFILING_ARGS[@]} \
  "

echo ${CMD}
PYTHONPATH=${MEGATRON_PATH}:$PYTHONPATH ${CMD} 2>&1 | tee "${LOG_PATH}"
wandb/wandb/debug-internal.log ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2025-09-22T22:04:06.143255097Z","level":"INFO","msg":"stream: starting","core version":"0.22.0"}
2
+ {"time":"2025-09-22T22:04:06.487372274Z","level":"INFO","msg":"stream: created new stream","id":"hrldy3bw"}
3
+ {"time":"2025-09-22T22:04:06.487645224Z","level":"INFO","msg":"stream: started","id":"hrldy3bw"}
4
+ {"time":"2025-09-22T22:04:06.487690464Z","level":"INFO","msg":"sender: started","stream_id":"hrldy3bw"}
5
+ {"time":"2025-09-22T22:04:06.487691275Z","level":"INFO","msg":"writer: started","stream_id":"hrldy3bw"}
6
+ {"time":"2025-09-22T22:04:06.487752261Z","level":"INFO","msg":"handler: started","stream_id":"hrldy3bw"}
7
+ {"time":"2025-09-23T20:24:34.768930029Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
8
+ {"time":"2025-09-24T15:33:04.643961764Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
9
+ {"time":"2025-09-24T15:48:05.092383968Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
10
+ {"time":"2025-09-24T16:41:14.895690245Z","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/graphql","body":"\n<html><head>\n<meta http-equiv=\"content-type\" content=\"text/html;charset=utf-8\">\n<title>502 Server Error</title>\n</head>\n<body text=#000000 bgcolor=#ffffff>\n<h1>Error: Server Error</h1>\n<h2>The server encountered a temporary error and could not complete your request.<p>Please try again in 30 seconds.</h2>\n<h2></h2>\n</body></html>\n"}
11
+ {"time":"2025-09-25T16:26:04.894084919Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded (Client.Timeout exceeded while awaiting headers)"}
12
+ {"time":"2025-09-25T17:26:05.511033911Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded (Client.Timeout exceeded while awaiting headers)"}
13
+ {"time":"2025-09-25T22:42:38.263384097Z","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/adamo1139-no/poziomka/hrldy3bw/file_stream","body":"\n<html><head>\n<meta http-equiv=\"content-type\" content=\"text/html;charset=utf-8\">\n<title>502 Server Error</title>\n</head>\n<body text=#000000 bgcolor=#ffffff>\n<h1>Error: Server Error</h1>\n<h2>The server encountered a temporary error and could not complete your request.<p>Please try again in 30 seconds.</h2>\n<h2></h2>\n</body></html>\n"}
14
+ {"time":"2025-09-25T23:02:46.550701182Z","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/adamo1139-no/poziomka/hrldy3bw/file_stream","body":"\n<html><head>\n<meta http-equiv=\"content-type\" content=\"text/html;charset=utf-8\">\n<title>502 Server Error</title>\n</head>\n<body text=#000000 bgcolor=#ffffff>\n<h1>Error: Server Error</h1>\n<h2>The server encountered a temporary error and could not complete your request.<p>Please try again in 30 seconds.</h2>\n<h2></h2>\n</body></html>\n"}
15
+ {"time":"2025-09-26T12:20:37.296973248Z","level":"INFO","msg":"stream: closing","id":"hrldy3bw"}
16
+ {"time":"2025-09-26T12:20:38.848817943Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
17
+ {"time":"2025-09-26T12:20:39.03451843Z","level":"INFO","msg":"handler: closed","stream_id":"hrldy3bw"}
18
+ {"time":"2025-09-26T12:20:39.034655715Z","level":"INFO","msg":"sender: closed","stream_id":"hrldy3bw"}
19
+ {"time":"2025-09-26T12:20:39.034688219Z","level":"INFO","msg":"stream: closed","id":"hrldy3bw"}
wandb/wandb/debug.log ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-09-22 22:04:05,929 INFO MainThread:40865 [wandb_setup.py:_flush():81] Current SDK version is 0.22.0
2
+ 2025-09-22 22:04:05,929 INFO MainThread:40865 [wandb_setup.py:_flush():81] Configure stats pid to 40865
3
+ 2025-09-22 22:04:05,929 INFO MainThread:40865 [wandb_setup.py:_flush():81] Loading settings from /root/.config/wandb/settings
4
+ 2025-09-22 22:04:05,929 INFO MainThread:40865 [wandb_setup.py:_flush():81] Loading settings from /home/ubuntu/training/Ling-V2/wandb/settings
5
+ 2025-09-22 22:04:05,929 INFO MainThread:40865 [wandb_setup.py:_flush():81] Loading settings from environment variables
6
+ 2025-09-22 22:04:05,929 INFO MainThread:40865 [wandb_init.py:setup_run_log_directory():686] Logging user logs to poziomka_5/wandb/wandb/run-20250922_220405-hrldy3bw/logs/debug.log
7
+ 2025-09-22 22:04:05,929 INFO MainThread:40865 [wandb_init.py:setup_run_log_directory():687] Logging internal logs to poziomka_5/wandb/wandb/run-20250922_220405-hrldy3bw/logs/debug-internal.log
8
+ 2025-09-22 22:04:05,929 INFO MainThread:40865 [wandb_init.py:init():813] calling init triggers
9
+ 2025-09-22 22:04:05,929 INFO MainThread:40865 [wandb_init.py:init():818] wandb.init called with sweep_config: {}
10
+ config: {'num_layers': 16, 'encoder_num_layers': 16, 'decoder_num_layers': None, 'hidden_size': 2048, 'ffn_hidden_size': 2048, 'num_attention_heads': 16, 'attention_backend': <AttnBackend.flash: 1>, 'kv_channels': 128, 'group_query_attention': True, 'num_query_groups': 4, 'max_position_embeddings': 8192, 'position_embedding_type': 'rope', 'relative_attention_num_buckets': 32, 'relative_attention_max_distance': 128, 'use_rotary_position_embeddings': False, 'rotary_base': 84000, 'rotary_percent': 0.5, 'rotary_interleaved': False, 'rotary_seq_len_interpolation_factor': None, 'use_rope_scaling': False, 'rope_scaling_factor': 8.0, 'no_rope_freq': None, 'add_position_embedding': True, 'mrope_section': None, 'make_vocab_size_divisible_by': 128, 'normalization': 'RMSNorm', 'norm_epsilon': 1e-06, 'apply_layernorm_1p': False, 'apply_residual_connection_post_layernorm': False, 'openai_gelu': False, 'squared_relu': False, 'swiglu': True, 'onnx_safe': None, 'bert_binary_head': True, 'untie_embeddings_and_output_weights': True, 'multi_latent_attention': False, 'mtp_num_layers': 0, 'mtp_loss_scaling_factor': 0.1, 'bias_zero_mean_update': True, 'attention_dropout': 0.0, 'hidden_dropout': 0.0, 'weight_decay': 0.1, 'start_weight_decay': 0.1, 'end_weight_decay': 0.1, 'weight_decay_incr_style': 'constant', 'clip_grad': 1.0, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_eps': 1e-08, 'sgd_momentum': 0.9, 'micro_batch_size': 8, 'global_batch_size': 256, 'rampup_batch_size': None, 'decrease_batch_size_if_needed': False, 'recompute_granularity': None, 'check_for_nan_in_loss_and_grad': True, 'check_for_spiky_loss': False, 'check_for_large_grads': False, 'distribute_saved_activations': False, 'recompute_method': None, 'recompute_num_layers': None, 'recompute_modules': None, 'clone_scatter_output_in_embedding': True, 'profile': False, 'profile_step_start': 10, 'profile_step_end': 12, 'iterations_to_skip': [], 'result_rejected_tracker_filename': None, 'enable_gloo_process_groups': True, 
'use_pytorch_profiler': False, 'profile_ranks': [0], 'record_memory_history': False, 'memory_snapshot_path': 'snapshot.pickle', 'tp_comm_overlap': False, 'tp_comm_overlap_cfg': None, 'tp_comm_overlap_ag': True, 'tp_comm_overlap_rs': True, 'tp_comm_overlap_rs_dgrad': False, 'tp_comm_bulk_dgrad': True, 'tp_comm_bulk_wgrad': True, 'tp_comm_bootstrap_backend': 'nccl', 'use_cpu_initialization': None, 'empty_unused_memory_level': 0, 'deterministic_mode': False, 'check_weight_hash_across_dp_replicas_interval': None, 'calculate_per_token_loss': False, 'train_sync_interval': None, 'train_iters': 50000, 'train_samples': None, 'log_interval': 1, 'exit_interval': None, 'exit_duration_in_mins': None, 'exit_signal_handler': False, 'tensorboard_dir': 'poziomka_5/runs', 'masked_softmax_fusion': False, 'bias_gelu_fusion': False, 'bias_swiglu_fusion': True, 'bias_dropout_fusion': True, 'apply_rope_fusion': True, 'cross_entropy_loss_fusion': True, 'cross_entropy_fusion_impl': 'native', 'use_flash_attn': True, 'add_bias_linear': False, 'add_qkv_bias': False, 'optimizer': 'adam', 'optimizer_cpu_offload': False, 'optimizer_offload_fraction': 1.0, 'use_torch_optimizer_for_cpu_offload': False, 'overlap_cpu_optimizer_d2h_h2d': False, 'pin_cpu_grads': True, 'pin_cpu_params': True, 'dataloader_type': 'single', 'async_tensor_model_parallel_allreduce': True, 'no_persist_layer_norm': False, 'sequence_parallel': True, 'gradient_accumulation_fusion': True, 'deprecated_use_mcore_models': False, 'use_legacy_models': False, 'manual_gc': False, 'manual_gc_interval': 0, 'manual_gc_eval': True, 'tp_comm_split_ag': True, 'tp_comm_split_rs': True, 'pipeline_model_parallel_comm_backend': None, 'high_priority_stream_groups': [], 'seed': 50, 'data_parallel_random_init': False, 'init_method_std': 0.02, 'init_method_xavier_uniform': False, 'lr': 0.0008, 'lr_decay_style': 'cosine', 'lr_wsd_decay_style': 'exponential', 'lr_decay_iters': None, 'lr_decay_samples': None, 'lr_wsd_decay_samples': None, 
'lr_wsd_decay_iters': None, 'lr_warmup_fraction': None, 'lr_warmup_iters': 100, 'lr_warmup_samples': 0, 'lr_warmup_init': 0.0, 'min_lr': 4e-05, 'override_opt_param_scheduler': False, 'use_checkpoint_opt_param_scheduler': False, 'decoupled_lr': None, 'decoupled_min_lr': None, 'save': 'poziomka_5', 'save_interval': 1600, 'no_save_optim': None, 'no_save_rng': None, 'load': None, 'no_load_optim': None, 'no_load_rng': None, 'non_persistent_save_interval': None, 'non_persistent_ckpt_type': None, 'non_persistent_global_ckpt_dir': None, 'non_persistent_local_ckpt_dir': None, 'non_persistent_local_ckpt_algo': 'fully_parallel', 'finetune': False, 'pretrained_checkpoint': None, 'ckpt_step': None, 'perform_initialization': True, 'use_checkpoint_args': False, 'use_mp_args_from_checkpoint_args': False, 'use_tokenizer_model_from_checkpoint_args': True, 'exit_on_missing_checkpoint': False, 'use_dist_ckpt_deprecated': False, 'use_persistent_ckpt_worker': False, 'auto_detect_ckpt_format': False, 'dist_ckpt_format_deprecated': None, 'ckpt_format': 'torch_dist', 'ckpt_convert_format': None, 'ckpt_convert_save': None, 'ckpt_convert_update_legacy_dist_opt_format': False, 'ckpt_fully_parallel_save_deprecated': False, 'ckpt_fully_parallel_save': True, 'async_save': True, 'ckpt_fully_parallel_load': False, 'ckpt_assume_constant_structure': False, 'dist_ckpt_strictness': 'assume_ok_unexpected', 'load_model_opt_format': False, 'fp16': False, 'bf16': True, 'grad_reduce_in_bf16': False, 'loss_scale': None, 'initial_loss_scale': 4294967296, 'min_loss_scale': 1.0, 'loss_scale_window': 1000, 'hysteresis': 2, 'fp32_residual_connection': False, 'apply_query_key_layer_scaling': False, 'attention_softmax_in_fp32': True, 'accumulate_allreduce_grads_in_fp32': True, 'fp16_lm_cross_entropy': False, 'disable_bf16_reduced_precision_matmul': False, 'reuse_grad_buf_for_mxfp8_param_ag': False, 'tensor_model_parallel_size': 4, 'encoder_tensor_model_parallel_size': 0, 'pipeline_model_parallel_size': 1, 
'encoder_pipeline_model_parallel_size': 0, 'pipeline_model_parallel_split_rank': None, 'decoder_first_pipeline_num_layers': None, 'decoder_last_pipeline_num_layers': None, 'pipeline_model_parallel_layout': None, 'num_layers_per_virtual_pipeline_stage': None, 'num_virtual_stages_per_pipeline_rank': None, 'microbatch_group_size_per_vp_stage': None, 'overlap_p2p_comm': False, 'overlap_p2p_comm_warmup_flush': False, 'distributed_backend': 'nccl', 'distributed_timeout_minutes': 10, 'overlap_grad_reduce': True, 'defer_embedding_wgrad_compute': False, 'wgrad_deferral_limit': 0, 'align_grad_reduce': True, 'ddp_num_buckets': None, 'ddp_bucket_size': None, 'ddp_pad_buckets_for_high_nccl_busbw': False, 'ddp_average_in_collective': False, 'overlap_param_gather': False, 'overlap_param_gather_with_optimizer_step': False, 'align_param_gather': False, 'scatter_gather_tensors_in_pipeline': True, 'use_ring_exchange_p2p': False, 'local_rank': 7, 'lazy_mpu_init': None, 'account_for_embedding_in_pipeline_split': False, 'account_for_loss_in_pipeline_split': False, 'use_distributed_optimizer': False, 'nccl_ub': False, 'use_sharp': False, 'use_custom_fsdp': False, 'init_model_with_meta_device': False, 'data_parallel_sharding_strategy': 'no_shard', 'gradient_reduce_div_fusion': True, 'fsdp_double_buffer': False, 'suggested_communication_unit_size': None, 'keep_fp8_transpose_cache_when_using_custom_fsdp': False, 'num_distributed_optimizer_instances': 1, 'use_torch_fsdp2': False, 'torch_fsdp2_reshard_after_forward': True, 'context_parallel_size': 1, 'cp_comm_type': ['p2p'], 'hierarchical_context_parallel_sizes': None, 'nccl_communicator_config_path': None, 'use_tp_pp_dp_mapping': False, 'replication': False, 'replication_jump': None, 'replication_factor': 2, 'eval_iters': 2, 'eval_interval': 1600, 'test_mode': False, 'skip_train': False, 'data_path': ['szypulka_tokenized_apt4_merged/apt4_merged_text_document'], 'split': '9999,1,0', 'train_data_path': None, 'valid_data_path': None, 
'test_data_path': None, 'data_args_path': None, 'per_split_data_args_path': None, 'data_cache_path': None, 'mmap_bin_files': True, 'mock_data': False, 'seq_length': 8192, 'encoder_seq_length': 8192, 'decoder_seq_length': None, 'retriever_seq_length': 256, 'sample_rate': 1.0, 'mask_prob': 0.15, 'short_seq_prob': 0.1, 'num_workers': 2, 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': True, 'create_attention_mask_in_dataloader': False, 'num_dataset_builder_threads': 1, 'object_storage_cache_path': None, 'mid_level_dataset_surplus': 0.005, 'vocab_size': 32000, 'vocab_file': None, 'merge_file': None, 'vocab_extra_ids': 0, 'tokenizer_type': 'HuggingFaceTokenizer', 'tokenizer_model': '/home/ubuntu/training/Ling-V2/examples/pretrain/../../resource/tokenizer/apt4', 'tiktoken_pattern': None, 'tiktoken_num_special_tokens': 1000, 'tiktoken_special_tokens': None, 'adlr_autoresume': False, 'adlr_autoresume_interval': 1000, 'ict_head_size': None, 'biencoder_projection_dim': 0, 'biencoder_shared_query_context_model': False, 'ict_load': None, 'bert_load': None, 'titles_data_path': None, 'query_in_block_prob': 0.1, 'use_one_sent_docs': False, 'evidence_data_path': None, 'retriever_report_topk_accuracies': [], 'retriever_score_scaling': False, 'block_data_path': None, 'embedding_path': None, 'indexer_batch_size': 128, 'indexer_log_interval': 1000, 'num_classes': 1000, 'img_h': 224, 'img_w': 224, 'num_channels': 3, 'patch_dim': 16, 'classes_fraction': 1.0, 'data_per_class_fraction': 1.0, 'data_sharding': True, 'head_lr_mult': 1.0, 'vision_pretraining': False, 'vision_pretraining_type': 'classify', 'vision_backbone_type': 'vit', 'swin_backbone_type': 'tiny', 'mask_type': 'random', 'mask_factor': 1.0, 'iter_per_epoch': 1250, 'dino_local_img_size': 96, 'dino_local_crops_number': 10, 'dino_head_hidden_size': 2048, 'dino_bottleneck_size': 256, 'dino_freeze_last_layer': 1, 'dino_norm_last_layer': False, 'dino_warmup_teacher_temp': 0.04, 'dino_teacher_temp': 0.07, 
'dino_warmup_teacher_temp_epochs': 30, 'qk_layernorm': True, 'qk_l2_norm': False, 'expert_model_parallel_size': 2, 'expert_tensor_parallel_size': 1, 'num_experts': 128, 'moe_layer_freq': [0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'moe_ffn_hidden_size': 320, 'moe_shared_expert_intermediate_size': 320, 'moe_shared_expert_overlap': True, 'moe_grouped_gemm': True, 'moe_use_legacy_grouped_gemm': False, 'moe_layer_recompute': False, 'moe_extended_tp': False, 'moe_use_upcycling': False, 'moe_router_load_balancing_type': 'aux_loss', 'moe_router_dtype': 'fp32', 'skip_casting_dtype_for_param_pattern': '["^expert_bias$|.+\\.expert_bias$"]', 'moe_router_score_function': 'sigmoid', 'moe_router_topk': 4, 'moe_router_pre_softmax': False, 'moe_router_num_groups': 8, 'moe_router_group_topk': 2, 'moe_router_topk_scaling_factor': 2.5, 'moe_router_enable_expert_bias': True, 'moe_router_bias_update_rate': 0.001, 'moe_router_force_load_balancing': False, 'moe_router_padding_for_fp8': False, 'moe_aux_loss_coeff': 0.0, 'moe_z_loss_coeff': 3.5e-06, 'moe_input_jitter_eps': None, 'moe_per_layer_logging': False, 'moe_token_dispatcher_type': 'alltoall', 'moe_enable_deepep': False, 'moe_deepep_num_sms': 20, 'moe_permute_fusion': False, 'moe_expert_capacity_factor': 1.25, 'moe_pad_expert_input_to_capacity': True, 'moe_token_drop_policy': 'probs', 'moe_apply_probs_on_input': False, 'delay_wgrad_compute': False, 'moe_upcycling_granularity': 1, 'q_lora_rank': None, 'kv_lora_rank': 32, 'qk_head_dim': 128, 'qk_pos_emb_head_dim': 64, 'v_head_dim': 128, 'rotary_scaling_factor': 40.0, 'mscale': 1.0, 'mscale_all_dim': 1.0, 'heterogeneous_layers_config_path': None, 'heterogeneous_layers_config_encoded_json': None, 'log_params_norm': False, 'log_num_zeros_in_grad': False, 'log_throughput': True, 'log_progress': False, 'timing_log_level': 0, 'log_energy': False, 'barrier_with_L1_time': True, 'timing_log_option': 'minmax', 'tensorboard_log_interval': 1, 'tensorboard_queue_size': 1000, 
'log_timers_to_tensorboard': True, 'log_loss_scale_to_tensorboard': True, 'log_validation_ppl_to_tensorboard': True, 'log_memory_to_tensorboard': True, 'log_world_size_to_tensorboard': True, 'wandb_project': 'poziomka', 'wandb_exp_name': 'poziomka_5', 'wandb_save_dir': '', 'logging_level': None, 'log_straggler': False, 'disable_straggler_on_startup': False, 'straggler_ctrlr_port': 65535, 'straggler_minmax_count': 1, 'run_workload_inspector_server': False, 'inference_batch_times_seqlen_threshold': -1, 'max_tokens_to_oom': 12000, 'output_bert_embeddings': False, 'bert_embedder_type': 'megatron', 'flash_decode': False, 'enable_cuda_graph': False, 'cuda_graph_warmup_steps': 3, 'external_cuda_graph': False, 'cuda_graph_scope': 'full', 'inference_max_batch_size': 8, 'inference_max_seq_length': 2560, 'inference_dynamic_batching': False, 'inference_dynamic_batching_buffer_size_gb': 40.0, 'inference_dynamic_batching_chunk_size': 256, 'inference_dynamic_batching_buffer_guaranteed_fraction': 0.2, 'inference_dynamic_batching_buffer_overflow_factor': None, 'inference_dynamic_batching_max_requests_override': None, 'inference_dynamic_batching_max_tokens_override': None, 'symmetric_ar_type': None, 'nccl_all_reduce_for_prefill': False, 'mlp_chunks_for_prefill': 1, 'fp8': None, 'fp8_recipe': 'delayed', 'fp8_margin': 0, 'fp8_interval': 1, 'fp8_amax_history_len': 1, 'fp8_amax_compute_algo': 'most_recent', 'fp8_wgrad': True, 'transformer_impl': 'transformer_engine', 'fp8_param_gather': False, 'first_last_layers_bf16': False, 'num_layers_at_start_in_bf16': 1, 'num_layers_at_end_in_bf16': 1, 'te_rng_tracker': False, 'inference_rng_tracker': False, 'retro_project_dir': None, 'retro_add_retriever': False, 'retro_cyclic_train_iters': None, 'retro_encoder_layers': 2, 'retro_encoder_hidden_dropout': 0.1, 'retro_encoder_attention_dropout': 0.1, 'retro_num_neighbors': 2, 'retro_num_retrieved_chunks': 2, 'retro_attention_gate': 1, 'retro_verify_neighbor_count': True, 'enable_experimental': 
False, 'spec': None, 'hybrid_attention_ratio': 0.0, 'hybrid_mlp_ratio': 0.0, 'hybrid_override_pattern': None, 'mamba_state_dim': 128, 'mamba_head_dim': 64, 'mamba_num_groups': 8, 'mamba_num_heads': None, 'is_hybrid_model': False, 'disable_mamba_mem_eff_path': False, 'yaml_cfg': None, 'use_precision_aware_optimizer': False, 'main_grads_dtype': torch.float32, 'main_params_dtype': torch.float32, 'exp_avg_dtype': torch.float32, 'exp_avg_sq_dtype': torch.float32, 'enable_one_logger': True, 'one_logger_project': 'megatron-lm', 'one_logger_run_name': None, 'one_logger_async': False, 'app_tag_run_name': None, 'app_tag_run_version': '0.0.0', 'inprocess_restart': False, 'inprocess_max_iterations': None, 'inprocess_monitor_thread_interval': 1.0, 'inprocess_monitor_process_interval': 1.0, 'inprocess_progress_watchdog_interval': 1.0, 'inprocess_heartbeat_interval': 30, 'inprocess_soft_timeout': 60, 'inprocess_hard_timeout': 90, 'inprocess_heartbeat_timeout': 60, 'inprocess_barrier_timeout': 120, 'inprocess_completion_timeout': 120, 'inprocess_last_call_wait': 1, 'inprocess_termination_grace_time': 1, 'inprocess_granularity': 'node', 'inprocess_active_world_size': 8, 'inprocess_empty_cuda_cache': False, 'enable_ft_package': False, 'calc_ft_timeouts': False, 'config_logger_dir': '', 'error_injection_rate': 0, 'error_injection_type': 'transient_error', 'rerun_mode': 'disabled', 'enable_msc': True, 'kitchen_config_file': None, 'kitchen_recipe_number': None, 'sft': False, 'sft_tokenizer_prompt_format': 'nemotron-h-aligned', 'rank': 7, 'world_size': 8, 'use_dist_ckpt': True, 'transformer_pipeline_model_parallel_size': 1, 'data_parallel_size': 2, 'virtual_pipeline_model_parallel_size': None, 'params_dtype': torch.bfloat16, 'consumed_train_samples': 0, 'skipped_train_samples': 0, 'consumed_valid_samples': 0, 'variable_seq_lengths': False, 'padded_vocab_size': 32256, '_wandb': {}}
11
+ 2025-09-22 22:04:05,929 INFO MainThread:40865 [wandb_init.py:init():861] starting backend
12
+ 2025-09-22 22:04:06,134 INFO MainThread:40865 [wandb_init.py:init():864] sending inform_init request
13
+ 2025-09-22 22:04:06,137 INFO MainThread:40865 [wandb_init.py:init():872] backend started and connected
14
+ 2025-09-22 22:04:06,140 INFO MainThread:40865 [wandb_init.py:init():942] updated telemetry
15
+ 2025-09-22 22:04:06,144 INFO MainThread:40865 [wandb_init.py:init():966] communicating run to backend with 90.0 second timeout
16
+ 2025-09-22 22:04:06,744 INFO MainThread:40865 [wandb_init.py:init():1017] starting run threads in backend
17
+ 2025-09-22 22:04:06,836 INFO MainThread:40865 [wandb_run.py:_console_start():2506] atexit reg
18
+ 2025-09-22 22:04:06,836 INFO MainThread:40865 [wandb_run.py:_redirect():2354] redirect: wrap_raw
19
+ 2025-09-22 22:04:06,836 INFO MainThread:40865 [wandb_run.py:_redirect():2423] Wrapping output streams.
20
+ 2025-09-22 22:04:06,836 INFO MainThread:40865 [wandb_run.py:_redirect():2446] Redirects installed.
21
+ 2025-09-22 22:04:06,838 INFO MainThread:40865 [wandb_init.py:init():1057] run started, returning control to user process
22
+ 2025-09-26 12:20:37,273 INFO wandb-AsyncioManager-main:40865 [service_client.py:_forward_responses():84] Reached EOF.
23
+ 2025-09-26 12:20:37,275 INFO wandb-AsyncioManager-main:40865 [mailbox.py:close():137] Closing mailbox, abandoning 1 handles.
wandb/wandb/run-20250922_220405-hrldy3bw/files/config.yaml ADDED
@@ -0,0 +1,1288 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _wandb:
2
+ value:
3
+ cli_version: 0.22.0
4
+ e:
5
+ 9vte3cwjfuxykvlnatinaorhdm7hrpxl:
6
+ args:
7
+ - --expert-model-parallel-size
8
+ - "2"
9
+ - --expert-tensor-parallel-size
10
+ - "1"
11
+ - --moe-grouped-gemm
12
+ - --moe-token-dispatcher-type
13
+ - alltoall
14
+ - --moe-router-dtype
15
+ - fp32
16
+ - --num-experts
17
+ - "128"
18
+ - --moe-ffn-hidden-size
19
+ - "320"
20
+ - --moe-shared-expert-intermediate-size
21
+ - "320"
22
+ - --moe-router-score-function
23
+ - sigmoid
24
+ - --moe-router-topk
25
+ - "4"
26
+ - --moe-router-enable-expert-bias
27
+ - --moe-router-topk-scaling-factor
28
+ - "2.5"
29
+ - --moe-router-num-groups
30
+ - "8"
31
+ - --moe-router-group-topk
32
+ - "2"
33
+ - --moe-z-loss-coeff
34
+ - "0.0000035"
35
+ - --moe-router-bias-update-rate
36
+ - "1e-3"
37
+ - --moe-layer-freq
38
+ - '[0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]'
39
+ - --bias-zero-mean-update
40
+ - --moe-expert-capacity-factor
41
+ - "1.25"
42
+ - --moe-pad-expert-input-to-capacity
43
+ - --moe-shared-expert-overlap
44
+ - --num-layers
45
+ - "16"
46
+ - --hidden-size
47
+ - "2048"
48
+ - --ffn-hidden-size
49
+ - "2048"
50
+ - --num-attention-heads
51
+ - "16"
52
+ - --num-query-groups
53
+ - "4"
54
+ - --group-query-attention
55
+ - --qk-layernorm
56
+ - --use-flash-attn
57
+ - --max-position-embeddings
58
+ - "8192"
59
+ - --vocab-size
60
+ - "32000"
61
+ - --make-vocab-size-divisible-by
62
+ - "128"
63
+ - --position-embedding-type
64
+ - rope
65
+ - --rotary-base
66
+ - "84000"
67
+ - --rotary-percent
68
+ - "0.5"
69
+ - --rotary-scaling-factor
70
+ - "40"
71
+ - --swiglu
72
+ - --untie-embeddings-and-output-weights
73
+ - --normalization
74
+ - RMSNorm
75
+ - --norm-epsilon
76
+ - "1e-06"
77
+ - --disable-bias-linear
78
+ - --transformer-impl
79
+ - transformer_engine
80
+ - --attention-dropout
81
+ - "0"
82
+ - --hidden-dropout
83
+ - "0"
84
+ - --micro-batch-size
85
+ - "8"
86
+ - --global-batch-size
87
+ - "256"
88
+ - --seq-length
89
+ - "8192"
90
+ - --train-iters
91
+ - "50000"
92
+ - --weight-decay
93
+ - "0.1"
94
+ - --adam-beta1
95
+ - "0.9"
96
+ - --adam-beta2
97
+ - "0.95"
98
+ - --init-method-std
99
+ - "0.02"
100
+ - --clip-grad
101
+ - "1.0"
102
+ - --bf16
103
+ - --optimizer
104
+ - adam
105
+ - --lr
106
+ - "8.0e-4"
107
+ - --lr-decay-style
108
+ - cosine
109
+ - --min-lr
110
+ - "4.00e-5"
111
+ - --lr-warmup-iters
112
+ - "100"
113
+ - --seed
114
+ - "50"
115
+ - --pipeline-model-parallel-size
116
+ - "1"
117
+ - --tensor-model-parallel-size
118
+ - "4"
119
+ - --sequence-parallel
120
+ - --overlap-grad-reduce
121
+ - --data-path
122
+ - szypulka_tokenized_apt4_merged/apt4_merged_text_document
123
+ - --tokenizer-type
124
+ - HuggingFaceTokenizer
125
+ - --tokenizer-model
126
+ - /home/ubuntu/training/Ling-V2/examples/pretrain/../../resource/tokenizer/apt4
127
+ - --split
128
+ - 9999,1,0
129
+ - --dataloader-type
130
+ - single
131
+ - --no-create-attention-mask-in-dataloader
132
+ - --eod-mask-loss
133
+ - --save-interval
134
+ - "1600"
135
+ - --eval-interval
136
+ - "1600"
137
+ - --eval-iters
138
+ - "2"
139
+ - --save
140
+ - poziomka_5
141
+ - --ckpt-format
142
+ - torch_dist
143
+ - --async-save
144
+ - --log-interval
145
+ - "1"
146
+ - --log-throughput
147
+ - --tensorboard-dir
148
+ - poziomka_5/runs
149
+ - --log-timers-to-tensorboard
150
+ - --log-memory-to-tensorboard
151
+ - --log-world-size-to-tensorboard
152
+ - --log-validation-ppl-to-tensorboard
153
+ - --wandb-project
154
+ - poziomka
155
+ - --wandb-exp-name
156
+ - poziomka_5
157
+ - --attention-backend
158
+ - flash
159
+ - --no-masked-softmax-fusion
160
+ - --attention-softmax-in-fp32
161
+ - --cross-entropy-loss-fusion
162
+ - --mtp-num-layers
163
+ - "0"
164
+ codePath: Megatron-LM-core_v0.13.0/pretrain_gpt.py
165
+ codePathLocal: Megatron-LM-core_v0.13.0/pretrain_gpt.py
166
+ cpu_count: 128
167
+ cpu_count_logical: 128
168
+ cudaVersion: "12.8"
169
+ disk:
170
+ /:
171
+ total: "2907329073152"
172
+ used: "390033698816"
173
+ email: adamo1139@gmail.com
174
+ executable: /usr/bin/python3
175
+ git:
176
+ commit: e3867293ebf444f614164c2b84180cd75e7de07c
177
+ remote: https://github.com/adamo1139/Ling-V2.git
178
+ gpu: NVIDIA H100 80GB HBM3
179
+ gpu_count: 8
180
+ gpu_nvidia:
181
+ - architecture: Hopper
182
+ cudaCores: 16896
183
+ memoryTotal: "85520809984"
184
+ name: NVIDIA H100 80GB HBM3
185
+ uuid: GPU-b4eb54a1-d73f-9179-04f2-b231b6a39a34
186
+ - architecture: Hopper
187
+ cudaCores: 16896
188
+ memoryTotal: "85520809984"
189
+ name: NVIDIA H100 80GB HBM3
190
+ uuid: GPU-742b8534-6865-3da0-d864-a822a5d5d629
191
+ - architecture: Hopper
192
+ cudaCores: 16896
193
+ memoryTotal: "85520809984"
194
+ name: NVIDIA H100 80GB HBM3
195
+ uuid: GPU-19d31f38-1be1-eced-5b78-b0d3f4deae56
196
+ - architecture: Hopper
197
+ cudaCores: 16896
198
+ memoryTotal: "85520809984"
199
+ name: NVIDIA H100 80GB HBM3
200
+ uuid: GPU-1b41c967-636a-e3f1-5d74-2da616c06a3e
201
+ - architecture: Hopper
202
+ cudaCores: 16896
203
+ memoryTotal: "85520809984"
204
+ name: NVIDIA H100 80GB HBM3
205
+ uuid: GPU-51d2a37c-3157-7b61-73b3-4a7914884549
206
+ - architecture: Hopper
207
+ cudaCores: 16896
208
+ memoryTotal: "85520809984"
209
+ name: NVIDIA H100 80GB HBM3
210
+ uuid: GPU-499c302d-fc3c-c679-61e7-a2c4d46b8449
211
+ - architecture: Hopper
212
+ cudaCores: 16896
213
+ memoryTotal: "85520809984"
214
+ name: NVIDIA H100 80GB HBM3
215
+ uuid: GPU-8eac440c-b327-20e6-1809-ef3e549d6c6d
216
+ - architecture: Hopper
217
+ cudaCores: 16896
218
+ memoryTotal: "85520809984"
219
+ name: NVIDIA H100 80GB HBM3
220
+ uuid: GPU-5eba09b7-f1fc-555e-d07c-7b6227264759
221
+ host: megatron6
222
+ memory:
223
+ total: "1014522519552"
224
+ os: Linux-5.15.0-143-generic-x86_64-with-glibc2.35
225
+ program: /home/ubuntu/training/Ling-V2/Megatron-LM-core_v0.13.0/pretrain_gpt.py
226
+ python: CPython 3.10.12
227
+ root: poziomka_5/wandb
228
+ startedAt: "2025-09-22T22:04:05.928052Z"
229
+ writerId: 9vte3cwjfuxykvlnatinaorhdm7hrpxl
230
+ m: []
231
+ python_version: 3.10.12
232
+ t:
233
+ "1":
234
+ - 1
235
+ - 11
236
+ - 49
237
+ "2":
238
+ - 1
239
+ - 11
240
+ - 49
241
+ "3":
242
+ - 13
243
+ - 16
244
+ - 61
245
+ "4": 3.10.12
246
+ "5": 0.22.0
247
+ "6": 4.56.2
248
+ "12": 0.22.0
249
+ "13": linux-x86_64
250
+ account_for_embedding_in_pipeline_split:
251
+ value: false
252
+ account_for_loss_in_pipeline_split:
253
+ value: false
254
+ accumulate_allreduce_grads_in_fp32:
255
+ value: true
256
+ adam_beta1:
257
+ value: 0.9
258
+ adam_beta2:
259
+ value: 0.95
260
+ adam_eps:
261
+ value: 1e-08
262
+ add_bias_linear:
263
+ value: false
264
+ add_position_embedding:
265
+ value: true
266
+ add_qkv_bias:
267
+ value: false
268
+ adlr_autoresume:
269
+ value: false
270
+ adlr_autoresume_interval:
271
+ value: 1000
272
+ align_grad_reduce:
273
+ value: true
274
+ align_param_gather:
275
+ value: false
276
+ app_tag_run_name:
277
+ value: null
278
+ app_tag_run_version:
279
+ value: 0.0.0
280
+ apply_layernorm_1p:
281
+ value: false
282
+ apply_query_key_layer_scaling:
283
+ value: false
284
+ apply_residual_connection_post_layernorm:
285
+ value: false
286
+ apply_rope_fusion:
287
+ value: true
288
+ async_save:
289
+ value: true
290
+ async_tensor_model_parallel_allreduce:
291
+ value: true
292
+ attention_backend:
293
+ value: flash
294
+ attention_dropout:
295
+ value: 0
296
+ attention_softmax_in_fp32:
297
+ value: true
298
+ auto_detect_ckpt_format:
299
+ value: false
300
+ barrier_with_L1_time:
301
+ value: true
302
+ bert_binary_head:
303
+ value: true
304
+ bert_embedder_type:
305
+ value: megatron
306
+ bert_load:
307
+ value: null
308
+ bf16:
309
+ value: true
310
+ bias_dropout_fusion:
311
+ value: true
312
+ bias_gelu_fusion:
313
+ value: false
314
+ bias_swiglu_fusion:
315
+ value: true
316
+ bias_zero_mean_update:
317
+ value: true
318
+ biencoder_projection_dim:
319
+ value: 0
320
+ biencoder_shared_query_context_model:
321
+ value: false
322
+ block_data_path:
323
+ value: null
324
+ calc_ft_timeouts:
325
+ value: false
326
+ calculate_per_token_loss:
327
+ value: false
328
+ check_for_large_grads:
329
+ value: false
330
+ check_for_nan_in_loss_and_grad:
331
+ value: true
332
+ check_for_spiky_loss:
333
+ value: false
334
+ check_weight_hash_across_dp_replicas_interval:
335
+ value: null
336
+ ckpt_assume_constant_structure:
337
+ value: false
338
+ ckpt_convert_format:
339
+ value: null
340
+ ckpt_convert_save:
341
+ value: null
342
+ ckpt_convert_update_legacy_dist_opt_format:
343
+ value: false
344
+ ckpt_format:
345
+ value: torch_dist
346
+ ckpt_fully_parallel_load:
347
+ value: false
348
+ ckpt_fully_parallel_save:
349
+ value: true
350
+ ckpt_fully_parallel_save_deprecated:
351
+ value: false
352
+ ckpt_step:
353
+ value: null
354
+ classes_fraction:
355
+ value: 1
356
+ clip_grad:
357
+ value: 1
358
+ clone_scatter_output_in_embedding:
359
+ value: true
360
+ config_logger_dir:
361
+ value: ""
362
+ consumed_train_samples:
363
+ value: 0
364
+ consumed_valid_samples:
365
+ value: 0
366
+ context_parallel_size:
367
+ value: 1
368
+ cp_comm_type:
369
+ value:
370
+ - p2p
371
+ create_attention_mask_in_dataloader:
372
+ value: false
373
+ cross_entropy_fusion_impl:
374
+ value: native
375
+ cross_entropy_loss_fusion:
376
+ value: true
377
+ cuda_graph_scope:
378
+ value: full
379
+ cuda_graph_warmup_steps:
380
+ value: 3
381
+ data_args_path:
382
+ value: null
383
+ data_cache_path:
384
+ value: null
385
+ data_parallel_random_init:
386
+ value: false
387
+ data_parallel_sharding_strategy:
388
+ value: no_shard
389
+ data_parallel_size:
390
+ value: 2
391
+ data_path:
392
+ value:
393
+ - szypulka_tokenized_apt4_merged/apt4_merged_text_document
394
+ data_per_class_fraction:
395
+ value: 1
396
+ data_sharding:
397
+ value: true
398
+ dataloader_type:
399
+ value: single
400
+ ddp_average_in_collective:
401
+ value: false
402
+ ddp_bucket_size:
403
+ value: null
404
+ ddp_num_buckets:
405
+ value: null
406
+ ddp_pad_buckets_for_high_nccl_busbw:
407
+ value: false
408
+ decoder_first_pipeline_num_layers:
409
+ value: null
410
+ decoder_last_pipeline_num_layers:
411
+ value: null
412
+ decoder_num_layers:
413
+ value: null
414
+ decoder_seq_length:
415
+ value: null
416
+ decoupled_lr:
417
+ value: null
418
+ decoupled_min_lr:
419
+ value: null
420
+ decrease_batch_size_if_needed:
421
+ value: false
422
+ defer_embedding_wgrad_compute:
423
+ value: false
424
+ delay_wgrad_compute:
425
+ value: false
426
+ deprecated_use_mcore_models:
427
+ value: false
428
+ deterministic_mode:
429
+ value: false
430
+ dino_bottleneck_size:
431
+ value: 256
432
+ dino_freeze_last_layer:
433
+ value: 1
434
+ dino_head_hidden_size:
435
+ value: 2048
436
+ dino_local_crops_number:
437
+ value: 10
438
+ dino_local_img_size:
439
+ value: 96
440
+ dino_norm_last_layer:
441
+ value: false
442
+ dino_teacher_temp:
443
+ value: 0.07
444
+ dino_warmup_teacher_temp:
445
+ value: 0.04
446
+ dino_warmup_teacher_temp_epochs:
447
+ value: 30
448
+ disable_bf16_reduced_precision_matmul:
449
+ value: false
450
+ disable_mamba_mem_eff_path:
451
+ value: false
452
+ disable_straggler_on_startup:
453
+ value: false
454
+ dist_ckpt_format_deprecated:
455
+ value: null
456
+ dist_ckpt_strictness:
457
+ value: assume_ok_unexpected
458
+ distribute_saved_activations:
459
+ value: false
460
+ distributed_backend:
461
+ value: nccl
462
+ distributed_timeout_minutes:
463
+ value: 10
464
+ embedding_path:
465
+ value: null
466
+ empty_unused_memory_level:
467
+ value: 0
468
+ enable_cuda_graph:
469
+ value: false
470
+ enable_experimental:
471
+ value: false
472
+ enable_ft_package:
473
+ value: false
474
+ enable_gloo_process_groups:
475
+ value: true
476
+ enable_msc:
477
+ value: true
478
+ enable_one_logger:
479
+ value: true
480
+ encoder_num_layers:
481
+ value: 16
482
+ encoder_pipeline_model_parallel_size:
483
+ value: 0
484
+ encoder_seq_length:
485
+ value: 8192
486
+ encoder_tensor_model_parallel_size:
487
+ value: 0
488
+ end_weight_decay:
489
+ value: 0.1
490
+ eod_mask_loss:
491
+ value: true
492
+ error_injection_rate:
493
+ value: 0
494
+ error_injection_type:
495
+ value: transient_error
496
+ eval_interval:
497
+ value: 1600
498
+ eval_iters:
499
+ value: 2
500
+ evidence_data_path:
501
+ value: null
502
+ exit_duration_in_mins:
503
+ value: null
504
+ exit_interval:
505
+ value: null
506
+ exit_on_missing_checkpoint:
507
+ value: false
508
+ exit_signal_handler:
509
+ value: false
510
+ exp_avg_dtype:
511
+ value: torch.float32
512
+ exp_avg_sq_dtype:
513
+ value: torch.float32
514
+ expert_model_parallel_size:
515
+ value: 2
516
+ expert_tensor_parallel_size:
517
+ value: 1
518
+ external_cuda_graph:
519
+ value: false
520
+ ffn_hidden_size:
521
+ value: 2048
522
+ finetune:
523
+ value: false
524
+ first_last_layers_bf16:
525
+ value: false
526
+ flash_decode:
527
+ value: false
528
+ fp8:
529
+ value: null
530
+ fp8_amax_compute_algo:
531
+ value: most_recent
532
+ fp8_amax_history_len:
533
+ value: 1
534
+ fp8_interval:
535
+ value: 1
536
+ fp8_margin:
537
+ value: 0
538
+ fp8_param_gather:
539
+ value: false
540
+ fp8_recipe:
541
+ value: delayed
542
+ fp8_wgrad:
543
+ value: true
544
+ fp16:
545
+ value: false
546
+ fp16_lm_cross_entropy:
547
+ value: false
548
+ fp32_residual_connection:
549
+ value: false
550
+ fsdp_double_buffer:
551
+ value: false
552
+ global_batch_size:
553
+ value: 256
554
+ grad_reduce_in_bf16:
555
+ value: false
556
+ gradient_accumulation_fusion:
557
+ value: true
558
+ gradient_reduce_div_fusion:
559
+ value: true
560
+ group_query_attention:
561
+ value: true
562
+ head_lr_mult:
563
+ value: 1
564
+ heterogeneous_layers_config_encoded_json:
565
+ value: null
566
+ heterogeneous_layers_config_path:
567
+ value: null
568
+ hidden_dropout:
569
+ value: 0
570
+ hidden_size:
571
+ value: 2048
572
+ hierarchical_context_parallel_sizes:
573
+ value: null
574
+ high_priority_stream_groups:
575
+ value: []
576
+ hybrid_attention_ratio:
577
+ value: 0
578
+ hybrid_mlp_ratio:
579
+ value: 0
580
+ hybrid_override_pattern:
581
+ value: null
582
+ hysteresis:
583
+ value: 2
584
+ ict_head_size:
585
+ value: null
586
+ ict_load:
587
+ value: null
588
+ img_h:
589
+ value: 224
590
+ img_w:
591
+ value: 224
592
+ indexer_batch_size:
593
+ value: 128
594
+ indexer_log_interval:
595
+ value: 1000
596
+ inference_batch_times_seqlen_threshold:
597
+ value: -1
598
+ inference_dynamic_batching:
599
+ value: false
600
+ inference_dynamic_batching_buffer_guaranteed_fraction:
601
+ value: 0.2
602
+ inference_dynamic_batching_buffer_overflow_factor:
603
+ value: null
604
+ inference_dynamic_batching_buffer_size_gb:
605
+ value: 40
606
+ inference_dynamic_batching_chunk_size:
607
+ value: 256
608
+ inference_dynamic_batching_max_requests_override:
609
+ value: null
610
+ inference_dynamic_batching_max_tokens_override:
611
+ value: null
612
+ inference_max_batch_size:
613
+ value: 8
614
+ inference_max_seq_length:
615
+ value: 2560
616
+ inference_rng_tracker:
617
+ value: false
618
+ init_method_std:
619
+ value: 0.02
620
+ init_method_xavier_uniform:
621
+ value: false
622
+ init_model_with_meta_device:
623
+ value: false
624
+ initial_loss_scale:
625
+ value: 4294967296
626
+ inprocess_active_world_size:
627
+ value: 8
628
+ inprocess_barrier_timeout:
629
+ value: 120
630
+ inprocess_completion_timeout:
631
+ value: 120
632
+ inprocess_empty_cuda_cache:
633
+ value: false
634
+ inprocess_granularity:
635
+ value: node
636
+ inprocess_hard_timeout:
637
+ value: 90
638
+ inprocess_heartbeat_interval:
639
+ value: 30
640
+ inprocess_heartbeat_timeout:
641
+ value: 60
642
+ inprocess_last_call_wait:
643
+ value: 1
644
+ inprocess_max_iterations:
645
+ value: null
646
+ inprocess_monitor_process_interval:
647
+ value: 1
648
+ inprocess_monitor_thread_interval:
649
+ value: 1
650
+ inprocess_progress_watchdog_interval:
651
+ value: 1
652
+ inprocess_restart:
653
+ value: false
654
+ inprocess_soft_timeout:
655
+ value: 60
656
+ inprocess_termination_grace_time:
657
+ value: 1
658
+ is_hybrid_model:
659
+ value: false
660
+ iter_per_epoch:
661
+ value: 1250
662
+ iterations_to_skip:
663
+ value: []
664
+ keep_fp8_transpose_cache_when_using_custom_fsdp:
665
+ value: false
666
+ kitchen_config_file:
667
+ value: null
668
+ kitchen_recipe_number:
669
+ value: null
670
+ kv_channels:
671
+ value: 128
672
+ kv_lora_rank:
673
+ value: 32
674
+ lazy_mpu_init:
675
+ value: null
676
+ load:
677
+ value: null
678
+ load_model_opt_format:
679
+ value: false
680
+ local_rank:
681
+ value: 7
682
+ log_energy:
683
+ value: false
684
+ log_interval:
685
+ value: 1
686
+ log_loss_scale_to_tensorboard:
687
+ value: true
688
+ log_memory_to_tensorboard:
689
+ value: true
690
+ log_num_zeros_in_grad:
691
+ value: false
692
+ log_params_norm:
693
+ value: false
694
+ log_progress:
695
+ value: false
696
+ log_straggler:
697
+ value: false
698
+ log_throughput:
699
+ value: true
700
+ log_timers_to_tensorboard:
701
+ value: true
702
+ log_validation_ppl_to_tensorboard:
703
+ value: true
704
+ log_world_size_to_tensorboard:
705
+ value: true
706
+ logging_level:
707
+ value: null
708
+ loss_scale:
709
+ value: null
710
+ loss_scale_window:
711
+ value: 1000
712
+ lr:
713
+ value: 0.0008
714
+ lr_decay_iters:
715
+ value: null
716
+ lr_decay_samples:
717
+ value: null
718
+ lr_decay_style:
719
+ value: cosine
720
+ lr_warmup_fraction:
721
+ value: null
722
+ lr_warmup_init:
723
+ value: 0
724
+ lr_warmup_iters:
725
+ value: 100
726
+ lr_warmup_samples:
727
+ value: 0
728
+ lr_wsd_decay_iters:
729
+ value: null
730
+ lr_wsd_decay_samples:
731
+ value: null
732
+ lr_wsd_decay_style:
733
+ value: exponential
734
+ main_grads_dtype:
735
+ value: torch.float32
736
+ main_params_dtype:
737
+ value: torch.float32
738
+ make_vocab_size_divisible_by:
739
+ value: 128
740
+ mamba_head_dim:
741
+ value: 64
742
+ mamba_num_groups:
743
+ value: 8
744
+ mamba_num_heads:
745
+ value: null
746
+ mamba_state_dim:
747
+ value: 128
748
+ manual_gc:
749
+ value: false
750
+ manual_gc_eval:
751
+ value: true
752
+ manual_gc_interval:
753
+ value: 0
754
+ mask_factor:
755
+ value: 1
756
+ mask_prob:
757
+ value: 0.15
758
+ mask_type:
759
+ value: random
760
+ masked_softmax_fusion:
761
+ value: false
762
+ max_position_embeddings:
763
+ value: 8192
764
+ max_tokens_to_oom:
765
+ value: 12000
766
+ memory_snapshot_path:
767
+ value: snapshot.pickle
768
+ merge_file:
769
+ value: null
770
+ micro_batch_size:
771
+ value: 8
772
+ microbatch_group_size_per_vp_stage:
773
+ value: null
774
+ mid_level_dataset_surplus:
775
+ value: 0.005
776
+ min_loss_scale:
777
+ value: 1
778
+ min_lr:
779
+ value: 4e-05
780
+ mlp_chunks_for_prefill:
781
+ value: 1
782
+ mmap_bin_files:
783
+ value: true
784
+ mock_data:
785
+ value: false
786
+ moe_apply_probs_on_input:
787
+ value: false
788
+ moe_aux_loss_coeff:
789
+ value: 0
790
+ moe_deepep_num_sms:
791
+ value: 20
792
+ moe_enable_deepep:
793
+ value: false
794
+ moe_expert_capacity_factor:
795
+ value: 1.25
796
+ moe_extended_tp:
797
+ value: false
798
+ moe_ffn_hidden_size:
799
+ value: 320
800
+ moe_grouped_gemm:
801
+ value: true
802
+ moe_input_jitter_eps:
803
+ value: null
804
+ moe_layer_freq:
805
+ value:
806
+ - 0
807
+ - 1
808
+ - 1
809
+ - 1
810
+ - 1
811
+ - 1
812
+ - 1
813
+ - 1
814
+ - 1
815
+ - 1
816
+ - 1
817
+ - 1
818
+ - 1
819
+ - 1
820
+ - 1
821
+ - 1
822
+ moe_layer_recompute:
823
+ value: false
824
+ moe_pad_expert_input_to_capacity:
825
+ value: true
826
+ moe_per_layer_logging:
827
+ value: false
828
+ moe_permute_fusion:
829
+ value: false
830
+ moe_router_bias_update_rate:
831
+ value: 0.001
832
+ moe_router_dtype:
833
+ value: fp32
834
+ moe_router_enable_expert_bias:
835
+ value: true
836
+ moe_router_force_load_balancing:
837
+ value: false
838
+ moe_router_group_topk:
839
+ value: 2
840
+ moe_router_load_balancing_type:
841
+ value: aux_loss
842
+ moe_router_num_groups:
843
+ value: 8
844
+ moe_router_padding_for_fp8:
845
+ value: false
846
+ moe_router_pre_softmax:
847
+ value: false
848
+ moe_router_score_function:
849
+ value: sigmoid
850
+ moe_router_topk:
851
+ value: 4
852
+ moe_router_topk_scaling_factor:
853
+ value: 2.5
854
+ moe_shared_expert_intermediate_size:
855
+ value: 320
856
+ moe_shared_expert_overlap:
857
+ value: true
858
+ moe_token_dispatcher_type:
859
+ value: alltoall
860
+ moe_token_drop_policy:
861
+ value: probs
862
+ moe_upcycling_granularity:
863
+ value: 1
864
+ moe_use_legacy_grouped_gemm:
865
+ value: false
866
+ moe_use_upcycling:
867
+ value: false
868
+ moe_z_loss_coeff:
869
+ value: 3.5e-06
870
+ mrope_section:
871
+ value: null
872
+ mscale:
873
+ value: 1
874
+ mscale_all_dim:
875
+ value: 1
876
+ mtp_loss_scaling_factor:
877
+ value: 0.1
878
+ mtp_num_layers:
879
+ value: 0
880
+ multi_latent_attention:
881
+ value: false
882
+ nccl_all_reduce_for_prefill:
883
+ value: false
884
+ nccl_communicator_config_path:
885
+ value: null
886
+ nccl_ub:
887
+ value: false
888
+ no_load_optim:
889
+ value: null
890
+ no_load_rng:
891
+ value: null
892
+ no_persist_layer_norm:
893
+ value: false
894
+ no_rope_freq:
895
+ value: null
896
+ no_save_optim:
897
+ value: null
898
+ no_save_rng:
899
+ value: null
900
+ non_persistent_ckpt_type:
901
+ value: null
902
+ non_persistent_global_ckpt_dir:
903
+ value: null
904
+ non_persistent_local_ckpt_algo:
905
+ value: fully_parallel
906
+ non_persistent_local_ckpt_dir:
907
+ value: null
908
+ non_persistent_save_interval:
909
+ value: null
910
+ norm_epsilon:
911
+ value: 1e-06
912
+ normalization:
913
+ value: RMSNorm
914
+ num_attention_heads:
915
+ value: 16
916
+ num_channels:
917
+ value: 3
918
+ num_classes:
919
+ value: 1000
920
+ num_dataset_builder_threads:
921
+ value: 1
922
+ num_distributed_optimizer_instances:
923
+ value: 1
924
+ num_experts:
925
+ value: 128
926
+ num_layers:
927
+ value: 16
928
+ num_layers_at_end_in_bf16:
929
+ value: 1
930
+ num_layers_at_start_in_bf16:
931
+ value: 1
932
+ num_layers_per_virtual_pipeline_stage:
933
+ value: null
934
+ num_query_groups:
935
+ value: 4
936
+ num_virtual_stages_per_pipeline_rank:
937
+ value: null
938
+ num_workers:
939
+ value: 2
940
+ object_storage_cache_path:
941
+ value: null
942
+ one_logger_async:
943
+ value: false
944
+ one_logger_project:
945
+ value: megatron-lm
946
+ one_logger_run_name:
947
+ value: null
948
+ onnx_safe:
949
+ value: null
950
+ openai_gelu:
951
+ value: false
952
+ optimizer:
953
+ value: adam
954
+ optimizer_cpu_offload:
955
+ value: false
956
+ optimizer_offload_fraction:
957
+ value: 1
958
+ output_bert_embeddings:
959
+ value: false
960
+ overlap_cpu_optimizer_d2h_h2d:
961
+ value: false
962
+ overlap_grad_reduce:
963
+ value: true
964
+ overlap_p2p_comm:
965
+ value: false
966
+ overlap_p2p_comm_warmup_flush:
967
+ value: false
968
+ overlap_param_gather:
969
+ value: false
970
+ overlap_param_gather_with_optimizer_step:
971
+ value: false
972
+ override_opt_param_scheduler:
973
+ value: false
974
+ padded_vocab_size:
975
+ value: 32256
976
+ params_dtype:
977
+ value: torch.bfloat16
978
+ patch_dim:
979
+ value: 16
980
+ per_split_data_args_path:
981
+ value: null
982
+ perform_initialization:
983
+ value: true
984
+ pin_cpu_grads:
985
+ value: true
986
+ pin_cpu_params:
987
+ value: true
988
+ pipeline_model_parallel_comm_backend:
989
+ value: null
990
+ pipeline_model_parallel_layout:
991
+ value: null
992
+ pipeline_model_parallel_size:
993
+ value: 1
994
+ pipeline_model_parallel_split_rank:
995
+ value: null
996
+ position_embedding_type:
997
+ value: rope
998
+ pretrained_checkpoint:
999
+ value: null
1000
+ profile:
1001
+ value: false
1002
+ profile_ranks:
1003
+ value:
1004
+ - 0
1005
+ profile_step_end:
1006
+ value: 12
1007
+ profile_step_start:
1008
+ value: 10
1009
+ q_lora_rank:
1010
+ value: null
1011
+ qk_head_dim:
1012
+ value: 128
1013
+ qk_l2_norm:
1014
+ value: false
1015
+ qk_layernorm:
1016
+ value: true
1017
+ qk_pos_emb_head_dim:
1018
+ value: 64
1019
+ query_in_block_prob:
1020
+ value: 0.1
1021
+ rampup_batch_size:
1022
+ value: null
1023
+ rank:
1024
+ value: 7
1025
+ recompute_granularity:
1026
+ value: null
1027
+ recompute_method:
1028
+ value: null
1029
+ recompute_modules:
1030
+ value: null
1031
+ recompute_num_layers:
1032
+ value: null
1033
+ record_memory_history:
1034
+ value: false
1035
+ relative_attention_max_distance:
1036
+ value: 128
1037
+ relative_attention_num_buckets:
1038
+ value: 32
1039
+ replication:
1040
+ value: false
1041
+ replication_factor:
1042
+ value: 2
1043
+ replication_jump:
1044
+ value: null
1045
+ rerun_mode:
1046
+ value: disabled
1047
+ reset_attention_mask:
1048
+ value: false
1049
+ reset_position_ids:
1050
+ value: false
1051
+ result_rejected_tracker_filename:
1052
+ value: null
1053
+ retriever_report_topk_accuracies:
1054
+ value: []
1055
+ retriever_score_scaling:
1056
+ value: false
1057
+ retriever_seq_length:
1058
+ value: 256
1059
+ retro_add_retriever:
1060
+ value: false
1061
+ retro_attention_gate:
1062
+ value: 1
1063
+ retro_cyclic_train_iters:
1064
+ value: null
1065
+ retro_encoder_attention_dropout:
1066
+ value: 0.1
1067
+ retro_encoder_hidden_dropout:
1068
+ value: 0.1
1069
+ retro_encoder_layers:
1070
+ value: 2
1071
+ retro_num_neighbors:
1072
+ value: 2
1073
+ retro_num_retrieved_chunks:
1074
+ value: 2
1075
+ retro_project_dir:
1076
+ value: null
1077
+ retro_verify_neighbor_count:
1078
+ value: true
1079
+ reuse_grad_buf_for_mxfp8_param_ag:
1080
+ value: false
1081
+ rope_scaling_factor:
1082
+ value: 8
1083
+ rotary_base:
1084
+ value: 84000
1085
+ rotary_interleaved:
1086
+ value: false
1087
+ rotary_percent:
1088
+ value: 0.5
1089
+ rotary_scaling_factor:
1090
+ value: 40
1091
+ rotary_seq_len_interpolation_factor:
1092
+ value: null
1093
+ run_workload_inspector_server:
1094
+ value: false
1095
+ sample_rate:
1096
+ value: 1
1097
+ save:
1098
+ value: poziomka_5
1099
+ save_interval:
1100
+ value: 1600
1101
+ scatter_gather_tensors_in_pipeline:
1102
+ value: true
1103
+ seed:
1104
+ value: 50
1105
+ seq_length:
1106
+ value: 8192
1107
+ sequence_parallel:
1108
+ value: true
1109
+ sft:
1110
+ value: false
1111
+ sft_tokenizer_prompt_format:
1112
+ value: nemotron-h-aligned
1113
+ sgd_momentum:
1114
+ value: 0.9
1115
+ short_seq_prob:
1116
+ value: 0.1
1117
+ skip_casting_dtype_for_param_pattern:
1118
+ value: '["^expert_bias$|.+\.expert_bias$"]'
1119
+ skip_train:
1120
+ value: false
1121
+ skipped_train_samples:
1122
+ value: 0
1123
+ spec:
1124
+ value: null
1125
+ split:
1126
+ value: 9999,1,0
1127
+ squared_relu:
1128
+ value: false
1129
+ start_weight_decay:
1130
+ value: 0.1
1131
+ straggler_ctrlr_port:
1132
+ value: 65535
1133
+ straggler_minmax_count:
1134
+ value: 1
1135
+ suggested_communication_unit_size:
1136
+ value: null
1137
+ swiglu:
1138
+ value: true
1139
+ swin_backbone_type:
1140
+ value: tiny
1141
+ symmetric_ar_type:
1142
+ value: null
1143
+ te_rng_tracker:
1144
+ value: false
1145
+ tensor_model_parallel_size:
1146
+ value: 4
1147
+ tensorboard_dir:
1148
+ value: poziomka_5/runs
1149
+ tensorboard_log_interval:
1150
+ value: 1
1151
+ tensorboard_queue_size:
1152
+ value: 1000
1153
+ test_data_path:
1154
+ value: null
1155
+ test_mode:
1156
+ value: false
1157
+ tiktoken_num_special_tokens:
1158
+ value: 1000
1159
+ tiktoken_pattern:
1160
+ value: null
1161
+ tiktoken_special_tokens:
1162
+ value: null
1163
+ timing_log_level:
1164
+ value: 0
1165
+ timing_log_option:
1166
+ value: minmax
1167
+ titles_data_path:
1168
+ value: null
1169
+ tokenizer_model:
1170
+ value: /home/ubuntu/training/Ling-V2/examples/pretrain/../../resource/tokenizer/apt4
1171
+ tokenizer_type:
1172
+ value: HuggingFaceTokenizer
1173
+ torch_fsdp2_reshard_after_forward:
1174
+ value: true
1175
+ tp_comm_bootstrap_backend:
1176
+ value: nccl
1177
+ tp_comm_bulk_dgrad:
1178
+ value: true
1179
+ tp_comm_bulk_wgrad:
1180
+ value: true
1181
+ tp_comm_overlap:
1182
+ value: false
1183
+ tp_comm_overlap_ag:
1184
+ value: true
1185
+ tp_comm_overlap_cfg:
1186
+ value: null
1187
+ tp_comm_overlap_rs:
1188
+ value: true
1189
+ tp_comm_overlap_rs_dgrad:
1190
+ value: false
1191
+ tp_comm_split_ag:
1192
+ value: true
1193
+ tp_comm_split_rs:
1194
+ value: true
1195
+ train_data_path:
1196
+ value: null
1197
+ train_iters:
1198
+ value: 50000
1199
+ train_samples:
1200
+ value: null
1201
+ train_sync_interval:
1202
+ value: null
1203
+ transformer_impl:
1204
+ value: transformer_engine
1205
+ transformer_pipeline_model_parallel_size:
1206
+ value: 1
1207
+ untie_embeddings_and_output_weights:
1208
+ value: true
1209
+ use_checkpoint_args:
1210
+ value: false
1211
+ use_checkpoint_opt_param_scheduler:
1212
+ value: false
1213
+ use_cpu_initialization:
1214
+ value: null
1215
+ use_custom_fsdp:
1216
+ value: false
1217
+ use_dist_ckpt:
1218
+ value: true
1219
+ use_dist_ckpt_deprecated:
1220
+ value: false
1221
+ use_distributed_optimizer:
1222
+ value: false
1223
+ use_flash_attn:
1224
+ value: true
1225
+ use_legacy_models:
1226
+ value: false
1227
+ use_mp_args_from_checkpoint_args:
1228
+ value: false
1229
+ use_one_sent_docs:
1230
+ value: false
1231
+ use_persistent_ckpt_worker:
1232
+ value: false
1233
+ use_precision_aware_optimizer:
1234
+ value: false
1235
+ use_pytorch_profiler:
1236
+ value: false
1237
+ use_ring_exchange_p2p:
1238
+ value: false
1239
+ use_rope_scaling:
1240
+ value: false
1241
+ use_rotary_position_embeddings:
1242
+ value: false
1243
+ use_sharp:
1244
+ value: false
1245
+ use_tokenizer_model_from_checkpoint_args:
1246
+ value: true
1247
+ use_torch_fsdp2:
1248
+ value: false
1249
+ use_torch_optimizer_for_cpu_offload:
1250
+ value: false
1251
+ use_tp_pp_dp_mapping:
1252
+ value: false
1253
+ v_head_dim:
1254
+ value: 128
1255
+ valid_data_path:
1256
+ value: null
1257
+ variable_seq_lengths:
1258
+ value: false
1259
+ virtual_pipeline_model_parallel_size:
1260
+ value: null
1261
+ vision_backbone_type:
1262
+ value: vit
1263
+ vision_pretraining:
1264
+ value: false
1265
+ vision_pretraining_type:
1266
+ value: classify
1267
+ vocab_extra_ids:
1268
+ value: 0
1269
+ vocab_file:
1270
+ value: null
1271
+ vocab_size:
1272
+ value: 32000
1273
+ wandb_exp_name:
1274
+ value: poziomka_5
1275
+ wandb_project:
1276
+ value: poziomka
1277
+ wandb_save_dir:
1278
+ value: ""
1279
+ weight_decay:
1280
+ value: 0.1
1281
+ weight_decay_incr_style:
1282
+ value: constant
1283
+ wgrad_deferral_limit:
1284
+ value: 0
1285
+ world_size:
1286
+ value: 8
1287
+ yaml_cfg:
1288
+ value: null
wandb/wandb/run-20250922_220405-hrldy3bw/files/output.log ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:184173ebbe892d0082a25f8f9c003dff5ca6df791cee3e7dfa216517dc2a9dbd
3
+ size 15656694
wandb/wandb/run-20250922_220405-hrldy3bw/files/requirements.txt ADDED
@@ -0,0 +1,173 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ GitPython==3.1.45
2
+ psutils==3.3.11
3
+ networkx==3.3
4
+ dill==0.4.0
5
+ requests==2.32.5
6
+ nvidia-cusparselt-cu12==0.7.1
7
+ ml_dtypes==0.5.3
8
+ pyarrow==21.0.0
9
+ gitdb==4.0.12
10
+ packaging==24.2
11
+ pydantic_core==2.33.2
12
+ torchvision==0.23.0+cu129
13
+ mpmath==1.3.0
14
+ nvidia-cusolver-cu12==11.7.5.82
15
+ nvidia-cuda-runtime-cu12==12.9.79
16
+ propcache==0.3.2
17
+ psutil==7.1.0
18
+ onnx-ir==0.1.9
19
+ nvidia-cusparse-cu12==12.5.10.65
20
+ aiohttp==3.12.15
21
+ aiosignal==1.4.0
22
+ protobuf==6.32.1
23
+ apex==0.1
24
+ torch==2.8.0+cu129
25
+ nvidia-cublas-cu12==12.9.1.4
26
+ frozenlist==1.7.0
27
+ nvidia-cufile-cu12==1.14.1.1
28
+ onnxscript==0.3.1
29
+ smmap==5.0.2
30
+ nvidia-cuda-nvrtc-cu12==12.9.86
31
+ pandas==2.3.2
32
+ platformdirs==4.4.0
33
+ nvidia-nvjitlink-cu12==12.9.86
34
+ pypdf==6.1.0
35
+ puremagic==1.30
36
+ regex==2025.9.18
37
+ triton==3.4.0
38
+ pip==25.2
39
+ pydantic==2.11.9
40
+ charset-normalizer==3.4.3
41
+ nvidia-cufft-cu12==11.4.1.4
42
+ urllib3==2.5.0
43
+ nvidia-cudnn-cu12==9.10.2.21
44
+ tzdata==2025.2
45
+ wandb==0.22.0
46
+ datasets==4.1.1
47
+ huggingface-hub==0.35.0
48
+ transformers==4.56.2
49
+ tqdm==4.67.1
50
+ megatron-core==0.13.0
51
+ tiktoken==0.11.0
52
+ hf_transfer==0.1.9
53
+ multiprocess==0.70.16
54
+ python-dateutil==2.9.0.post0
55
+ multidict==6.6.4
56
+ sentry-sdk==2.38.0
57
+ aiohappyeyeballs==2.6.1
58
+ onnx==1.19.0
59
+ einops==0.8.1
60
+ sympy==1.13.3
61
+ setuptools==80.9.0
62
+ pillow==11.0.0
63
+ filelock==3.19.1
64
+ hf-xet==1.1.10
65
+ flash_attn_3==3.0.0b1
66
+ ninja==1.13.0
67
+ fsspec==2025.9.0
68
+ nvidia-curand-cu12==10.3.10.19
69
+ bitsandbytes==0.47.0
70
+ nvidia-nccl-cu12==2.27.3
71
+ typing-inspection==0.4.1
72
+ xxhash==3.5.0
73
+ numpy==1.26.4
74
+ tokenizers==0.22.1
75
+ typing_extensions==4.15.0
76
+ safetensors==0.6.2
77
+ annotated-types==0.7.0
78
+ transformer_engine==2.6.0.post1
79
+ nvidia-nvtx-cu12==12.9.79
80
+ async-timeout==5.0.1
81
+ transformer_engine_cu12==2.6.0.post1
82
+ transformer_engine_torch==2.6.0.post1
83
+ nvidia-cuda-cupti-cu12==12.9.79
84
+ wheel==0.45.1
85
+ yarl==1.20.1
86
+ pybind11==3.0.1
87
+ python-debian==0.1.43+ubuntu1.1
88
+ SecretStorage==3.3.1
89
+ lazr.restfulclient==0.14.4
90
+ pytz==2022.1
91
+ attrs==21.2.0
92
+ zope.interface==5.4.0
93
+ chardet==4.0.0
94
+ pyasn1-modules==0.2.1
95
+ setuptools==59.6.0
96
+ Jinja2==3.0.3
97
+ pyasn1==0.4.8
98
+ netifaces==0.11.0
99
+ ubuntu-drivers-common==0.0.0
100
+ click==8.0.3
101
+ dbus-python==1.2.18
102
+ pyserial==3.5
103
+ python-apt==2.4.0+ubuntu4
104
+ PyJWT==2.3.0
105
+ oauthlib==3.2.0
106
+ bcrypt==3.2.0
107
+ python-magic==0.4.24
108
+ xkit==0.0.0
109
+ constantly==15.1.0
110
+ blinker==1.4
111
+ PyYAML==5.4.1
112
+ distro-info==1.1+ubuntu0.2
113
+ lazr.uri==1.0.6
114
+ distro==1.7.0
115
+ pexpect==4.8.0
116
+ PyGObject==3.42.1
117
+ ssh-import-id==5.11
118
+ cryptography==3.4.8
119
+ certifi==2020.6.20
120
+ service-identity==18.1.0
121
+ cloud-init==25.1.2
122
+ keyring==23.5.0
123
+ jeepney==0.7.1
124
+ colorama==0.4.4
125
+ idna==3.3
126
+ MarkupSafe==2.0.1
127
+ pip==22.0.2
128
+ ptyprocess==0.7.0
129
+ configobj==5.0.6
130
+ hyperlink==21.0.0
131
+ pyparsing==2.4.7
132
+ ufw==0.36.1
133
+ pyrsistent==0.18.1
134
+ httplib2==0.20.2
135
+ sos==4.8.2
136
+ unattended-upgrades==0.1
137
+ requests==2.25.1
138
+ ubuntu-pro-client==8001
139
+ launchpadlib==1.10.16
140
+ six==1.16.0
141
+ urllib3==1.26.5
142
+ systemd-python==234
143
+ importlib-metadata==4.6.4
144
+ command-not-found==0.3
145
+ jsonschema==3.2.0
146
+ Automat==20.2.0
147
+ more-itertools==8.10.0
148
+ PyHamcrest==2.0.2
149
+ incremental==21.3.0
150
+ zipp==1.0.0
151
+ jsonpointer==2.0
152
+ Twisted==22.1.0
153
+ pyOpenSSL==21.0.0
154
+ wadllib==1.3.6
155
+ Babel==2.8.0
156
+ jsonpatch==1.32
157
+ wheel==0.37.1
158
+ platformdirs==4.2.2
159
+ typing_extensions==4.12.2
160
+ packaging==24.2
161
+ tomli==2.0.1
162
+ inflect==7.3.1
163
+ jaraco.context==5.3.0
164
+ backports.tarfile==1.2.0
165
+ autocommand==2.2.2
166
+ importlib_metadata==8.0.0
167
+ more-itertools==10.3.0
168
+ jaraco.functools==4.0.1
169
+ typeguard==4.3.0
170
+ zipp==3.19.2
171
+ jaraco.collections==5.1.0
172
+ jaraco.text==3.12.1
173
+ wheel==0.45.1
wandb/wandb/run-20250922_220405-hrldy3bw/files/wandb-metadata.json ADDED
@@ -0,0 +1,248 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Linux-5.15.0-143-generic-x86_64-with-glibc2.35",
3
+ "python": "CPython 3.10.12",
4
+ "startedAt": "2025-09-22T22:04:05.928052Z",
5
+ "args": [
6
+ "--expert-model-parallel-size",
7
+ "2",
8
+ "--expert-tensor-parallel-size",
9
+ "1",
10
+ "--moe-grouped-gemm",
11
+ "--moe-token-dispatcher-type",
12
+ "alltoall",
13
+ "--moe-router-dtype",
14
+ "fp32",
15
+ "--num-experts",
16
+ "128",
17
+ "--moe-ffn-hidden-size",
18
+ "320",
19
+ "--moe-shared-expert-intermediate-size",
20
+ "320",
21
+ "--moe-router-score-function",
22
+ "sigmoid",
23
+ "--moe-router-topk",
24
+ "4",
25
+ "--moe-router-enable-expert-bias",
26
+ "--moe-router-topk-scaling-factor",
27
+ "2.5",
28
+ "--moe-router-num-groups",
29
+ "8",
30
+ "--moe-router-group-topk",
31
+ "2",
32
+ "--moe-z-loss-coeff",
33
+ "0.0000035",
34
+ "--moe-router-bias-update-rate",
35
+ "1e-3",
36
+ "--moe-layer-freq",
37
+ "[0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]",
38
+ "--bias-zero-mean-update",
39
+ "--moe-expert-capacity-factor",
40
+ "1.25",
41
+ "--moe-pad-expert-input-to-capacity",
42
+ "--moe-shared-expert-overlap",
43
+ "--num-layers",
44
+ "16",
45
+ "--hidden-size",
46
+ "2048",
47
+ "--ffn-hidden-size",
48
+ "2048",
49
+ "--num-attention-heads",
50
+ "16",
51
+ "--num-query-groups",
52
+ "4",
53
+ "--group-query-attention",
54
+ "--qk-layernorm",
55
+ "--use-flash-attn",
56
+ "--max-position-embeddings",
57
+ "8192",
58
+ "--vocab-size",
59
+ "32000",
60
+ "--make-vocab-size-divisible-by",
61
+ "128",
62
+ "--position-embedding-type",
63
+ "rope",
64
+ "--rotary-base",
65
+ "84000",
66
+ "--rotary-percent",
67
+ "0.5",
68
+ "--rotary-scaling-factor",
69
+ "40",
70
+ "--swiglu",
71
+ "--untie-embeddings-and-output-weights",
72
+ "--normalization",
73
+ "RMSNorm",
74
+ "--norm-epsilon",
75
+ "1e-06",
76
+ "--disable-bias-linear",
77
+ "--transformer-impl",
78
+ "transformer_engine",
79
+ "--attention-dropout",
80
+ "0",
81
+ "--hidden-dropout",
82
+ "0",
83
+ "--micro-batch-size",
84
+ "8",
85
+ "--global-batch-size",
86
+ "256",
87
+ "--seq-length",
88
+ "8192",
89
+ "--train-iters",
90
+ "50000",
91
+ "--weight-decay",
92
+ "0.1",
93
+ "--adam-beta1",
94
+ "0.9",
95
+ "--adam-beta2",
96
+ "0.95",
97
+ "--init-method-std",
98
+ "0.02",
99
+ "--clip-grad",
100
+ "1.0",
101
+ "--bf16",
102
+ "--optimizer",
103
+ "adam",
104
+ "--lr",
105
+ "8.0e-4",
106
+ "--lr-decay-style",
107
+ "cosine",
108
+ "--min-lr",
109
+ "4.00e-5",
110
+ "--lr-warmup-iters",
111
+ "100",
112
+ "--seed",
113
+ "50",
114
+ "--pipeline-model-parallel-size",
115
+ "1",
116
+ "--tensor-model-parallel-size",
117
+ "4",
118
+ "--sequence-parallel",
119
+ "--overlap-grad-reduce",
120
+ "--data-path",
121
+ "szypulka_tokenized_apt4_merged/apt4_merged_text_document",
122
+ "--tokenizer-type",
123
+ "HuggingFaceTokenizer",
124
+ "--tokenizer-model",
125
+ "/home/ubuntu/training/Ling-V2/examples/pretrain/../../resource/tokenizer/apt4",
126
+ "--split",
127
+ "9999,1,0",
128
+ "--dataloader-type",
129
+ "single",
130
+ "--no-create-attention-mask-in-dataloader",
131
+ "--eod-mask-loss",
132
+ "--save-interval",
133
+ "1600",
134
+ "--eval-interval",
135
+ "1600",
136
+ "--eval-iters",
137
+ "2",
138
+ "--save",
139
+ "poziomka_5",
140
+ "--ckpt-format",
141
+ "torch_dist",
142
+ "--async-save",
143
+ "--log-interval",
144
+ "1",
145
+ "--log-throughput",
146
+ "--tensorboard-dir",
147
+ "poziomka_5/runs",
148
+ "--log-timers-to-tensorboard",
149
+ "--log-memory-to-tensorboard",
150
+ "--log-world-size-to-tensorboard",
151
+ "--log-validation-ppl-to-tensorboard",
152
+ "--wandb-project",
153
+ "poziomka",
154
+ "--wandb-exp-name",
155
+ "poziomka_5",
156
+ "--attention-backend",
157
+ "flash",
158
+ "--no-masked-softmax-fusion",
159
+ "--attention-softmax-in-fp32",
160
+ "--cross-entropy-loss-fusion",
161
+ "--mtp-num-layers",
162
+ "0"
163
+ ],
164
+ "program": "/home/ubuntu/training/Ling-V2/Megatron-LM-core_v0.13.0/pretrain_gpt.py",
165
+ "codePath": "Megatron-LM-core_v0.13.0/pretrain_gpt.py",
166
+ "codePathLocal": "Megatron-LM-core_v0.13.0/pretrain_gpt.py",
167
+ "git": {
168
+ "remote": "https://github.com/adamo1139/Ling-V2.git",
169
+ "commit": "e3867293ebf444f614164c2b84180cd75e7de07c"
170
+ },
171
+ "email": "adamo1139@gmail.com",
172
+ "root": "poziomka_5/wandb",
173
+ "host": "megatron6",
174
+ "executable": "/usr/bin/python3",
175
+ "cpu_count": 128,
176
+ "cpu_count_logical": 128,
177
+ "gpu": "NVIDIA H100 80GB HBM3",
178
+ "gpu_count": 8,
179
+ "disk": {
180
+ "/": {
181
+ "total": "2907329073152",
182
+ "used": "390033698816"
183
+ }
184
+ },
185
+ "memory": {
186
+ "total": "1014522519552"
187
+ },
188
+ "gpu_nvidia": [
189
+ {
190
+ "name": "NVIDIA H100 80GB HBM3",
191
+ "memoryTotal": "85520809984",
192
+ "cudaCores": 16896,
193
+ "architecture": "Hopper",
194
+ "uuid": "GPU-b4eb54a1-d73f-9179-04f2-b231b6a39a34"
195
+ },
196
+ {
197
+ "name": "NVIDIA H100 80GB HBM3",
198
+ "memoryTotal": "85520809984",
199
+ "cudaCores": 16896,
200
+ "architecture": "Hopper",
201
+ "uuid": "GPU-742b8534-6865-3da0-d864-a822a5d5d629"
202
+ },
203
+ {
204
+ "name": "NVIDIA H100 80GB HBM3",
205
+ "memoryTotal": "85520809984",
206
+ "cudaCores": 16896,
207
+ "architecture": "Hopper",
208
+ "uuid": "GPU-19d31f38-1be1-eced-5b78-b0d3f4deae56"
209
+ },
210
+ {
211
+ "name": "NVIDIA H100 80GB HBM3",
212
+ "memoryTotal": "85520809984",
213
+ "cudaCores": 16896,
214
+ "architecture": "Hopper",
215
+ "uuid": "GPU-1b41c967-636a-e3f1-5d74-2da616c06a3e"
216
+ },
217
+ {
218
+ "name": "NVIDIA H100 80GB HBM3",
219
+ "memoryTotal": "85520809984",
220
+ "cudaCores": 16896,
221
+ "architecture": "Hopper",
222
+ "uuid": "GPU-51d2a37c-3157-7b61-73b3-4a7914884549"
223
+ },
224
+ {
225
+ "name": "NVIDIA H100 80GB HBM3",
226
+ "memoryTotal": "85520809984",
227
+ "cudaCores": 16896,
228
+ "architecture": "Hopper",
229
+ "uuid": "GPU-499c302d-fc3c-c679-61e7-a2c4d46b8449"
230
+ },
231
+ {
232
+ "name": "NVIDIA H100 80GB HBM3",
233
+ "memoryTotal": "85520809984",
234
+ "cudaCores": 16896,
235
+ "architecture": "Hopper",
236
+ "uuid": "GPU-8eac440c-b327-20e6-1809-ef3e549d6c6d"
237
+ },
238
+ {
239
+ "name": "NVIDIA H100 80GB HBM3",
240
+ "memoryTotal": "85520809984",
241
+ "cudaCores": 16896,
242
+ "architecture": "Hopper",
243
+ "uuid": "GPU-5eba09b7-f1fc-555e-d07c-7b6227264759"
244
+ }
245
+ ],
246
+ "cudaVersion": "12.8",
247
+ "writerId": "9vte3cwjfuxykvlnatinaorhdm7hrpxl"
248
+ }
wandb/wandb/run-20250922_220405-hrldy3bw/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"_timestamp":1.7588892349026182e+09,"throughput":142.40106310791458,"_wandb":{"runtime":310590},"_runtime":310590.549572079,"_step":43729,"iteration-time":7.31538462638855}
wandb/wandb/run-20250922_220405-hrldy3bw/logs/debug-core.log ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2025-09-22T22:04:05.949917481Z","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmpvwq5pw5s/port-40865.txt","pid":40865,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false}
2
+ {"time":"2025-09-22T22:04:05.951064803Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":40865}
3
+ {"time":"2025-09-22T22:04:05.951060304Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/tmp/wandb-40865-41784-466403699/socket","Net":"unix"}}
4
+ {"time":"2025-09-22T22:04:06.13426601Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"}
5
+ {"time":"2025-09-22T22:04:06.143173217Z","level":"INFO","msg":"handleInformInit: received","streamId":"hrldy3bw","id":"1(@)"}
6
+ {"time":"2025-09-22T22:04:06.487650443Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"hrldy3bw","id":"1(@)"}
7
+ {"time":"2025-09-26T12:20:37.296353193Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"}
8
+ {"time":"2025-09-26T12:20:37.296955718Z","level":"INFO","msg":"connection: closing","id":"1(@)"}
9
+ {"time":"2025-09-26T12:20:37.297053401Z","level":"INFO","msg":"connection: closed successfully","id":"1(@)"}
10
+ {"time":"2025-09-26T12:20:37.297072505Z","level":"INFO","msg":"server is shutting down"}
11
+ {"time":"2025-09-26T12:20:37.297624126Z","level":"INFO","msg":"server: listener closed","addr":{"Name":"/tmp/wandb-40865-41784-466403699/socket","Net":"unix"}}
wandb/wandb/run-20250922_220405-hrldy3bw/logs/debug-internal.log ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2025-09-22T22:04:06.143255097Z","level":"INFO","msg":"stream: starting","core version":"0.22.0"}
2
+ {"time":"2025-09-22T22:04:06.487372274Z","level":"INFO","msg":"stream: created new stream","id":"hrldy3bw"}
3
+ {"time":"2025-09-22T22:04:06.487645224Z","level":"INFO","msg":"stream: started","id":"hrldy3bw"}
4
+ {"time":"2025-09-22T22:04:06.487690464Z","level":"INFO","msg":"sender: started","stream_id":"hrldy3bw"}
5
+ {"time":"2025-09-22T22:04:06.487691275Z","level":"INFO","msg":"writer: started","stream_id":"hrldy3bw"}
6
+ {"time":"2025-09-22T22:04:06.487752261Z","level":"INFO","msg":"handler: started","stream_id":"hrldy3bw"}
7
+ {"time":"2025-09-23T20:24:34.768930029Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
8
+ {"time":"2025-09-24T15:33:04.643961764Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded"}
9
+ {"time":"2025-09-24T15:48:05.092383968Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": net/http: request canceled (Client.Timeout exceeded while awaiting headers)"}
10
+ {"time":"2025-09-24T16:41:14.895690245Z","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/graphql","body":"\n<html><head>\n<meta http-equiv=\"content-type\" content=\"text/html;charset=utf-8\">\n<title>502 Server Error</title>\n</head>\n<body text=#000000 bgcolor=#ffffff>\n<h1>Error: Server Error</h1>\n<h2>The server encountered a temporary error and could not complete your request.<p>Please try again in 30 seconds.</h2>\n<h2></h2>\n</body></html>\n"}
11
+ {"time":"2025-09-25T16:26:04.894084919Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded (Client.Timeout exceeded while awaiting headers)"}
12
+ {"time":"2025-09-25T17:26:05.511033911Z","level":"INFO","msg":"api: retrying error","error":"Post \"https://api.wandb.ai/graphql\": context deadline exceeded (Client.Timeout exceeded while awaiting headers)"}
13
+ {"time":"2025-09-25T22:42:38.263384097Z","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/adamo1139-no/poziomka/hrldy3bw/file_stream","body":"\n<html><head>\n<meta http-equiv=\"content-type\" content=\"text/html;charset=utf-8\">\n<title>502 Server Error</title>\n</head>\n<body text=#000000 bgcolor=#ffffff>\n<h1>Error: Server Error</h1>\n<h2>The server encountered a temporary error and could not complete your request.<p>Please try again in 30 seconds.</h2>\n<h2></h2>\n</body></html>\n"}
14
+ {"time":"2025-09-25T23:02:46.550701182Z","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/adamo1139-no/poziomka/hrldy3bw/file_stream","body":"\n<html><head>\n<meta http-equiv=\"content-type\" content=\"text/html;charset=utf-8\">\n<title>502 Server Error</title>\n</head>\n<body text=#000000 bgcolor=#ffffff>\n<h1>Error: Server Error</h1>\n<h2>The server encountered a temporary error and could not complete your request.<p>Please try again in 30 seconds.</h2>\n<h2></h2>\n</body></html>\n"}
15
+ {"time":"2025-09-26T12:20:37.296973248Z","level":"INFO","msg":"stream: closing","id":"hrldy3bw"}
16
+ {"time":"2025-09-26T12:20:38.848817943Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
17
+ {"time":"2025-09-26T12:20:39.03451843Z","level":"INFO","msg":"handler: closed","stream_id":"hrldy3bw"}
18
+ {"time":"2025-09-26T12:20:39.034655715Z","level":"INFO","msg":"sender: closed","stream_id":"hrldy3bw"}
19
+ {"time":"2025-09-26T12:20:39.034688219Z","level":"INFO","msg":"stream: closed","id":"hrldy3bw"}
wandb/wandb/run-20250922_220405-hrldy3bw/logs/debug.log ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-09-22 22:04:05,929 INFO MainThread:40865 [wandb_setup.py:_flush():81] Current SDK version is 0.22.0
2
+ 2025-09-22 22:04:05,929 INFO MainThread:40865 [wandb_setup.py:_flush():81] Configure stats pid to 40865
3
+ 2025-09-22 22:04:05,929 INFO MainThread:40865 [wandb_setup.py:_flush():81] Loading settings from /root/.config/wandb/settings
4
+ 2025-09-22 22:04:05,929 INFO MainThread:40865 [wandb_setup.py:_flush():81] Loading settings from /home/ubuntu/training/Ling-V2/wandb/settings
5
+ 2025-09-22 22:04:05,929 INFO MainThread:40865 [wandb_setup.py:_flush():81] Loading settings from environment variables
6
+ 2025-09-22 22:04:05,929 INFO MainThread:40865 [wandb_init.py:setup_run_log_directory():686] Logging user logs to poziomka_5/wandb/wandb/run-20250922_220405-hrldy3bw/logs/debug.log
7
+ 2025-09-22 22:04:05,929 INFO MainThread:40865 [wandb_init.py:setup_run_log_directory():687] Logging internal logs to poziomka_5/wandb/wandb/run-20250922_220405-hrldy3bw/logs/debug-internal.log
8
+ 2025-09-22 22:04:05,929 INFO MainThread:40865 [wandb_init.py:init():813] calling init triggers
9
+ 2025-09-22 22:04:05,929 INFO MainThread:40865 [wandb_init.py:init():818] wandb.init called with sweep_config: {}
10
+ config: {'num_layers': 16, 'encoder_num_layers': 16, 'decoder_num_layers': None, 'hidden_size': 2048, 'ffn_hidden_size': 2048, 'num_attention_heads': 16, 'attention_backend': <AttnBackend.flash: 1>, 'kv_channels': 128, 'group_query_attention': True, 'num_query_groups': 4, 'max_position_embeddings': 8192, 'position_embedding_type': 'rope', 'relative_attention_num_buckets': 32, 'relative_attention_max_distance': 128, 'use_rotary_position_embeddings': False, 'rotary_base': 84000, 'rotary_percent': 0.5, 'rotary_interleaved': False, 'rotary_seq_len_interpolation_factor': None, 'use_rope_scaling': False, 'rope_scaling_factor': 8.0, 'no_rope_freq': None, 'add_position_embedding': True, 'mrope_section': None, 'make_vocab_size_divisible_by': 128, 'normalization': 'RMSNorm', 'norm_epsilon': 1e-06, 'apply_layernorm_1p': False, 'apply_residual_connection_post_layernorm': False, 'openai_gelu': False, 'squared_relu': False, 'swiglu': True, 'onnx_safe': None, 'bert_binary_head': True, 'untie_embeddings_and_output_weights': True, 'multi_latent_attention': False, 'mtp_num_layers': 0, 'mtp_loss_scaling_factor': 0.1, 'bias_zero_mean_update': True, 'attention_dropout': 0.0, 'hidden_dropout': 0.0, 'weight_decay': 0.1, 'start_weight_decay': 0.1, 'end_weight_decay': 0.1, 'weight_decay_incr_style': 'constant', 'clip_grad': 1.0, 'adam_beta1': 0.9, 'adam_beta2': 0.95, 'adam_eps': 1e-08, 'sgd_momentum': 0.9, 'micro_batch_size': 8, 'global_batch_size': 256, 'rampup_batch_size': None, 'decrease_batch_size_if_needed': False, 'recompute_granularity': None, 'check_for_nan_in_loss_and_grad': True, 'check_for_spiky_loss': False, 'check_for_large_grads': False, 'distribute_saved_activations': False, 'recompute_method': None, 'recompute_num_layers': None, 'recompute_modules': None, 'clone_scatter_output_in_embedding': True, 'profile': False, 'profile_step_start': 10, 'profile_step_end': 12, 'iterations_to_skip': [], 'result_rejected_tracker_filename': None, 'enable_gloo_process_groups': True, 
'use_pytorch_profiler': False, 'profile_ranks': [0], 'record_memory_history': False, 'memory_snapshot_path': 'snapshot.pickle', 'tp_comm_overlap': False, 'tp_comm_overlap_cfg': None, 'tp_comm_overlap_ag': True, 'tp_comm_overlap_rs': True, 'tp_comm_overlap_rs_dgrad': False, 'tp_comm_bulk_dgrad': True, 'tp_comm_bulk_wgrad': True, 'tp_comm_bootstrap_backend': 'nccl', 'use_cpu_initialization': None, 'empty_unused_memory_level': 0, 'deterministic_mode': False, 'check_weight_hash_across_dp_replicas_interval': None, 'calculate_per_token_loss': False, 'train_sync_interval': None, 'train_iters': 50000, 'train_samples': None, 'log_interval': 1, 'exit_interval': None, 'exit_duration_in_mins': None, 'exit_signal_handler': False, 'tensorboard_dir': 'poziomka_5/runs', 'masked_softmax_fusion': False, 'bias_gelu_fusion': False, 'bias_swiglu_fusion': True, 'bias_dropout_fusion': True, 'apply_rope_fusion': True, 'cross_entropy_loss_fusion': True, 'cross_entropy_fusion_impl': 'native', 'use_flash_attn': True, 'add_bias_linear': False, 'add_qkv_bias': False, 'optimizer': 'adam', 'optimizer_cpu_offload': False, 'optimizer_offload_fraction': 1.0, 'use_torch_optimizer_for_cpu_offload': False, 'overlap_cpu_optimizer_d2h_h2d': False, 'pin_cpu_grads': True, 'pin_cpu_params': True, 'dataloader_type': 'single', 'async_tensor_model_parallel_allreduce': True, 'no_persist_layer_norm': False, 'sequence_parallel': True, 'gradient_accumulation_fusion': True, 'deprecated_use_mcore_models': False, 'use_legacy_models': False, 'manual_gc': False, 'manual_gc_interval': 0, 'manual_gc_eval': True, 'tp_comm_split_ag': True, 'tp_comm_split_rs': True, 'pipeline_model_parallel_comm_backend': None, 'high_priority_stream_groups': [], 'seed': 50, 'data_parallel_random_init': False, 'init_method_std': 0.02, 'init_method_xavier_uniform': False, 'lr': 0.0008, 'lr_decay_style': 'cosine', 'lr_wsd_decay_style': 'exponential', 'lr_decay_iters': None, 'lr_decay_samples': None, 'lr_wsd_decay_samples': None, 
'lr_wsd_decay_iters': None, 'lr_warmup_fraction': None, 'lr_warmup_iters': 100, 'lr_warmup_samples': 0, 'lr_warmup_init': 0.0, 'min_lr': 4e-05, 'override_opt_param_scheduler': False, 'use_checkpoint_opt_param_scheduler': False, 'decoupled_lr': None, 'decoupled_min_lr': None, 'save': 'poziomka_5', 'save_interval': 1600, 'no_save_optim': None, 'no_save_rng': None, 'load': None, 'no_load_optim': None, 'no_load_rng': None, 'non_persistent_save_interval': None, 'non_persistent_ckpt_type': None, 'non_persistent_global_ckpt_dir': None, 'non_persistent_local_ckpt_dir': None, 'non_persistent_local_ckpt_algo': 'fully_parallel', 'finetune': False, 'pretrained_checkpoint': None, 'ckpt_step': None, 'perform_initialization': True, 'use_checkpoint_args': False, 'use_mp_args_from_checkpoint_args': False, 'use_tokenizer_model_from_checkpoint_args': True, 'exit_on_missing_checkpoint': False, 'use_dist_ckpt_deprecated': False, 'use_persistent_ckpt_worker': False, 'auto_detect_ckpt_format': False, 'dist_ckpt_format_deprecated': None, 'ckpt_format': 'torch_dist', 'ckpt_convert_format': None, 'ckpt_convert_save': None, 'ckpt_convert_update_legacy_dist_opt_format': False, 'ckpt_fully_parallel_save_deprecated': False, 'ckpt_fully_parallel_save': True, 'async_save': True, 'ckpt_fully_parallel_load': False, 'ckpt_assume_constant_structure': False, 'dist_ckpt_strictness': 'assume_ok_unexpected', 'load_model_opt_format': False, 'fp16': False, 'bf16': True, 'grad_reduce_in_bf16': False, 'loss_scale': None, 'initial_loss_scale': 4294967296, 'min_loss_scale': 1.0, 'loss_scale_window': 1000, 'hysteresis': 2, 'fp32_residual_connection': False, 'apply_query_key_layer_scaling': False, 'attention_softmax_in_fp32': True, 'accumulate_allreduce_grads_in_fp32': True, 'fp16_lm_cross_entropy': False, 'disable_bf16_reduced_precision_matmul': False, 'reuse_grad_buf_for_mxfp8_param_ag': False, 'tensor_model_parallel_size': 4, 'encoder_tensor_model_parallel_size': 0, 'pipeline_model_parallel_size': 1, 
'encoder_pipeline_model_parallel_size': 0, 'pipeline_model_parallel_split_rank': None, 'decoder_first_pipeline_num_layers': None, 'decoder_last_pipeline_num_layers': None, 'pipeline_model_parallel_layout': None, 'num_layers_per_virtual_pipeline_stage': None, 'num_virtual_stages_per_pipeline_rank': None, 'microbatch_group_size_per_vp_stage': None, 'overlap_p2p_comm': False, 'overlap_p2p_comm_warmup_flush': False, 'distributed_backend': 'nccl', 'distributed_timeout_minutes': 10, 'overlap_grad_reduce': True, 'defer_embedding_wgrad_compute': False, 'wgrad_deferral_limit': 0, 'align_grad_reduce': True, 'ddp_num_buckets': None, 'ddp_bucket_size': None, 'ddp_pad_buckets_for_high_nccl_busbw': False, 'ddp_average_in_collective': False, 'overlap_param_gather': False, 'overlap_param_gather_with_optimizer_step': False, 'align_param_gather': False, 'scatter_gather_tensors_in_pipeline': True, 'use_ring_exchange_p2p': False, 'local_rank': 7, 'lazy_mpu_init': None, 'account_for_embedding_in_pipeline_split': False, 'account_for_loss_in_pipeline_split': False, 'use_distributed_optimizer': False, 'nccl_ub': False, 'use_sharp': False, 'use_custom_fsdp': False, 'init_model_with_meta_device': False, 'data_parallel_sharding_strategy': 'no_shard', 'gradient_reduce_div_fusion': True, 'fsdp_double_buffer': False, 'suggested_communication_unit_size': None, 'keep_fp8_transpose_cache_when_using_custom_fsdp': False, 'num_distributed_optimizer_instances': 1, 'use_torch_fsdp2': False, 'torch_fsdp2_reshard_after_forward': True, 'context_parallel_size': 1, 'cp_comm_type': ['p2p'], 'hierarchical_context_parallel_sizes': None, 'nccl_communicator_config_path': None, 'use_tp_pp_dp_mapping': False, 'replication': False, 'replication_jump': None, 'replication_factor': 2, 'eval_iters': 2, 'eval_interval': 1600, 'test_mode': False, 'skip_train': False, 'data_path': ['szypulka_tokenized_apt4_merged/apt4_merged_text_document'], 'split': '9999,1,0', 'train_data_path': None, 'valid_data_path': None, 
'test_data_path': None, 'data_args_path': None, 'per_split_data_args_path': None, 'data_cache_path': None, 'mmap_bin_files': True, 'mock_data': False, 'seq_length': 8192, 'encoder_seq_length': 8192, 'decoder_seq_length': None, 'retriever_seq_length': 256, 'sample_rate': 1.0, 'mask_prob': 0.15, 'short_seq_prob': 0.1, 'num_workers': 2, 'reset_position_ids': False, 'reset_attention_mask': False, 'eod_mask_loss': True, 'create_attention_mask_in_dataloader': False, 'num_dataset_builder_threads': 1, 'object_storage_cache_path': None, 'mid_level_dataset_surplus': 0.005, 'vocab_size': 32000, 'vocab_file': None, 'merge_file': None, 'vocab_extra_ids': 0, 'tokenizer_type': 'HuggingFaceTokenizer', 'tokenizer_model': '/home/ubuntu/training/Ling-V2/examples/pretrain/../../resource/tokenizer/apt4', 'tiktoken_pattern': None, 'tiktoken_num_special_tokens': 1000, 'tiktoken_special_tokens': None, 'adlr_autoresume': False, 'adlr_autoresume_interval': 1000, 'ict_head_size': None, 'biencoder_projection_dim': 0, 'biencoder_shared_query_context_model': False, 'ict_load': None, 'bert_load': None, 'titles_data_path': None, 'query_in_block_prob': 0.1, 'use_one_sent_docs': False, 'evidence_data_path': None, 'retriever_report_topk_accuracies': [], 'retriever_score_scaling': False, 'block_data_path': None, 'embedding_path': None, 'indexer_batch_size': 128, 'indexer_log_interval': 1000, 'num_classes': 1000, 'img_h': 224, 'img_w': 224, 'num_channels': 3, 'patch_dim': 16, 'classes_fraction': 1.0, 'data_per_class_fraction': 1.0, 'data_sharding': True, 'head_lr_mult': 1.0, 'vision_pretraining': False, 'vision_pretraining_type': 'classify', 'vision_backbone_type': 'vit', 'swin_backbone_type': 'tiny', 'mask_type': 'random', 'mask_factor': 1.0, 'iter_per_epoch': 1250, 'dino_local_img_size': 96, 'dino_local_crops_number': 10, 'dino_head_hidden_size': 2048, 'dino_bottleneck_size': 256, 'dino_freeze_last_layer': 1, 'dino_norm_last_layer': False, 'dino_warmup_teacher_temp': 0.04, 'dino_teacher_temp': 0.07, 
'dino_warmup_teacher_temp_epochs': 30, 'qk_layernorm': True, 'qk_l2_norm': False, 'expert_model_parallel_size': 2, 'expert_tensor_parallel_size': 1, 'num_experts': 128, 'moe_layer_freq': [0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'moe_ffn_hidden_size': 320, 'moe_shared_expert_intermediate_size': 320, 'moe_shared_expert_overlap': True, 'moe_grouped_gemm': True, 'moe_use_legacy_grouped_gemm': False, 'moe_layer_recompute': False, 'moe_extended_tp': False, 'moe_use_upcycling': False, 'moe_router_load_balancing_type': 'aux_loss', 'moe_router_dtype': 'fp32', 'skip_casting_dtype_for_param_pattern': '["^expert_bias$|.+\\.expert_bias$"]', 'moe_router_score_function': 'sigmoid', 'moe_router_topk': 4, 'moe_router_pre_softmax': False, 'moe_router_num_groups': 8, 'moe_router_group_topk': 2, 'moe_router_topk_scaling_factor': 2.5, 'moe_router_enable_expert_bias': True, 'moe_router_bias_update_rate': 0.001, 'moe_router_force_load_balancing': False, 'moe_router_padding_for_fp8': False, 'moe_aux_loss_coeff': 0.0, 'moe_z_loss_coeff': 3.5e-06, 'moe_input_jitter_eps': None, 'moe_per_layer_logging': False, 'moe_token_dispatcher_type': 'alltoall', 'moe_enable_deepep': False, 'moe_deepep_num_sms': 20, 'moe_permute_fusion': False, 'moe_expert_capacity_factor': 1.25, 'moe_pad_expert_input_to_capacity': True, 'moe_token_drop_policy': 'probs', 'moe_apply_probs_on_input': False, 'delay_wgrad_compute': False, 'moe_upcycling_granularity': 1, 'q_lora_rank': None, 'kv_lora_rank': 32, 'qk_head_dim': 128, 'qk_pos_emb_head_dim': 64, 'v_head_dim': 128, 'rotary_scaling_factor': 40.0, 'mscale': 1.0, 'mscale_all_dim': 1.0, 'heterogeneous_layers_config_path': None, 'heterogeneous_layers_config_encoded_json': None, 'log_params_norm': False, 'log_num_zeros_in_grad': False, 'log_throughput': True, 'log_progress': False, 'timing_log_level': 0, 'log_energy': False, 'barrier_with_L1_time': True, 'timing_log_option': 'minmax', 'tensorboard_log_interval': 1, 'tensorboard_queue_size': 1000, 
'log_timers_to_tensorboard': True, 'log_loss_scale_to_tensorboard': True, 'log_validation_ppl_to_tensorboard': True, 'log_memory_to_tensorboard': True, 'log_world_size_to_tensorboard': True, 'wandb_project': 'poziomka', 'wandb_exp_name': 'poziomka_5', 'wandb_save_dir': '', 'logging_level': None, 'log_straggler': False, 'disable_straggler_on_startup': False, 'straggler_ctrlr_port': 65535, 'straggler_minmax_count': 1, 'run_workload_inspector_server': False, 'inference_batch_times_seqlen_threshold': -1, 'max_tokens_to_oom': 12000, 'output_bert_embeddings': False, 'bert_embedder_type': 'megatron', 'flash_decode': False, 'enable_cuda_graph': False, 'cuda_graph_warmup_steps': 3, 'external_cuda_graph': False, 'cuda_graph_scope': 'full', 'inference_max_batch_size': 8, 'inference_max_seq_length': 2560, 'inference_dynamic_batching': False, 'inference_dynamic_batching_buffer_size_gb': 40.0, 'inference_dynamic_batching_chunk_size': 256, 'inference_dynamic_batching_buffer_guaranteed_fraction': 0.2, 'inference_dynamic_batching_buffer_overflow_factor': None, 'inference_dynamic_batching_max_requests_override': None, 'inference_dynamic_batching_max_tokens_override': None, 'symmetric_ar_type': None, 'nccl_all_reduce_for_prefill': False, 'mlp_chunks_for_prefill': 1, 'fp8': None, 'fp8_recipe': 'delayed', 'fp8_margin': 0, 'fp8_interval': 1, 'fp8_amax_history_len': 1, 'fp8_amax_compute_algo': 'most_recent', 'fp8_wgrad': True, 'transformer_impl': 'transformer_engine', 'fp8_param_gather': False, 'first_last_layers_bf16': False, 'num_layers_at_start_in_bf16': 1, 'num_layers_at_end_in_bf16': 1, 'te_rng_tracker': False, 'inference_rng_tracker': False, 'retro_project_dir': None, 'retro_add_retriever': False, 'retro_cyclic_train_iters': None, 'retro_encoder_layers': 2, 'retro_encoder_hidden_dropout': 0.1, 'retro_encoder_attention_dropout': 0.1, 'retro_num_neighbors': 2, 'retro_num_retrieved_chunks': 2, 'retro_attention_gate': 1, 'retro_verify_neighbor_count': True, 'enable_experimental': 
False, 'spec': None, 'hybrid_attention_ratio': 0.0, 'hybrid_mlp_ratio': 0.0, 'hybrid_override_pattern': None, 'mamba_state_dim': 128, 'mamba_head_dim': 64, 'mamba_num_groups': 8, 'mamba_num_heads': None, 'is_hybrid_model': False, 'disable_mamba_mem_eff_path': False, 'yaml_cfg': None, 'use_precision_aware_optimizer': False, 'main_grads_dtype': torch.float32, 'main_params_dtype': torch.float32, 'exp_avg_dtype': torch.float32, 'exp_avg_sq_dtype': torch.float32, 'enable_one_logger': True, 'one_logger_project': 'megatron-lm', 'one_logger_run_name': None, 'one_logger_async': False, 'app_tag_run_name': None, 'app_tag_run_version': '0.0.0', 'inprocess_restart': False, 'inprocess_max_iterations': None, 'inprocess_monitor_thread_interval': 1.0, 'inprocess_monitor_process_interval': 1.0, 'inprocess_progress_watchdog_interval': 1.0, 'inprocess_heartbeat_interval': 30, 'inprocess_soft_timeout': 60, 'inprocess_hard_timeout': 90, 'inprocess_heartbeat_timeout': 60, 'inprocess_barrier_timeout': 120, 'inprocess_completion_timeout': 120, 'inprocess_last_call_wait': 1, 'inprocess_termination_grace_time': 1, 'inprocess_granularity': 'node', 'inprocess_active_world_size': 8, 'inprocess_empty_cuda_cache': False, 'enable_ft_package': False, 'calc_ft_timeouts': False, 'config_logger_dir': '', 'error_injection_rate': 0, 'error_injection_type': 'transient_error', 'rerun_mode': 'disabled', 'enable_msc': True, 'kitchen_config_file': None, 'kitchen_recipe_number': None, 'sft': False, 'sft_tokenizer_prompt_format': 'nemotron-h-aligned', 'rank': 7, 'world_size': 8, 'use_dist_ckpt': True, 'transformer_pipeline_model_parallel_size': 1, 'data_parallel_size': 2, 'virtual_pipeline_model_parallel_size': None, 'params_dtype': torch.bfloat16, 'consumed_train_samples': 0, 'skipped_train_samples': 0, 'consumed_valid_samples': 0, 'variable_seq_lengths': False, 'padded_vocab_size': 32256, '_wandb': {}}
11
+ 2025-09-22 22:04:05,929 INFO MainThread:40865 [wandb_init.py:init():861] starting backend
12
+ 2025-09-22 22:04:06,134 INFO MainThread:40865 [wandb_init.py:init():864] sending inform_init request
13
+ 2025-09-22 22:04:06,137 INFO MainThread:40865 [wandb_init.py:init():872] backend started and connected
14
+ 2025-09-22 22:04:06,140 INFO MainThread:40865 [wandb_init.py:init():942] updated telemetry
15
+ 2025-09-22 22:04:06,144 INFO MainThread:40865 [wandb_init.py:init():966] communicating run to backend with 90.0 second timeout
16
+ 2025-09-22 22:04:06,744 INFO MainThread:40865 [wandb_init.py:init():1017] starting run threads in backend
17
+ 2025-09-22 22:04:06,836 INFO MainThread:40865 [wandb_run.py:_console_start():2506] atexit reg
18
+ 2025-09-22 22:04:06,836 INFO MainThread:40865 [wandb_run.py:_redirect():2354] redirect: wrap_raw
19
+ 2025-09-22 22:04:06,836 INFO MainThread:40865 [wandb_run.py:_redirect():2423] Wrapping output streams.
20
+ 2025-09-22 22:04:06,836 INFO MainThread:40865 [wandb_run.py:_redirect():2446] Redirects installed.
21
+ 2025-09-22 22:04:06,838 INFO MainThread:40865 [wandb_init.py:init():1057] run started, returning control to user process
22
+ 2025-09-26 12:20:37,273 INFO wandb-AsyncioManager-main:40865 [service_client.py:_forward_responses():84] Reached EOF.
23
+ 2025-09-26 12:20:37,275 INFO wandb-AsyncioManager-main:40865 [mailbox.py:close():137] Closing mailbox, abandoning 1 handles.
wandb/wandb/run-20250922_220405-hrldy3bw/run-hrldy3bw.wandb ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b28ec29fb9c8346c89c54dad96553569524449851aa8e3717d5c6f3b593f0eeb
3
+ size 112983277