Add 1b/rmnp_lr1e_2_b1_0_95_b2_0_95_eps_1e_8_adamlr1e_2_ab1_0_9_ab2_0_99_H200_ppl13_8690_20260508_004255
Browse files- .gitattributes +1 -0
- 1b/rmnp_lr1e_2_b1_0_95_b2_0_95_eps_1e_8_adamlr1e_2_ab1_0_9_ab2_0_99_H200_ppl13_8690_20260508_004255/model_100000/config.json +31 -0
- 1b/rmnp_lr1e_2_b1_0_95_b2_0_95_eps_1e_8_adamlr1e_2_ab1_0_9_ab2_0_99_H200_ppl13_8690_20260508_004255/model_100000/model.safetensors +3 -0
- 1b/rmnp_lr1e_2_b1_0_95_b2_0_95_eps_1e_8_adamlr1e_2_ab1_0_9_ab2_0_99_H200_ppl13_8690_20260508_004255/model_100000/optimizer.pt +3 -0
- 1b/rmnp_lr1e_2_b1_0_95_b2_0_95_eps_1e_8_adamlr1e_2_ab1_0_9_ab2_0_99_H200_ppl13_8690_20260508_004255/model_100000/pytorch_model.bin +3 -0
- 1b/rmnp_lr1e_2_b1_0_95_b2_0_95_eps_1e_8_adamlr1e_2_ab1_0_9_ab2_0_99_H200_ppl13_8690_20260508_004255/model_100000/training_state.json +7 -0
- 1b/rmnp_lr1e_2_b1_0_95_b2_0_95_eps_1e_8_adamlr1e_2_ab1_0_9_ab2_0_99_H200_ppl13_8690_20260508_004255/rmnp_lr1e_2_b1_0_95_b2_0_95_eps_1e_8_adamlr1e_2_ab1_0_9_ab2_0_99_H200_ppl13_8690_20260508_004255.txt +0 -0
- 1b/rmnp_lr1e_2_b1_0_95_b2_0_95_eps_1e_8_adamlr1e_2_ab1_0_9_ab2_0_99_H200_ppl13_8690_20260508_004255/wandb.json +3 -0
- 1b/rmnp_lr1e_2_b1_0_95_b2_0_95_eps_1e_8_adamlr1e_2_ab1_0_9_ab2_0_99_H200_ppl13_8690_20260508_004255/wandb/offline-run-20260508_004305-8ks57nb9/files/requirements.txt +142 -0
- 1b/rmnp_lr1e_2_b1_0_95_b2_0_95_eps_1e_8_adamlr1e_2_ab1_0_9_ab2_0_99_H200_ppl13_8690_20260508_004255/wandb/offline-run-20260508_004305-8ks57nb9/logs/debug-internal.log +15 -0
- 1b/rmnp_lr1e_2_b1_0_95_b2_0_95_eps_1e_8_adamlr1e_2_ab1_0_9_ab2_0_99_H200_ppl13_8690_20260508_004255/wandb/offline-run-20260508_004305-8ks57nb9/logs/debug.log +22 -0
- 1b/rmnp_lr1e_2_b1_0_95_b2_0_95_eps_1e_8_adamlr1e_2_ab1_0_9_ab2_0_99_H200_ppl13_8690_20260508_004255/wandb/offline-run-20260508_004305-8ks57nb9/run-8ks57nb9.wandb +3 -0
.gitattributes
CHANGED
|
@@ -125,3 +125,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 125 |
1b/shampoo_lr1e_2_b1_0_9_b2_0_999_eps_1e_8_A100_ppl_14_2854_20260421_102432/wandb/offline-run-20260421_102458-capz4d8i/run-capz4d8i.wandb filter=lfs diff=lfs merge=lfs -text
|
| 126 |
1b/mars_lion_lr2e_4_b1_0_9_b2_0_98_eps_1e_8_A100_ppl_15_7263_20260420_132323/wandb/offline-run-20260420_132356-glmzsqdq/run-glmzsqdq.wandb filter=lfs diff=lfs merge=lfs -text
|
| 127 |
1b/muon_lr6e_3_b1_0_9_b2_0_95_eps_1e_8_A100_ppl_13_7270_20260420_132324/wandb/offline-run-20260420_132357-0qywxs3n/run-0qywxs3n.wandb filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 125 |
1b/shampoo_lr1e_2_b1_0_9_b2_0_999_eps_1e_8_A100_ppl_14_2854_20260421_102432/wandb/offline-run-20260421_102458-capz4d8i/run-capz4d8i.wandb filter=lfs diff=lfs merge=lfs -text
|
| 126 |
1b/mars_lion_lr2e_4_b1_0_9_b2_0_98_eps_1e_8_A100_ppl_15_7263_20260420_132323/wandb/offline-run-20260420_132356-glmzsqdq/run-glmzsqdq.wandb filter=lfs diff=lfs merge=lfs -text
|
| 127 |
1b/muon_lr6e_3_b1_0_9_b2_0_95_eps_1e_8_A100_ppl_13_7270_20260420_132324/wandb/offline-run-20260420_132357-0qywxs3n/run-0qywxs3n.wandb filter=lfs diff=lfs merge=lfs -text
|
| 128 |
+
1b/rmnp_lr1e_2_b1_0_95_b2_0_95_eps_1e_8_adamlr1e_2_ab1_0_9_ab2_0_99_H200_ppl13_8690_20260508_004255/wandb/offline-run-20260508_004305-8ks57nb9/run-8ks57nb9.wandb filter=lfs diff=lfs merge=lfs -text
|
1b/rmnp_lr1e_2_b1_0_95_b2_0_95_eps_1e_8_adamlr1e_2_ab1_0_9_ab2_0_99_H200_ppl13_8690_20260508_004255/model_100000/config.json
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"architectures": [
|
| 3 |
+
"LlamaForCausalLM"
|
| 4 |
+
],
|
| 5 |
+
"attention_bias": false,
|
| 6 |
+
"attention_dropout": 0.0,
|
| 7 |
+
"bos_token_id": 0,
|
| 8 |
+
"dtype": "bfloat16",
|
| 9 |
+
"eos_token_id": 1,
|
| 10 |
+
"head_dim": 64,
|
| 11 |
+
"hidden_act": "silu",
|
| 12 |
+
"hidden_size": 2048,
|
| 13 |
+
"initializer_range": 0.02,
|
| 14 |
+
"intermediate_size": 5461,
|
| 15 |
+
"max_position_embeddings": 2048,
|
| 16 |
+
"max_sequence_length": 1024,
|
| 17 |
+
"mlp_bias": false,
|
| 18 |
+
"model_type": "llama",
|
| 19 |
+
"num_attention_heads": 32,
|
| 20 |
+
"num_hidden_layers": 24,
|
| 21 |
+
"num_key_value_heads": 32,
|
| 22 |
+
"pad_token_id": -1,
|
| 23 |
+
"pretraining_tp": 1,
|
| 24 |
+
"rms_norm_eps": 1e-06,
|
| 25 |
+
"rope_scaling": null,
|
| 26 |
+
"rope_theta": 10000.0,
|
| 27 |
+
"tie_word_embeddings": false,
|
| 28 |
+
"transformers_version": "4.57.3",
|
| 29 |
+
"use_cache": true,
|
| 30 |
+
"vocab_size": 32000
|
| 31 |
+
}
|
1b/rmnp_lr1e_2_b1_0_95_b2_0_95_eps_1e_8_adamlr1e_2_ab1_0_9_ab2_0_99_H200_ppl13_8690_20260508_004255/model_100000/model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:40d2e333531014a8c861a0868e5462d78498b534c2cfc95215ca7a5a06e49d27
|
| 3 |
+
size 2678195080
|
1b/rmnp_lr1e_2_b1_0_95_b2_0_95_eps_1e_8_adamlr1e_2_ab1_0_9_ab2_0_99_H200_ppl13_8690_20260508_004255/model_100000/optimizer.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6225d3283ce14889d040bc0fd98582b1202bb0fc2612f8b1a75d6026b1beec62
|
| 3 |
+
size 2678453323
|
1b/rmnp_lr1e_2_b1_0_95_b2_0_95_eps_1e_8_adamlr1e_2_ab1_0_9_ab2_0_99_H200_ppl13_8690_20260508_004255/model_100000/pytorch_model.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:731db04fa76766ce4be0757cd8fc8419932599afbe63d7971fc823629e086339
|
| 3 |
+
size 2678267439
|
1b/rmnp_lr1e_2_b1_0_95_b2_0_95_eps_1e_8_adamlr1e_2_ab1_0_9_ab2_0_99_H200_ppl13_8690_20260508_004255/model_100000/training_state.json
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"global_step": 200000,
|
| 3 |
+
"update_step": 100000,
|
| 4 |
+
"tokens_seen": 9998828000,
|
| 5 |
+
"tokens_seen_before": 9998725848,
|
| 6 |
+
"update_time": 1.1641998291015625
|
| 7 |
+
}
|
1b/rmnp_lr1e_2_b1_0_95_b2_0_95_eps_1e_8_adamlr1e_2_ab1_0_9_ab2_0_99_H200_ppl13_8690_20260508_004255/rmnp_lr1e_2_b1_0_95_b2_0_95_eps_1e_8_adamlr1e_2_ab1_0_9_ab2_0_99_H200_ppl13_8690_20260508_004255.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
1b/rmnp_lr1e_2_b1_0_95_b2_0_95_eps_1e_8_adamlr1e_2_ab1_0_9_ab2_0_99_H200_ppl13_8690_20260508_004255/wandb.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"wandb_id": "8ks57nb9"
|
| 3 |
+
}
|
1b/rmnp_lr1e_2_b1_0_95_b2_0_95_eps_1e_8_adamlr1e_2_ab1_0_9_ab2_0_99_H200_ppl13_8690_20260508_004255/wandb/offline-run-20260508_004305-8ks57nb9/files/requirements.txt
ADDED
|
@@ -0,0 +1,142 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
sac==0.1.0
|
| 2 |
+
packaging==26.0
|
| 3 |
+
setuptools==82.0.1
|
| 4 |
+
wheel==0.46.3
|
| 5 |
+
pip==26.0.1
|
| 6 |
+
torchaudio==2.11.0
|
| 7 |
+
nvidia-cusparselt-cu12==0.7.1
|
| 8 |
+
mpmath==1.3.0
|
| 9 |
+
typing_extensions==4.15.0
|
| 10 |
+
triton==3.4.0
|
| 11 |
+
sympy==1.14.0
|
| 12 |
+
pillow==12.2.0
|
| 13 |
+
nvidia-nvtx-cu12==12.8.90
|
| 14 |
+
nvidia-nvjitlink-cu12==12.8.93
|
| 15 |
+
nvidia-nccl-cu12==2.27.3
|
| 16 |
+
nvidia-curand-cu12==10.3.9.90
|
| 17 |
+
nvidia-cufile-cu12==1.13.1.3
|
| 18 |
+
nvidia-cuda-runtime-cu12==12.8.90
|
| 19 |
+
nvidia-cuda-nvrtc-cu12==12.8.93
|
| 20 |
+
nvidia-cuda-cupti-cu12==12.8.90
|
| 21 |
+
nvidia-cublas-cu12==12.8.4.1
|
| 22 |
+
numpy==2.2.6
|
| 23 |
+
networkx==3.4.2
|
| 24 |
+
MarkupSafe==3.0.3
|
| 25 |
+
aiohappyeyeballs==2.6.1
|
| 26 |
+
filelock==3.28.0
|
| 27 |
+
nvidia-cusparse-cu12==12.5.8.93
|
| 28 |
+
nvidia-cufft-cu12==11.3.3.83
|
| 29 |
+
nvidia-cudnn-cu12==9.10.2.21
|
| 30 |
+
Jinja2==3.1.6
|
| 31 |
+
nvidia-cusolver-cu12==11.7.3.90
|
| 32 |
+
torch==2.8.0+cu128
|
| 33 |
+
torchvision==0.23.0+cu128
|
| 34 |
+
pytz==2026.1.post1
|
| 35 |
+
xxhash==3.6.0
|
| 36 |
+
urllib3==2.6.3
|
| 37 |
+
tzdata==2026.1
|
| 38 |
+
tqdm==4.67.3
|
| 39 |
+
six==1.17.0
|
| 40 |
+
safetensors==0.7.0
|
| 41 |
+
regex==2026.4.4
|
| 42 |
+
PyYAML==6.0.3
|
| 43 |
+
pyarrow==23.0.1
|
| 44 |
+
psutil==7.2.2
|
| 45 |
+
propcache==0.4.1
|
| 46 |
+
multidict==6.7.1
|
| 47 |
+
idna==3.11
|
| 48 |
+
hf-xet==1.4.3
|
| 49 |
+
h11==0.16.0
|
| 50 |
+
fsspec==2026.2.0
|
| 51 |
+
frozenlist==1.8.0
|
| 52 |
+
exceptiongroup==1.3.1
|
| 53 |
+
dill==0.4.1
|
| 54 |
+
charset-normalizer==3.4.7
|
| 55 |
+
certifi==2026.2.25
|
| 56 |
+
attrs==26.1.0
|
| 57 |
+
async-timeout==5.0.1
|
| 58 |
+
yarl==1.23.0
|
| 59 |
+
requests==2.33.1
|
| 60 |
+
python-dateutil==2.9.0.post0
|
| 61 |
+
multiprocess==0.70.19
|
| 62 |
+
httpcore==1.0.9
|
| 63 |
+
anyio==4.13.0
|
| 64 |
+
aiosignal==1.4.0
|
| 65 |
+
pandas==2.3.3
|
| 66 |
+
huggingface_hub==0.36.2
|
| 67 |
+
httpx==0.28.1
|
| 68 |
+
aiohttp==3.13.5
|
| 69 |
+
tokenizers==0.22.2
|
| 70 |
+
accelerate==1.13.0
|
| 71 |
+
transformers==4.57.3
|
| 72 |
+
datasets==4.8.4
|
| 73 |
+
peft==0.19.1
|
| 74 |
+
pytorch-ranger==0.1.1
|
| 75 |
+
lion-pytorch==0.2.4
|
| 76 |
+
bitsandbytes==0.49.2
|
| 77 |
+
torch-optimizer==0.3.0
|
| 78 |
+
apollo-torch==1.0.3
|
| 79 |
+
nvidia-ml-py==13.590.48
|
| 80 |
+
typing-inspection==0.4.2
|
| 81 |
+
threadpoolctl==3.6.0
|
| 82 |
+
smmap==5.0.3
|
| 83 |
+
sentry-sdk==2.58.0
|
| 84 |
+
scipy==1.15.3
|
| 85 |
+
pyparsing==3.3.2
|
| 86 |
+
pydantic_core==2.46.3
|
| 87 |
+
protobuf==7.34.1
|
| 88 |
+
platformdirs==4.9.6
|
| 89 |
+
nvitop==1.6.2
|
| 90 |
+
loguru==0.7.3
|
| 91 |
+
kiwisolver==1.5.0
|
| 92 |
+
joblib==1.5.3
|
| 93 |
+
fonttools==4.62.1
|
| 94 |
+
cycler==0.12.1
|
| 95 |
+
contourpy==1.3.2
|
| 96 |
+
click==8.3.2
|
| 97 |
+
annotated-types==0.7.0
|
| 98 |
+
scikit-learn==1.7.2
|
| 99 |
+
pydantic==2.13.3
|
| 100 |
+
modelscope==1.35.4
|
| 101 |
+
matplotlib==3.10.8
|
| 102 |
+
gitdb==4.0.12
|
| 103 |
+
seaborn==0.13.2
|
| 104 |
+
GitPython==3.1.47
|
| 105 |
+
wandb==0.26.0
|
| 106 |
+
sac==0.1.0
|
| 107 |
+
nvidia-ml-py3==7.352.0
|
| 108 |
+
gitignore_parser==0.1.13
|
| 109 |
+
durationpy==0.10
|
| 110 |
+
dotmap==1.3.30
|
| 111 |
+
wrapt==2.1.2
|
| 112 |
+
websocket-client==1.9.0
|
| 113 |
+
typeguard==4.5.1
|
| 114 |
+
tabulate==0.9.0
|
| 115 |
+
pycparser==3.0
|
| 116 |
+
pyasn1==0.6.3
|
| 117 |
+
py==1.11.0
|
| 118 |
+
oauthlib==3.3.1
|
| 119 |
+
jmespath==1.1.0
|
| 120 |
+
invoke==3.0.3
|
| 121 |
+
elasticsearch==7.17.13
|
| 122 |
+
docutils==0.19
|
| 123 |
+
decorator==5.2.1
|
| 124 |
+
confluent-kafka==2.14.0
|
| 125 |
+
colorama==0.4.6
|
| 126 |
+
bcrypt==5.0.0
|
| 127 |
+
rsa==4.7.2
|
| 128 |
+
retry==0.9.2
|
| 129 |
+
requests-oauthlib==2.0.0
|
| 130 |
+
Deprecated==1.3.1
|
| 131 |
+
cffi==2.0.0
|
| 132 |
+
botocore==1.42.92
|
| 133 |
+
s3transfer==0.16.0
|
| 134 |
+
PyNaCl==1.6.2
|
| 135 |
+
kubernetes==35.0.0
|
| 136 |
+
cryptography==46.0.7
|
| 137 |
+
paramiko==4.0.0
|
| 138 |
+
boto3==1.42.92
|
| 139 |
+
awscli==1.44.82
|
| 140 |
+
megfile==2.2.10.post1
|
| 141 |
+
refile==7.2.7.post3
|
| 142 |
+
brainpp==2.7.12.16
|
1b/rmnp_lr1e_2_b1_0_95_b2_0_95_eps_1e_8_adamlr1e_2_ab1_0_9_ab2_0_99_H200_ppl13_8690_20260508_004255/wandb/offline-run-20260508_004305-8ks57nb9/logs/debug-internal.log
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"time":"2026-05-08T00:43:06.142473188+08:00","level":"INFO","msg":"wandb-core"}
|
| 2 |
+
{"time":"2026-05-08T00:43:06.142771547+08:00","level":"INFO","msg":"stream: starting","core version":"0.26.0"}
|
| 3 |
+
{"time":"2026-05-08T00:43:06.48373365+08:00","level":"WARN","msg":"featurechecker: GraphQL client is nil, skipping feature loading"}
|
| 4 |
+
{"time":"2026-05-08T00:43:06.483785234+08:00","level":"WARN","msg":"featurechecker: GraphQL client is nil, skipping feature loading"}
|
| 5 |
+
{"time":"2026-05-08T00:43:06.4838066+08:00","level":"INFO","msg":"stream: created new stream","id":"8ks57nb9"}
|
| 6 |
+
{"time":"2026-05-08T00:43:06.483964438+08:00","level":"INFO","msg":"handler: started"}
|
| 7 |
+
{"time":"2026-05-08T00:43:06.485107308+08:00","level":"INFO","msg":"stream: started"}
|
| 8 |
+
{"time":"2026-05-08T00:43:06.485201217+08:00","level":"INFO","msg":"writer: started","stream_id":"8ks57nb9"}
|
| 9 |
+
{"time":"2026-05-08T00:43:06.485269288+08:00","level":"INFO","msg":"sender: started"}
|
| 10 |
+
{"time":"2026-05-08T00:43:06.486938544+08:00","level":"WARN","msg":"featurechecker: GraphQL client is nil, skipping feature loading"}
|
| 11 |
+
{"time":"2026-05-08T00:43:06.486955503+08:00","level":"WARN","msg":"runupserter: server does not expand metric globs but the x_server_side_expand_glob_metrics setting is set; ignoring"}
|
| 12 |
+
{"time":"2026-05-08T13:02:42.775825964+08:00","level":"INFO","msg":"stream: closing"}
|
| 13 |
+
{"time":"2026-05-08T13:02:42.801758293+08:00","level":"INFO","msg":"handler: closed"}
|
| 14 |
+
{"time":"2026-05-08T13:02:42.803679703+08:00","level":"INFO","msg":"sender: closed"}
|
| 15 |
+
{"time":"2026-05-08T13:02:42.803699519+08:00","level":"INFO","msg":"stream: closed"}
|
1b/rmnp_lr1e_2_b1_0_95_b2_0_95_eps_1e_8_adamlr1e_2_ab1_0_9_ab2_0_99_H200_ppl13_8690_20260508_004255/wandb/offline-run-20260508_004305-8ks57nb9/logs/debug.log
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
2026-05-08 00:43:05,740 INFO MainThread:476 [wandb_setup.py:_flush():81] Current SDK version is 0.26.0
|
| 2 |
+
2026-05-08 00:43:05,740 INFO MainThread:476 [wandb_setup.py:_flush():81] Configure stats pid to 476
|
| 3 |
+
2026-05-08 00:43:05,740 INFO MainThread:476 [wandb_setup.py:_flush():81] Loading settings from environment variables
|
| 4 |
+
2026-05-08 00:43:05,740 INFO MainThread:476 [wandb_init.py:setup_run_log_directory():721] Logging user logs to exp_remain_h200/work_dirs/1b/train_1b_rmnp_lr1e_2_b1_0_95_b2_0_95_adamlr1e_2_ab1_0_9_ab2_0_99_eps_1e_8_20260508_004255/wandb/offline-run-20260508_004305-8ks57nb9/logs/debug.log
|
| 5 |
+
2026-05-08 00:43:05,741 INFO MainThread:476 [wandb_init.py:setup_run_log_directory():722] Logging internal logs to exp_remain_h200/work_dirs/1b/train_1b_rmnp_lr1e_2_b1_0_95_b2_0_95_adamlr1e_2_ab1_0_9_ab2_0_99_eps_1e_8_20260508_004255/wandb/offline-run-20260508_004305-8ks57nb9/logs/debug-internal.log
|
| 6 |
+
2026-05-08 00:43:05,741 INFO MainThread:476 [wandb_init.py:init():848] calling init triggers
|
| 7 |
+
2026-05-08 00:43:05,741 INFO MainThread:476 [wandb_init.py:init():853] wandb.init called with sweep_config: {}
|
| 8 |
+
config: {'_wandb': {}}
|
| 9 |
+
2026-05-08 00:43:05,741 INFO MainThread:476 [wandb_init.py:init():896] starting backend
|
| 10 |
+
2026-05-08 00:43:06,137 INFO MainThread:476 [wandb_init.py:init():911] sending inform_init request
|
| 11 |
+
2026-05-08 00:43:06,141 INFO MainThread:476 [wandb_init.py:init():919] backend started and connected
|
| 12 |
+
2026-05-08 00:43:06,141 INFO MainThread:476 [wandb_init.py:init():989] updated telemetry
|
| 13 |
+
2026-05-08 00:43:06,157 INFO MainThread:476 [wandb_init.py:init():1013] communicating run to backend with 90.0 second timeout
|
| 14 |
+
2026-05-08 00:43:06,488 INFO MainThread:476 [wandb_init.py:init():1058] starting run threads in backend
|
| 15 |
+
2026-05-08 00:43:06,561 INFO MainThread:476 [wandb_run.py:_console_start():2542] atexit reg
|
| 16 |
+
2026-05-08 00:43:06,561 INFO MainThread:476 [wandb_run.py:_redirect():2391] redirect: wrap_raw
|
| 17 |
+
2026-05-08 00:43:06,561 INFO MainThread:476 [wandb_run.py:_redirect():2460] Wrapping output streams.
|
| 18 |
+
2026-05-08 00:43:06,561 INFO MainThread:476 [wandb_run.py:_redirect():2483] Redirects installed.
|
| 19 |
+
2026-05-08 00:43:06,563 INFO MainThread:476 [wandb_init.py:init():1098] run started, returning control to user process
|
| 20 |
+
2026-05-08 00:43:42,091 INFO MainThread:476 [wandb_run.py:_config_callback():1403] config_cb None None {'model_config': 'configs/llama_1b.json', 'exp_config': 'exp_v2/configs/llama_1b.json', 'eval_every': 1000, 'save_every': 100000, 'dtype': 'bfloat16', 'seed': 0, 'compile': True, 'dynamo_suppress_errors': True, 'dynamo_cache_limit': 10000, 'memory_cleanup_frequency': 10000, 'resume_step': None, 'restore_optimizer': False, 'continue_from': None, 'single_gpu': False, 'save_dir': 'exp_remain_h200/work_dirs/1b/train_1b_rmnp_lr1e_2_b1_0_95_b2_0_95_adamlr1e_2_ab1_0_9_ab2_0_99_eps_1e_8_20260508_004255', 'use_hf_model': False, 'workers': 12, 'batch_size': 32, 'gradient_accumulation': 2, 'total_batch_size': 512, 'adam_lr': 0.01, 'adam_beta1': 0.9, 'adam_beta2': 0.99, 'warmup_steps': 10000, 'num_training_steps': 100000, 'max_train_tokens': None, 'optimizer': 'rmnp', 'max_length': 256, 'scheduler': 'cosine', 'min_lr_ratio': 0.1, 'weight_decay': 0.0, 'grad_clipping': 0.0, 'activation_checkpointing': False, 'data_path': '/mnt/shared-storage-gpfs2/finebio-shared/optimizer/dataset/C4/en', 'data_name': 'en', 'tags': None, 'name': 'test', 'project': 'test', 'unset_wandb': False, 'entity': None, 'wandb_dir': 'exp_remain_h200/work_dirs/1b/train_1b_rmnp_lr1e_2_b1_0_95_b2_0_95_adamlr1e_2_ab1_0_9_ab2_0_99_eps_1e_8_20260508_004255', 'beta1': 0.95, 'beta2': 0.95, 'beta3': 0.99, 'eps': 1e-08, 'rank': 128, 'update_proj_gap': 50, 'galore_scale': 1.0, 'proj_type': 'std', 'proj_quant': False, 'proj_bits': 8, 'proj_group_size': 256, 'weight_quant': False, 'weight_bits': 8, 'weight_group_size': 256, 'stochastic_round': False, 'simulation': False, 'cos_threshold': 1, 'gamma_proj': 2, 'queue_size': 5, 'proj': 'random', 'scale_type': 'tensor', 'apollo_scale': 1.0, 'scale_front': False, 'n_clusters': 3, 'scale_update_freq': 500, 'scale_level': '1,0,1,1', 'scale_bound': None, 'metric': 'mean', 'align_grad': False, 'dim': 4096, 'n_heads': 32, 'muon_ns_steps': 5, 'muon_momentum': 0.95, 'nproc_per_node': 8, 'max_lr': 0.01, 'total_params_M': 1339.082752, 'dataset': 'c4', 'model': {'vocab_size': 32000, 'max_position_embeddings': 2048, 'hidden_size': 2048, 'intermediate_size': 5461, 'num_hidden_layers': 24, 'num_attention_heads': 32, 'num_key_value_heads': 32, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-06, 'pretraining_tp': 1, 'use_cache': True, 'rope_theta': 10000.0, 'rope_scaling': None, 'attention_bias': False, 'attention_dropout': 0.0, 'mlp_bias': False, 'head_dim': 64, 'return_dict': True, 'output_hidden_states': False, 'torchscript': False, 'dtype': None, 'pruned_heads': {}, 'tie_word_embeddings': False, 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'architectures': ['LLaMAForCausalLM'], 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'task_specific_params': None, 'problem_type': None, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': 0, 'pad_token_id': -1, 'eos_token_id': 1, 'sep_token_id': None, 'decoder_start_token_id': None, 'max_length': 20, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 'num_beams': 1, 'temperature': 1.0, 'top_k': 50, 'top_p': 1.0, 'typical_p': 1.0, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 'encoder_no_repeat_ngram_size': 0, 'bad_words_ids': None, 'num_return_sequences': 1, 'output_scores': False, 'return_dict_in_generate': False, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'suppress_tokens': None, 'begin_suppress_tokens': None, 'num_beam_groups': 1, 'diversity_penalty': 0.0, '_name_or_path': 'configs/llama_1b.json', 'transformers_version': '4.57.3', 'max_sequence_length': 1024, 'model_type': 'llama', 'tf_legacy_loss': False, 'use_bfloat16': False, 'output_attentions': False}, 'world_size': 8, 'device': 'cuda:0'}
|
| 21 |
+
2026-05-08 13:02:42,775 INFO wandb-AsyncioManager-main:476 [service_client.py:_forward_responses():134] Reached EOF.
|
| 22 |
+
2026-05-08 13:02:42,790 INFO wandb-AsyncioManager-main:476 [mailbox.py:close():155] Closing mailbox, abandoning 0 handles.
|
1b/rmnp_lr1e_2_b1_0_95_b2_0_95_eps_1e_8_adamlr1e_2_ab1_0_9_ab2_0_99_H200_ppl13_8690_20260508_004255/wandb/offline-run-20260508_004305-8ks57nb9/run-8ks57nb9.wandb
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c5d7c73103b8c7149372afc5744eb6ccaee46c7b072b52c6dcd20207360b3e00
|
| 3 |
+
size 115043530
|