Sssplendid committed on
Commit
92db1a0
·
verified ·
1 Parent(s): 190f7c3

Add 1b/rmnp_lr1e_2_b1_0_95_b2_0_95_eps_1e_8_adamlr1e_2_ab1_0_9_ab2_0_99_H200_ppl13_8690_20260508_004255

Browse files
Files changed (12) hide show
  1. .gitattributes +1 -0
  2. 1b/rmnp_lr1e_2_b1_0_95_b2_0_95_eps_1e_8_adamlr1e_2_ab1_0_9_ab2_0_99_H200_ppl13_8690_20260508_004255/model_100000/config.json +31 -0
  3. 1b/rmnp_lr1e_2_b1_0_95_b2_0_95_eps_1e_8_adamlr1e_2_ab1_0_9_ab2_0_99_H200_ppl13_8690_20260508_004255/model_100000/model.safetensors +3 -0
  4. 1b/rmnp_lr1e_2_b1_0_95_b2_0_95_eps_1e_8_adamlr1e_2_ab1_0_9_ab2_0_99_H200_ppl13_8690_20260508_004255/model_100000/optimizer.pt +3 -0
  5. 1b/rmnp_lr1e_2_b1_0_95_b2_0_95_eps_1e_8_adamlr1e_2_ab1_0_9_ab2_0_99_H200_ppl13_8690_20260508_004255/model_100000/pytorch_model.bin +3 -0
  6. 1b/rmnp_lr1e_2_b1_0_95_b2_0_95_eps_1e_8_adamlr1e_2_ab1_0_9_ab2_0_99_H200_ppl13_8690_20260508_004255/model_100000/training_state.json +7 -0
  7. 1b/rmnp_lr1e_2_b1_0_95_b2_0_95_eps_1e_8_adamlr1e_2_ab1_0_9_ab2_0_99_H200_ppl13_8690_20260508_004255/rmnp_lr1e_2_b1_0_95_b2_0_95_eps_1e_8_adamlr1e_2_ab1_0_9_ab2_0_99_H200_ppl13_8690_20260508_004255.txt +0 -0
  8. 1b/rmnp_lr1e_2_b1_0_95_b2_0_95_eps_1e_8_adamlr1e_2_ab1_0_9_ab2_0_99_H200_ppl13_8690_20260508_004255/wandb.json +3 -0
  9. 1b/rmnp_lr1e_2_b1_0_95_b2_0_95_eps_1e_8_adamlr1e_2_ab1_0_9_ab2_0_99_H200_ppl13_8690_20260508_004255/wandb/offline-run-20260508_004305-8ks57nb9/files/requirements.txt +142 -0
  10. 1b/rmnp_lr1e_2_b1_0_95_b2_0_95_eps_1e_8_adamlr1e_2_ab1_0_9_ab2_0_99_H200_ppl13_8690_20260508_004255/wandb/offline-run-20260508_004305-8ks57nb9/logs/debug-internal.log +15 -0
  11. 1b/rmnp_lr1e_2_b1_0_95_b2_0_95_eps_1e_8_adamlr1e_2_ab1_0_9_ab2_0_99_H200_ppl13_8690_20260508_004255/wandb/offline-run-20260508_004305-8ks57nb9/logs/debug.log +22 -0
  12. 1b/rmnp_lr1e_2_b1_0_95_b2_0_95_eps_1e_8_adamlr1e_2_ab1_0_9_ab2_0_99_H200_ppl13_8690_20260508_004255/wandb/offline-run-20260508_004305-8ks57nb9/run-8ks57nb9.wandb +3 -0
.gitattributes CHANGED
@@ -125,3 +125,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
125
  1b/shampoo_lr1e_2_b1_0_9_b2_0_999_eps_1e_8_A100_ppl_14_2854_20260421_102432/wandb/offline-run-20260421_102458-capz4d8i/run-capz4d8i.wandb filter=lfs diff=lfs merge=lfs -text
126
  1b/mars_lion_lr2e_4_b1_0_9_b2_0_98_eps_1e_8_A100_ppl_15_7263_20260420_132323/wandb/offline-run-20260420_132356-glmzsqdq/run-glmzsqdq.wandb filter=lfs diff=lfs merge=lfs -text
127
  1b/muon_lr6e_3_b1_0_9_b2_0_95_eps_1e_8_A100_ppl_13_7270_20260420_132324/wandb/offline-run-20260420_132357-0qywxs3n/run-0qywxs3n.wandb filter=lfs diff=lfs merge=lfs -text
 
 
125
  1b/shampoo_lr1e_2_b1_0_9_b2_0_999_eps_1e_8_A100_ppl_14_2854_20260421_102432/wandb/offline-run-20260421_102458-capz4d8i/run-capz4d8i.wandb filter=lfs diff=lfs merge=lfs -text
126
  1b/mars_lion_lr2e_4_b1_0_9_b2_0_98_eps_1e_8_A100_ppl_15_7263_20260420_132323/wandb/offline-run-20260420_132356-glmzsqdq/run-glmzsqdq.wandb filter=lfs diff=lfs merge=lfs -text
127
  1b/muon_lr6e_3_b1_0_9_b2_0_95_eps_1e_8_A100_ppl_13_7270_20260420_132324/wandb/offline-run-20260420_132357-0qywxs3n/run-0qywxs3n.wandb filter=lfs diff=lfs merge=lfs -text
128
+ 1b/rmnp_lr1e_2_b1_0_95_b2_0_95_eps_1e_8_adamlr1e_2_ab1_0_9_ab2_0_99_H200_ppl13_8690_20260508_004255/wandb/offline-run-20260508_004305-8ks57nb9/run-8ks57nb9.wandb filter=lfs diff=lfs merge=lfs -text
1b/rmnp_lr1e_2_b1_0_95_b2_0_95_eps_1e_8_adamlr1e_2_ab1_0_9_ab2_0_99_H200_ppl13_8690_20260508_004255/model_100000/config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "LlamaForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 0,
8
+ "dtype": "bfloat16",
9
+ "eos_token_id": 1,
10
+ "head_dim": 64,
11
+ "hidden_act": "silu",
12
+ "hidden_size": 2048,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 5461,
15
+ "max_position_embeddings": 2048,
16
+ "max_sequence_length": 1024,
17
+ "mlp_bias": false,
18
+ "model_type": "llama",
19
+ "num_attention_heads": 32,
20
+ "num_hidden_layers": 24,
21
+ "num_key_value_heads": 32,
22
+ "pad_token_id": -1,
23
+ "pretraining_tp": 1,
24
+ "rms_norm_eps": 1e-06,
25
+ "rope_scaling": null,
26
+ "rope_theta": 10000.0,
27
+ "tie_word_embeddings": false,
28
+ "transformers_version": "4.57.3",
29
+ "use_cache": true,
30
+ "vocab_size": 32000
31
+ }
1b/rmnp_lr1e_2_b1_0_95_b2_0_95_eps_1e_8_adamlr1e_2_ab1_0_9_ab2_0_99_H200_ppl13_8690_20260508_004255/model_100000/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:40d2e333531014a8c861a0868e5462d78498b534c2cfc95215ca7a5a06e49d27
3
+ size 2678195080
1b/rmnp_lr1e_2_b1_0_95_b2_0_95_eps_1e_8_adamlr1e_2_ab1_0_9_ab2_0_99_H200_ppl13_8690_20260508_004255/model_100000/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6225d3283ce14889d040bc0fd98582b1202bb0fc2612f8b1a75d6026b1beec62
3
+ size 2678453323
1b/rmnp_lr1e_2_b1_0_95_b2_0_95_eps_1e_8_adamlr1e_2_ab1_0_9_ab2_0_99_H200_ppl13_8690_20260508_004255/model_100000/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:731db04fa76766ce4be0757cd8fc8419932599afbe63d7971fc823629e086339
3
+ size 2678267439
1b/rmnp_lr1e_2_b1_0_95_b2_0_95_eps_1e_8_adamlr1e_2_ab1_0_9_ab2_0_99_H200_ppl13_8690_20260508_004255/model_100000/training_state.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "global_step": 200000,
3
+ "update_step": 100000,
4
+ "tokens_seen": 9998828000,
5
+ "tokens_seen_before": 9998725848,
6
+ "update_time": 1.1641998291015625
7
+ }
1b/rmnp_lr1e_2_b1_0_95_b2_0_95_eps_1e_8_adamlr1e_2_ab1_0_9_ab2_0_99_H200_ppl13_8690_20260508_004255/rmnp_lr1e_2_b1_0_95_b2_0_95_eps_1e_8_adamlr1e_2_ab1_0_9_ab2_0_99_H200_ppl13_8690_20260508_004255.txt ADDED
The diff for this file is too large to render. See raw diff
 
1b/rmnp_lr1e_2_b1_0_95_b2_0_95_eps_1e_8_adamlr1e_2_ab1_0_9_ab2_0_99_H200_ppl13_8690_20260508_004255/wandb.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "wandb_id": "8ks57nb9"
3
+ }
1b/rmnp_lr1e_2_b1_0_95_b2_0_95_eps_1e_8_adamlr1e_2_ab1_0_9_ab2_0_99_H200_ppl13_8690_20260508_004255/wandb/offline-run-20260508_004305-8ks57nb9/files/requirements.txt ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ sac==0.1.0
2
+ packaging==26.0
3
+ setuptools==82.0.1
4
+ wheel==0.46.3
5
+ pip==26.0.1
6
+ torchaudio==2.11.0
7
+ nvidia-cusparselt-cu12==0.7.1
8
+ mpmath==1.3.0
9
+ typing_extensions==4.15.0
10
+ triton==3.4.0
11
+ sympy==1.14.0
12
+ pillow==12.2.0
13
+ nvidia-nvtx-cu12==12.8.90
14
+ nvidia-nvjitlink-cu12==12.8.93
15
+ nvidia-nccl-cu12==2.27.3
16
+ nvidia-curand-cu12==10.3.9.90
17
+ nvidia-cufile-cu12==1.13.1.3
18
+ nvidia-cuda-runtime-cu12==12.8.90
19
+ nvidia-cuda-nvrtc-cu12==12.8.93
20
+ nvidia-cuda-cupti-cu12==12.8.90
21
+ nvidia-cublas-cu12==12.8.4.1
22
+ numpy==2.2.6
23
+ networkx==3.4.2
24
+ MarkupSafe==3.0.3
25
+ aiohappyeyeballs==2.6.1
26
+ filelock==3.28.0
27
+ nvidia-cusparse-cu12==12.5.8.93
28
+ nvidia-cufft-cu12==11.3.3.83
29
+ nvidia-cudnn-cu12==9.10.2.21
30
+ Jinja2==3.1.6
31
+ nvidia-cusolver-cu12==11.7.3.90
32
+ torch==2.8.0+cu128
33
+ torchvision==0.23.0+cu128
34
+ pytz==2026.1.post1
35
+ xxhash==3.6.0
36
+ urllib3==2.6.3
37
+ tzdata==2026.1
38
+ tqdm==4.67.3
39
+ six==1.17.0
40
+ safetensors==0.7.0
41
+ regex==2026.4.4
42
+ PyYAML==6.0.3
43
+ pyarrow==23.0.1
44
+ psutil==7.2.2
45
+ propcache==0.4.1
46
+ multidict==6.7.1
47
+ idna==3.11
48
+ hf-xet==1.4.3
49
+ h11==0.16.0
50
+ fsspec==2026.2.0
51
+ frozenlist==1.8.0
52
+ exceptiongroup==1.3.1
53
+ dill==0.4.1
54
+ charset-normalizer==3.4.7
55
+ certifi==2026.2.25
56
+ attrs==26.1.0
57
+ async-timeout==5.0.1
58
+ yarl==1.23.0
59
+ requests==2.33.1
60
+ python-dateutil==2.9.0.post0
61
+ multiprocess==0.70.19
62
+ httpcore==1.0.9
63
+ anyio==4.13.0
64
+ aiosignal==1.4.0
65
+ pandas==2.3.3
66
+ huggingface_hub==0.36.2
67
+ httpx==0.28.1
68
+ aiohttp==3.13.5
69
+ tokenizers==0.22.2
70
+ accelerate==1.13.0
71
+ transformers==4.57.3
72
+ datasets==4.8.4
73
+ peft==0.19.1
74
+ pytorch-ranger==0.1.1
75
+ lion-pytorch==0.2.4
76
+ bitsandbytes==0.49.2
77
+ torch-optimizer==0.3.0
78
+ apollo-torch==1.0.3
79
+ nvidia-ml-py==13.590.48
80
+ typing-inspection==0.4.2
81
+ threadpoolctl==3.6.0
82
+ smmap==5.0.3
83
+ sentry-sdk==2.58.0
84
+ scipy==1.15.3
85
+ pyparsing==3.3.2
86
+ pydantic_core==2.46.3
87
+ protobuf==7.34.1
88
+ platformdirs==4.9.6
89
+ nvitop==1.6.2
90
+ loguru==0.7.3
91
+ kiwisolver==1.5.0
92
+ joblib==1.5.3
93
+ fonttools==4.62.1
94
+ cycler==0.12.1
95
+ contourpy==1.3.2
96
+ click==8.3.2
97
+ annotated-types==0.7.0
98
+ scikit-learn==1.7.2
99
+ pydantic==2.13.3
100
+ modelscope==1.35.4
101
+ matplotlib==3.10.8
102
+ gitdb==4.0.12
103
+ seaborn==0.13.2
104
+ GitPython==3.1.47
105
+ wandb==0.26.0
106
+ sac==0.1.0
107
+ nvidia-ml-py3==7.352.0
108
+ gitignore_parser==0.1.13
109
+ durationpy==0.10
110
+ dotmap==1.3.30
111
+ wrapt==2.1.2
112
+ websocket-client==1.9.0
113
+ typeguard==4.5.1
114
+ tabulate==0.9.0
115
+ pycparser==3.0
116
+ pyasn1==0.6.3
117
+ py==1.11.0
118
+ oauthlib==3.3.1
119
+ jmespath==1.1.0
120
+ invoke==3.0.3
121
+ elasticsearch==7.17.13
122
+ docutils==0.19
123
+ decorator==5.2.1
124
+ confluent-kafka==2.14.0
125
+ colorama==0.4.6
126
+ bcrypt==5.0.0
127
+ rsa==4.7.2
128
+ retry==0.9.2
129
+ requests-oauthlib==2.0.0
130
+ Deprecated==1.3.1
131
+ cffi==2.0.0
132
+ botocore==1.42.92
133
+ s3transfer==0.16.0
134
+ PyNaCl==1.6.2
135
+ kubernetes==35.0.0
136
+ cryptography==46.0.7
137
+ paramiko==4.0.0
138
+ boto3==1.42.92
139
+ awscli==1.44.82
140
+ megfile==2.2.10.post1
141
+ refile==7.2.7.post3
142
+ brainpp==2.7.12.16
1b/rmnp_lr1e_2_b1_0_95_b2_0_95_eps_1e_8_adamlr1e_2_ab1_0_9_ab2_0_99_H200_ppl13_8690_20260508_004255/wandb/offline-run-20260508_004305-8ks57nb9/logs/debug-internal.log ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2026-05-08T00:43:06.142473188+08:00","level":"INFO","msg":"wandb-core"}
2
+ {"time":"2026-05-08T00:43:06.142771547+08:00","level":"INFO","msg":"stream: starting","core version":"0.26.0"}
3
+ {"time":"2026-05-08T00:43:06.48373365+08:00","level":"WARN","msg":"featurechecker: GraphQL client is nil, skipping feature loading"}
4
+ {"time":"2026-05-08T00:43:06.483785234+08:00","level":"WARN","msg":"featurechecker: GraphQL client is nil, skipping feature loading"}
5
+ {"time":"2026-05-08T00:43:06.4838066+08:00","level":"INFO","msg":"stream: created new stream","id":"8ks57nb9"}
6
+ {"time":"2026-05-08T00:43:06.483964438+08:00","level":"INFO","msg":"handler: started"}
7
+ {"time":"2026-05-08T00:43:06.485107308+08:00","level":"INFO","msg":"stream: started"}
8
+ {"time":"2026-05-08T00:43:06.485201217+08:00","level":"INFO","msg":"writer: started","stream_id":"8ks57nb9"}
9
+ {"time":"2026-05-08T00:43:06.485269288+08:00","level":"INFO","msg":"sender: started"}
10
+ {"time":"2026-05-08T00:43:06.486938544+08:00","level":"WARN","msg":"featurechecker: GraphQL client is nil, skipping feature loading"}
11
+ {"time":"2026-05-08T00:43:06.486955503+08:00","level":"WARN","msg":"runupserter: server does not expand metric globs but the x_server_side_expand_glob_metrics setting is set; ignoring"}
12
+ {"time":"2026-05-08T13:02:42.775825964+08:00","level":"INFO","msg":"stream: closing"}
13
+ {"time":"2026-05-08T13:02:42.801758293+08:00","level":"INFO","msg":"handler: closed"}
14
+ {"time":"2026-05-08T13:02:42.803679703+08:00","level":"INFO","msg":"sender: closed"}
15
+ {"time":"2026-05-08T13:02:42.803699519+08:00","level":"INFO","msg":"stream: closed"}
1b/rmnp_lr1e_2_b1_0_95_b2_0_95_eps_1e_8_adamlr1e_2_ab1_0_9_ab2_0_99_H200_ppl13_8690_20260508_004255/wandb/offline-run-20260508_004305-8ks57nb9/logs/debug.log ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2026-05-08 00:43:05,740 INFO MainThread:476 [wandb_setup.py:_flush():81] Current SDK version is 0.26.0
2
+ 2026-05-08 00:43:05,740 INFO MainThread:476 [wandb_setup.py:_flush():81] Configure stats pid to 476
3
+ 2026-05-08 00:43:05,740 INFO MainThread:476 [wandb_setup.py:_flush():81] Loading settings from environment variables
4
+ 2026-05-08 00:43:05,740 INFO MainThread:476 [wandb_init.py:setup_run_log_directory():721] Logging user logs to exp_remain_h200/work_dirs/1b/train_1b_rmnp_lr1e_2_b1_0_95_b2_0_95_adamlr1e_2_ab1_0_9_ab2_0_99_eps_1e_8_20260508_004255/wandb/offline-run-20260508_004305-8ks57nb9/logs/debug.log
5
+ 2026-05-08 00:43:05,741 INFO MainThread:476 [wandb_init.py:setup_run_log_directory():722] Logging internal logs to exp_remain_h200/work_dirs/1b/train_1b_rmnp_lr1e_2_b1_0_95_b2_0_95_adamlr1e_2_ab1_0_9_ab2_0_99_eps_1e_8_20260508_004255/wandb/offline-run-20260508_004305-8ks57nb9/logs/debug-internal.log
6
+ 2026-05-08 00:43:05,741 INFO MainThread:476 [wandb_init.py:init():848] calling init triggers
7
+ 2026-05-08 00:43:05,741 INFO MainThread:476 [wandb_init.py:init():853] wandb.init called with sweep_config: {}
8
+ config: {'_wandb': {}}
9
+ 2026-05-08 00:43:05,741 INFO MainThread:476 [wandb_init.py:init():896] starting backend
10
+ 2026-05-08 00:43:06,137 INFO MainThread:476 [wandb_init.py:init():911] sending inform_init request
11
+ 2026-05-08 00:43:06,141 INFO MainThread:476 [wandb_init.py:init():919] backend started and connected
12
+ 2026-05-08 00:43:06,141 INFO MainThread:476 [wandb_init.py:init():989] updated telemetry
13
+ 2026-05-08 00:43:06,157 INFO MainThread:476 [wandb_init.py:init():1013] communicating run to backend with 90.0 second timeout
14
+ 2026-05-08 00:43:06,488 INFO MainThread:476 [wandb_init.py:init():1058] starting run threads in backend
15
+ 2026-05-08 00:43:06,561 INFO MainThread:476 [wandb_run.py:_console_start():2542] atexit reg
16
+ 2026-05-08 00:43:06,561 INFO MainThread:476 [wandb_run.py:_redirect():2391] redirect: wrap_raw
17
+ 2026-05-08 00:43:06,561 INFO MainThread:476 [wandb_run.py:_redirect():2460] Wrapping output streams.
18
+ 2026-05-08 00:43:06,561 INFO MainThread:476 [wandb_run.py:_redirect():2483] Redirects installed.
19
+ 2026-05-08 00:43:06,563 INFO MainThread:476 [wandb_init.py:init():1098] run started, returning control to user process
20
+ 2026-05-08 00:43:42,091 INFO MainThread:476 [wandb_run.py:_config_callback():1403] config_cb None None {'model_config': 'configs/llama_1b.json', 'exp_config': 'exp_v2/configs/llama_1b.json', 'eval_every': 1000, 'save_every': 100000, 'dtype': 'bfloat16', 'seed': 0, 'compile': True, 'dynamo_suppress_errors': True, 'dynamo_cache_limit': 10000, 'memory_cleanup_frequency': 10000, 'resume_step': None, 'restore_optimizer': False, 'continue_from': None, 'single_gpu': False, 'save_dir': 'exp_remain_h200/work_dirs/1b/train_1b_rmnp_lr1e_2_b1_0_95_b2_0_95_adamlr1e_2_ab1_0_9_ab2_0_99_eps_1e_8_20260508_004255', 'use_hf_model': False, 'workers': 12, 'batch_size': 32, 'gradient_accumulation': 2, 'total_batch_size': 512, 'adam_lr': 0.01, 'adam_beta1': 0.9, 'adam_beta2': 0.99, 'warmup_steps': 10000, 'num_training_steps': 100000, 'max_train_tokens': None, 'optimizer': 'rmnp', 'max_length': 256, 'scheduler': 'cosine', 'min_lr_ratio': 0.1, 'weight_decay': 0.0, 'grad_clipping': 0.0, 'activation_checkpointing': False, 'data_path': '/mnt/shared-storage-gpfs2/finebio-shared/optimizer/dataset/C4/en', 'data_name': 'en', 'tags': None, 'name': 'test', 'project': 'test', 'unset_wandb': False, 'entity': None, 'wandb_dir': 'exp_remain_h200/work_dirs/1b/train_1b_rmnp_lr1e_2_b1_0_95_b2_0_95_adamlr1e_2_ab1_0_9_ab2_0_99_eps_1e_8_20260508_004255', 'beta1': 0.95, 'beta2': 0.95, 'beta3': 0.99, 'eps': 1e-08, 'rank': 128, 'update_proj_gap': 50, 'galore_scale': 1.0, 'proj_type': 'std', 'proj_quant': False, 'proj_bits': 8, 'proj_group_size': 256, 'weight_quant': False, 'weight_bits': 8, 'weight_group_size': 256, 'stochastic_round': False, 'simulation': False, 'cos_threshold': 1, 'gamma_proj': 2, 'queue_size': 5, 'proj': 'random', 'scale_type': 'tensor', 'apollo_scale': 1.0, 'scale_front': False, 'n_clusters': 3, 'scale_update_freq': 500, 'scale_level': '1,0,1,1', 'scale_bound': None, 'metric': 'mean', 'align_grad': False, 'dim': 4096, 'n_heads': 32, 'muon_ns_steps': 5, 'muon_momentum': 0.95, 
'nproc_per_node': 8, 'max_lr': 0.01, 'total_params_M': 1339.082752, 'dataset': 'c4', 'model': {'vocab_size': 32000, 'max_position_embeddings': 2048, 'hidden_size': 2048, 'intermediate_size': 5461, 'num_hidden_layers': 24, 'num_attention_heads': 32, 'num_key_value_heads': 32, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-06, 'pretraining_tp': 1, 'use_cache': True, 'rope_theta': 10000.0, 'rope_scaling': None, 'attention_bias': False, 'attention_dropout': 0.0, 'mlp_bias': False, 'head_dim': 64, 'return_dict': True, 'output_hidden_states': False, 'torchscript': False, 'dtype': None, 'pruned_heads': {}, 'tie_word_embeddings': False, 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'architectures': ['LLaMAForCausalLM'], 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'task_specific_params': None, 'problem_type': None, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': 0, 'pad_token_id': -1, 'eos_token_id': 1, 'sep_token_id': None, 'decoder_start_token_id': None, 'max_length': 20, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 'num_beams': 1, 'temperature': 1.0, 'top_k': 50, 'top_p': 1.0, 'typical_p': 1.0, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 'encoder_no_repeat_ngram_size': 0, 'bad_words_ids': None, 'num_return_sequences': 1, 'output_scores': False, 'return_dict_in_generate': False, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'suppress_tokens': None, 'begin_suppress_tokens': None, 'num_beam_groups': 1, 'diversity_penalty': 0.0, '_name_or_path': 'configs/llama_1b.json', 'transformers_version': '4.57.3', 'max_sequence_length': 1024, 'model_type': 'llama', 'tf_legacy_loss': False, 'use_bfloat16': False, 
'output_attentions': False}, 'world_size': 8, 'device': 'cuda:0'}
21
+ 2026-05-08 13:02:42,775 INFO wandb-AsyncioManager-main:476 [service_client.py:_forward_responses():134] Reached EOF.
22
+ 2026-05-08 13:02:42,790 INFO wandb-AsyncioManager-main:476 [mailbox.py:close():155] Closing mailbox, abandoning 0 handles.
1b/rmnp_lr1e_2_b1_0_95_b2_0_95_eps_1e_8_adamlr1e_2_ab1_0_9_ab2_0_99_H200_ppl13_8690_20260508_004255/wandb/offline-run-20260508_004305-8ks57nb9/run-8ks57nb9.wandb ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c5d7c73103b8c7149372afc5744eb6ccaee46c7b072b52c6dcd20207360b3e00
3
+ size 115043530