Sssplendid committed on
Commit
1020068
·
verified ·
1 Parent(s): af68ffd

Add 350m/apollo_lr1e_2_b1_0_9_b2_0_99_eps_1e_6_scale_1_rank_256_T_200_A100_ppl_16_4294_20260419_234621

Browse files
Files changed (16) hide show
  1. .gitattributes +1 -0
  2. 350m/apollo_lr1e_2_b1_0_9_b2_0_99_eps_1e_6_scale_1_rank_256_T_200_A100_ppl_16_4294_20260419_234621/apollo_lr1e_2_b1_0_9_b2_0_99_eps_1e_6_scale_1_rank_256_T_200_A100_ppl_16_4294_20260419_234621.txt +0 -0
  3. 350m/apollo_lr1e_2_b1_0_9_b2_0_99_eps_1e_6_scale_1_rank_256_T_200_A100_ppl_16_4294_20260419_234621/model_60000/config.json +31 -0
  4. 350m/apollo_lr1e_2_b1_0_9_b2_0_99_eps_1e_6_scale_1_rank_256_T_200_A100_ppl_16_4294_20260419_234621/model_60000/model.safetensors +3 -0
  5. 350m/apollo_lr1e_2_b1_0_9_b2_0_99_eps_1e_6_scale_1_rank_256_T_200_A100_ppl_16_4294_20260419_234621/model_60000/optimizer.pt +3 -0
  6. 350m/apollo_lr1e_2_b1_0_9_b2_0_99_eps_1e_6_scale_1_rank_256_T_200_A100_ppl_16_4294_20260419_234621/model_60000/pytorch_model.bin +3 -0
  7. 350m/apollo_lr1e_2_b1_0_9_b2_0_99_eps_1e_6_scale_1_rank_256_T_200_A100_ppl_16_4294_20260419_234621/model_60000/training_state.json +7 -0
  8. 350m/apollo_lr1e_2_b1_0_9_b2_0_99_eps_1e_6_scale_1_rank_256_T_200_A100_ppl_16_4294_20260419_234621/wandb.json +3 -0
  9. 350m/apollo_lr1e_2_b1_0_9_b2_0_99_eps_1e_6_scale_1_rank_256_T_200_A100_ppl_16_4294_20260419_234621/wandb/debug-internal.log +12 -0
  10. 350m/apollo_lr1e_2_b1_0_9_b2_0_99_eps_1e_6_scale_1_rank_256_T_200_A100_ppl_16_4294_20260419_234621/wandb/debug.log +24 -0
  11. 350m/apollo_lr1e_2_b1_0_9_b2_0_99_eps_1e_6_scale_1_rank_256_T_200_A100_ppl_16_4294_20260419_234621/wandb/offline-run-20260419_234717-dhgtud9k/files/SAC/torchrun_main.py +603 -0
  12. 350m/apollo_lr1e_2_b1_0_9_b2_0_99_eps_1e_6_scale_1_rank_256_T_200_A100_ppl_16_4294_20260419_234621/wandb/offline-run-20260419_234717-dhgtud9k/files/requirements.txt +134 -0
  13. 350m/apollo_lr1e_2_b1_0_9_b2_0_99_eps_1e_6_scale_1_rank_256_T_200_A100_ppl_16_4294_20260419_234621/wandb/offline-run-20260419_234717-dhgtud9k/logs/debug-core.log +196 -0
  14. 350m/apollo_lr1e_2_b1_0_9_b2_0_99_eps_1e_6_scale_1_rank_256_T_200_A100_ppl_16_4294_20260419_234621/wandb/offline-run-20260419_234717-dhgtud9k/logs/debug-internal.log +12 -0
  15. 350m/apollo_lr1e_2_b1_0_9_b2_0_99_eps_1e_6_scale_1_rank_256_T_200_A100_ppl_16_4294_20260419_234621/wandb/offline-run-20260419_234717-dhgtud9k/logs/debug.log +24 -0
  16. 350m/apollo_lr1e_2_b1_0_9_b2_0_99_eps_1e_6_scale_1_rank_256_T_200_A100_ppl_16_4294_20260419_234621/wandb/offline-run-20260419_234717-dhgtud9k/run-dhgtud9k.wandb +3 -0
.gitattributes CHANGED
@@ -90,3 +90,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
90
  350m/adamp_lr1e_3_b1_0_9_b2_0_98_eps_1e_8_A100_ppl_17_3947_20260416_223206/wandb/offline-run-20260416_223228-93itajip/run-93itajip.wandb filter=lfs diff=lfs merge=lfs -text
91
  350m/adamw_lr1e_3_b1_0_9_b2_0_99_eps_1e_8_A100_ppl_17_3227_20260416_225515/wandb/offline-run-20260416_225539-oqwr5f9l/run-oqwr5f9l.wandb filter=lfs diff=lfs merge=lfs -text
92
  350m/adan_lr3e_3_b1_0_9_b2_0_92_b3_0_99_eps_1e_8_A100_ppl_17_2895_20260416_200526/wandb/offline-run-20260416_200602-wk4fdtc0/run-wk4fdtc0.wandb filter=lfs diff=lfs merge=lfs -text
 
 
90
  350m/adamp_lr1e_3_b1_0_9_b2_0_98_eps_1e_8_A100_ppl_17_3947_20260416_223206/wandb/offline-run-20260416_223228-93itajip/run-93itajip.wandb filter=lfs diff=lfs merge=lfs -text
91
  350m/adamw_lr1e_3_b1_0_9_b2_0_99_eps_1e_8_A100_ppl_17_3227_20260416_225515/wandb/offline-run-20260416_225539-oqwr5f9l/run-oqwr5f9l.wandb filter=lfs diff=lfs merge=lfs -text
92
  350m/adan_lr3e_3_b1_0_9_b2_0_92_b3_0_99_eps_1e_8_A100_ppl_17_2895_20260416_200526/wandb/offline-run-20260416_200602-wk4fdtc0/run-wk4fdtc0.wandb filter=lfs diff=lfs merge=lfs -text
93
+ 350m/apollo_lr1e_2_b1_0_9_b2_0_99_eps_1e_6_scale_1_rank_256_T_200_A100_ppl_16_4294_20260419_234621/wandb/offline-run-20260419_234717-dhgtud9k/run-dhgtud9k.wandb filter=lfs diff=lfs merge=lfs -text
350m/apollo_lr1e_2_b1_0_9_b2_0_99_eps_1e_6_scale_1_rank_256_T_200_A100_ppl_16_4294_20260419_234621/apollo_lr1e_2_b1_0_9_b2_0_99_eps_1e_6_scale_1_rank_256_T_200_A100_ppl_16_4294_20260419_234621.txt ADDED
The diff for this file is too large to render. See raw diff
 
350m/apollo_lr1e_2_b1_0_9_b2_0_99_eps_1e_6_scale_1_rank_256_T_200_A100_ppl_16_4294_20260419_234621/model_60000/config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "LlamaForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 0,
8
+ "dtype": "bfloat16",
9
+ "eos_token_id": 1,
10
+ "head_dim": 64,
11
+ "hidden_act": "silu",
12
+ "hidden_size": 1024,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 2736,
15
+ "max_position_embeddings": 2048,
16
+ "max_sequence_length": 1024,
17
+ "mlp_bias": false,
18
+ "model_type": "llama",
19
+ "num_attention_heads": 16,
20
+ "num_hidden_layers": 24,
21
+ "num_key_value_heads": 16,
22
+ "pad_token_id": -1,
23
+ "pretraining_tp": 1,
24
+ "rms_norm_eps": 1e-06,
25
+ "rope_scaling": null,
26
+ "rope_theta": 10000.0,
27
+ "tie_word_embeddings": false,
28
+ "transformers_version": "4.57.3",
29
+ "use_cache": true,
30
+ "vocab_size": 32000
31
+ }
350m/apollo_lr1e_2_b1_0_9_b2_0_99_eps_1e_6_scale_1_rank_256_T_200_A100_ppl_16_4294_20260419_234621/model_60000/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0c4f9a1a8aa6a34e444aaab39e5ba9a4bcad711e1318447e9639d0406603b372
3
+ size 735967792
350m/apollo_lr1e_2_b1_0_9_b2_0_99_eps_1e_6_scale_1_rank_256_T_200_A100_ppl_16_4294_20260419_234621/model_60000/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:36799361bd4dabf143e316bf8e029b2be9cfd1eee87c99f22554c02cfb4d4830
3
+ size 653052922
350m/apollo_lr1e_2_b1_0_9_b2_0_99_eps_1e_6_scale_1_rank_256_T_200_A100_ppl_16_4294_20260419_234621/model_60000/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7a0cdb46285cbb6b4582783d67b41e68bd1248c506a309fb698f7a78cd08052e
3
+ size 736040086
350m/apollo_lr1e_2_b1_0_9_b2_0_99_eps_1e_6_scale_1_rank_256_T_200_A100_ppl_16_4294_20260419_234621/model_60000/training_state.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "global_step": 60000,
3
+ "update_step": 60000,
4
+ "tokens_seen": 5999255724,
5
+ "tokens_seen_before": 5999150160,
6
+ "update_time": 1.0080578327178955
7
+ }
350m/apollo_lr1e_2_b1_0_9_b2_0_99_eps_1e_6_scale_1_rank_256_T_200_A100_ppl_16_4294_20260419_234621/wandb.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "wandb_id": "dhgtud9k"
3
+ }
350m/apollo_lr1e_2_b1_0_9_b2_0_99_eps_1e_6_scale_1_rank_256_T_200_A100_ppl_16_4294_20260419_234621/wandb/debug-internal.log ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2026-04-19T23:47:18.48078063+08:00","level":"INFO","msg":"stream: starting","core version":"0.23.0"}
2
+ {"time":"2026-04-19T23:47:18.612202918+08:00","level":"WARN","msg":"featurechecker: GraphQL client is nil, skipping feature loading"}
3
+ {"time":"2026-04-19T23:47:18.612265953+08:00","level":"INFO","msg":"stream: created new stream","id":"dhgtud9k"}
4
+ {"time":"2026-04-19T23:47:18.612302459+08:00","level":"INFO","msg":"handler: started","stream_id":"dhgtud9k"}
5
+ {"time":"2026-04-19T23:47:18.617535039+08:00","level":"INFO","msg":"stream: started","id":"dhgtud9k"}
6
+ {"time":"2026-04-19T23:47:18.617543876+08:00","level":"INFO","msg":"writer: started","stream_id":"dhgtud9k"}
7
+ {"time":"2026-04-19T23:47:18.617549943+08:00","level":"INFO","msg":"sender: started","stream_id":"dhgtud9k"}
8
+ {"time":"2026-04-19T23:47:18.618872114+08:00","level":"WARN","msg":"runupserter: server does not expand metric globs but the x_server_side_expand_glob_metrics setting is set; ignoring"}
9
+ {"time":"2026-04-20T09:55:26.509801244+08:00","level":"INFO","msg":"stream: closing","id":"dhgtud9k"}
10
+ {"time":"2026-04-20T09:55:26.510476014+08:00","level":"INFO","msg":"handler: closed","stream_id":"dhgtud9k"}
11
+ {"time":"2026-04-20T09:55:26.511606678+08:00","level":"INFO","msg":"sender: closed","stream_id":"dhgtud9k"}
12
+ {"time":"2026-04-20T09:55:26.511618561+08:00","level":"INFO","msg":"stream: closed","id":"dhgtud9k"}
350m/apollo_lr1e_2_b1_0_9_b2_0_99_eps_1e_6_scale_1_rank_256_T_200_A100_ppl_16_4294_20260419_234621/wandb/debug.log ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2026-04-19 23:47:17,983 INFO MainThread:111045 [wandb_setup.py:_flush():80] Current SDK version is 0.23.0
2
+ 2026-04-19 23:47:17,984 INFO MainThread:111045 [wandb_setup.py:_flush():80] Configure stats pid to 111045
3
+ 2026-04-19 23:47:17,984 INFO MainThread:111045 [wandb_setup.py:_flush():80] Loading settings from /mnt/petrelfs/panjiabao/.config/wandb/settings
4
+ 2026-04-19 23:47:17,984 INFO MainThread:111045 [wandb_setup.py:_flush():80] Loading settings from /mnt/petrelfs/panjiabao/Optimizer/SAC/wandb/settings
5
+ 2026-04-19 23:47:17,984 INFO MainThread:111045 [wandb_setup.py:_flush():80] Loading settings from environment variables
6
+ 2026-04-19 23:47:17,984 INFO MainThread:111045 [wandb_init.py:setup_run_log_directory():713] Logging user logs to /mnt/dhwfile/tancheng/panjiabao/Result/SAC_C4/work_dirs/350m/apollo_sweep_lr1e-2_20260419_234621/wandb/offline-run-20260419_234717-dhgtud9k/logs/debug.log
7
+ 2026-04-19 23:47:17,984 INFO MainThread:111045 [wandb_init.py:setup_run_log_directory():714] Logging internal logs to /mnt/dhwfile/tancheng/panjiabao/Result/SAC_C4/work_dirs/350m/apollo_sweep_lr1e-2_20260419_234621/wandb/offline-run-20260419_234717-dhgtud9k/logs/debug-internal.log
8
+ 2026-04-19 23:47:17,984 INFO MainThread:111045 [wandb_init.py:init():840] calling init triggers
9
+ 2026-04-19 23:47:17,984 INFO MainThread:111045 [wandb_init.py:init():845] wandb.init called with sweep_config: {}
10
+ config: {'_wandb': {}}
11
+ 2026-04-19 23:47:17,984 INFO MainThread:111045 [wandb_init.py:init():888] starting backend
12
+ 2026-04-19 23:47:18,397 INFO MainThread:111045 [wandb_init.py:init():891] sending inform_init request
13
+ 2026-04-19 23:47:18,425 INFO MainThread:111045 [wandb_init.py:init():899] backend started and connected
14
+ 2026-04-19 23:47:18,429 INFO MainThread:111045 [wandb_init.py:init():969] updated telemetry
15
+ 2026-04-19 23:47:18,474 INFO MainThread:111045 [wandb_init.py:init():993] communicating run to backend with 90.0 second timeout
16
+ 2026-04-19 23:47:18,620 INFO MainThread:111045 [wandb_init.py:init():1040] starting run threads in backend
17
+ 2026-04-19 23:47:19,015 INFO MainThread:111045 [wandb_run.py:_console_start():2504] atexit reg
18
+ 2026-04-19 23:47:19,015 INFO MainThread:111045 [wandb_run.py:_redirect():2352] redirect: wrap_raw
19
+ 2026-04-19 23:47:19,015 INFO MainThread:111045 [wandb_run.py:_redirect():2421] Wrapping output streams.
20
+ 2026-04-19 23:47:19,015 INFO MainThread:111045 [wandb_run.py:_redirect():2444] Redirects installed.
21
+ 2026-04-19 23:47:19,023 INFO MainThread:111045 [wandb_init.py:init():1080] run started, returning control to user process
22
+ 2026-04-19 23:47:27,035 INFO MainThread:111045 [wandb_run.py:_config_callback():1385] config_cb None None {'model_config': 'configs/llama_350m.json', 'exp_config': 'exp_v2/configs/llama_350m_apollo.json', 'eval_every': 1000, 'save_every': 60000, 'dtype': 'bfloat16', 'seed': 0, 'compile': True, 'dynamo_suppress_errors': True, 'dynamo_cache_limit': 10000, 'memory_cleanup_frequency': 10000, 'resume_step': None, 'restore_optimizer': False, 'continue_from': None, 'single_gpu': False, 'save_dir': '/mnt/dhwfile/tancheng/panjiabao/Result/SAC_C4/work_dirs/350m/apollo_sweep_lr1e-2_20260419_234621', 'use_hf_model': False, 'workers': 12, 'batch_size': 128, 'gradient_accumulation': 1, 'total_batch_size': 512, 'warmup_steps': 6000, 'num_training_steps': 60000, 'max_train_tokens': None, 'optimizer': 'apollo_adamw', 'max_length': 256, 'scheduler': 'cosine', 'min_lr_ratio': 0.1, 'weight_decay': 0.0, 'grad_clipping': 0.0, 'activation_checkpointing': False, 'data_path': '/mnt/dhwfile/tancheng/panjiabao/dataset/C4/en', 'data_name': 'en', 'tags': None, 'name': 'test', 'project': 'test', 'unset_wandb': False, 'entity': None, 'wandb_dir': '/mnt/dhwfile/tancheng/panjiabao/Result/SAC_C4/work_dirs/350m/apollo_sweep_lr1e-2_20260419_234621', 'beta1': 0.9, 'beta2': 0.99, 'beta3': 0.99, 'eps': 1e-06, 'rank': 256, 'update_proj_gap': 200, 'galore_scale': 1.0, 'proj_type': 'std', 'proj_quant': False, 'proj_bits': 8, 'proj_group_size': 256, 'weight_quant': False, 'weight_bits': 8, 'weight_group_size': 256, 'stochastic_round': False, 'simulation': False, 'cos_threshold': 1, 'gamma_proj': 2, 'queue_size': 5, 'proj': 'random', 'scale_type': 'channel', 'apollo_scale': 1, 'scale_front': False, 'n_clusters': 3, 'scale_update_freq': 500, 'scale_level': '1,0,1,1', 'scale_bound': None, 'metric': 'mean', 'align_grad': False, 'dim': 4096, 'n_heads': 32, 'muon_ns_steps': 5, 'muon_momentum': 0.95, 'nproc_per_node': 4, 'max_lr': 0.01, 'total_params_M': 367.96928, 'dataset': 'c4', 'model': {'vocab_size': 32000, 
'max_position_embeddings': 2048, 'hidden_size': 1024, 'intermediate_size': 2736, 'num_hidden_layers': 24, 'num_attention_heads': 16, 'num_key_value_heads': 16, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-06, 'pretraining_tp': 1, 'use_cache': True, 'rope_theta': 10000.0, 'rope_scaling': None, 'attention_bias': False, 'attention_dropout': 0.0, 'mlp_bias': False, 'head_dim': 64, 'return_dict': True, 'output_hidden_states': False, 'torchscript': False, 'dtype': None, 'pruned_heads': {}, 'tie_word_embeddings': False, 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'architectures': ['LLaMAForCausalLM'], 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'task_specific_params': None, 'problem_type': None, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': 0, 'pad_token_id': -1, 'eos_token_id': 1, 'sep_token_id': None, 'decoder_start_token_id': None, 'max_length': 20, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 'num_beams': 1, 'temperature': 1.0, 'top_k': 50, 'top_p': 1.0, 'typical_p': 1.0, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 'encoder_no_repeat_ngram_size': 0, 'bad_words_ids': None, 'num_return_sequences': 1, 'output_scores': False, 'return_dict_in_generate': False, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'suppress_tokens': None, 'begin_suppress_tokens': None, 'num_beam_groups': 1, 'diversity_penalty': 0.0, '_name_or_path': 'configs/llama_350m.json', 'transformers_version': '4.57.3', 'max_sequence_length': 1024, 'model_type': 'llama', 'tf_legacy_loss': False, 'use_bfloat16': False, 'output_attentions': False}, 'world_size': 4, 'device': 'cuda:0'}
23
+ 2026-04-20 09:55:26,509 INFO wandb-AsyncioManager-main:111045 [service_client.py:_forward_responses():80] Reached EOF.
24
+ 2026-04-20 09:55:26,510 INFO wandb-AsyncioManager-main:111045 [mailbox.py:close():137] Closing mailbox, abandoning 0 handles.
350m/apollo_lr1e_2_b1_0_9_b2_0_99_eps_1e_6_scale_1_rank_256_T_200_A100_ppl_16_4294_20260419_234621/wandb/offline-run-20260419_234717-dhgtud9k/files/SAC/torchrun_main.py ADDED
@@ -0,0 +1,603 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+ import os
7
+ import time
8
+ import json
9
+
10
+ import torch
11
+ import torch.nn as nn
12
+ import torch.distributed as dist
13
+
14
+ from tqdm import tqdm
15
+ from loguru import logger
16
+
17
+ import transformers
18
+
19
+ transformers.logging.set_verbosity_error()
20
+
21
+ import wandb
22
+
23
+ from utils.argparse import parse_args
24
+ from utils.setup import getting_svd_cnt, set_seed, setup_model, saving_model_weight, load_model_weight
25
+ from utils.optimizer_factory import setup_optimization
26
+ from utils.eval import evaluate_model
27
+ from utils.dataloader import setup_dataset
28
+ from utils.modeling_llama import LlamaForCausalLM
29
+ from utils.fake_quantization import QLinear
30
+ from utils.quantization import QScaleLinear
31
+
32
+
33
+ def main(args):
34
+ import torch
35
+ ############ Setup random seed ############
36
+ set_seed(args)
37
+
38
+ ############ Setup DDP environment ############
39
+ assert "LOCAL_RANK" in os.environ, "torchrun should set LOCAL_RANK"
40
+ global_rank = int(os.environ["RANK"])
41
+ local_rank = int(os.environ["LOCAL_RANK"])
42
+ world_size = int(os.environ["WORLD_SIZE"])
43
+ torch.cuda.set_device(local_rank)
44
+
45
+ logger.info(f"Global rank {global_rank}, local rank {local_rank}, device: {torch.cuda.current_device()}")
46
+ dist.init_process_group(backend="nccl", rank=global_rank, world_size=world_size)
47
+
48
+ logger.info("Process group initialized")
49
+ device = f"cuda:{local_rank}"
50
+
51
+ if global_rank != 0:
52
+ logger.remove() # turn off logger
53
+
54
+ logger.info(f"Using dist with rank {global_rank} (only rank 0 will log)")
55
+ logger.info("*" * 40)
56
+ logger.info(f"Starting training with the arguments")
57
+ for k, v in vars(args).items():
58
+ logger.info(f"{k:30} {v}")
59
+ logger.info("*" * 40)
60
+
61
+ ############ Initialize wandb without config (it is passed later) ############
62
+ if (not args.unset_wandb) and global_rank == 0:
63
+ if args.entity is None:
64
+ os.environ['WANDB_MODE'] = 'offline'
65
+ # Set wandb directory for offline mode
66
+ wandb_dir = getattr(args, 'wandb_dir', None) if getattr(args, 'wandb_dir', None) is not None else args.save_dir
67
+ if getattr(args, 'wandb_dir', None) is not None:
68
+ logger.info(f"Wandb directory set to: {wandb_dir}")
69
+ wandb.init(project=args.project, name=args.name, entity=args.entity, dir=wandb_dir)
70
+
71
+ ############ Setup training data ############
72
+ if args.total_batch_size is not None:
73
+ if args.gradient_accumulation is None:
74
+ assert args.total_batch_size % world_size == 0, "total_batch_size must be divisible by world_size"
75
+ args.gradient_accumulation = args.total_batch_size // (args.batch_size * world_size)
76
+ assert args.gradient_accumulation > 0, "gradient_accumulation must be greater than 0"
77
+
78
+ assert (
79
+ args.gradient_accumulation * args.batch_size * world_size == args.total_batch_size
80
+ ), "gradient_accumulation * batch_size * world_size must be equal to total_batch_size"
81
+
82
+ dataloader, tokenizer = setup_dataset(args, global_rank, world_size)
83
+
84
+ ############ Initialize model ############
85
+ model_config, model = setup_model(args)
86
+ # Ensure model has generation_config (fix for transformers version compatibility)
87
+ if model.generation_config is None:
88
+ from transformers import GenerationConfig
89
+ model.generation_config = GenerationConfig()
90
+ model.generation_config.pad_token_id = tokenizer.pad_token_id
91
+
92
+ ############ Resuming from checkpoints ############
93
+ global_step = 0
94
+ update_step = 0
95
+ beginning_step = 0
96
+ tokens_seen = 0
97
+ tokens_seen_before = 0
98
+
99
+ # identifying checkpointing
100
+ if args.continue_from is not None and os.path.exists(args.continue_from):
101
+ # searching the latest checkpoints
102
+ checkpoint_path_list = os.listdir(args.continue_from)
103
+ checkpoint_path_list = [int(x.split("_")[-1]) for x in checkpoint_path_list if x.startswith("model_")]
104
+ if len(checkpoint_path_list) > 0:
105
+ logger.info("Find Checkpoints", checkpoint_path_list)
106
+ beginning_step = max(checkpoint_path_list)
107
+ if args.resume_step is not None:
108
+ beginning_step = args.resume_step
109
+ args.continue_from = os.path.join(args.continue_from, f"model_{beginning_step}")
110
+ logger.info("Continue from", args.continue_from)
111
+ else:
112
+ logger.warning(f"Did not find any checkpoints in {args.continue_from}")
113
+ args.continue_from = None
114
+
115
+ # resuming from checkpointing
116
+ if args.continue_from is not None:
117
+ logger.info("*" * 40)
118
+ logger.info(f"Loading model from {args.continue_from}")
119
+ checkpoint_path = os.path.join(args.continue_from, "pytorch_model.bin")
120
+ if os.path.exists(checkpoint_path):
121
+ load_model_weight(model, checkpoint_path, args)
122
+ logger.info(f"Model successfully loaded (strict=False policy)")
123
+ else:
124
+ # Try safetensors format
125
+ checkpoint_path = os.path.join(args.continue_from, "model.safetensors")
126
+ if os.path.exists(checkpoint_path):
127
+ from safetensors import safe_open
128
+ tensors = {}
129
+ with safe_open(checkpoint_path, framework="pt", device=0) as f:
130
+ for k in f.keys():
131
+ tensors[k] = f.get_tensor(k)
132
+ print(k, tensors[k].shape)
133
+ ret = model.load_state_dict(tensors, strict=False)
134
+ logger.info(f"Model successfully loaded from safetensors (strict=False policy)", ret)
135
+ else:
136
+ logger.warning(f"No model checkpoint found in {args.continue_from}")
137
+
138
+ if os.path.exists(os.path.join(args.continue_from, "training_state.json")):
139
+ logger.info(
140
+ f"Loading training state like global_step, update_step, and tokens_seen from {args.continue_from}"
141
+ )
142
+ with open(os.path.join(args.continue_from, "training_state.json")) as f:
143
+ _old_state = json.load(f)
144
+ global_step = _old_state["global_step"]
145
+ update_step = _old_state["update_step"]
146
+ tokens_seen = _old_state["tokens_seen"]
147
+ tokens_seen_before = _old_state["tokens_seen_before"]
148
+ logger.info(f"global_step : {global_step}")
149
+ logger.info(f"update_step : {update_step}")
150
+ logger.info(f"tokens_seen : {tokens_seen}")
151
+ logger.info(f"tokens_seen_before: {tokens_seen_before}")
152
+ logger.info(f"Will train for {args.num_training_steps - update_step} update steps")
153
+ else:
154
+ logger.warning(f"Did not find training state in {args.continue_from}, global step will start from zero")
155
+ logger.info("*" * 40)
156
+
157
+ ############ Setup model ############
158
+ if args.dtype in ["bf16", "bfloat16"]:
159
+ model = model.to(dtype=torch.bfloat16)
160
+ model = model.to(device=device)
161
+
162
+ for _, module in model.named_modules():
163
+ if isinstance(module, QScaleLinear):
164
+ weight_device = module.weight.device
165
+ module.weight.scales = module.weight.scales.to(device=weight_device)
166
+ module.weight.zeros = module.weight.zeros.to(device=weight_device)
167
+
168
+ n_total_params = sum(p.numel() for p in model.parameters())
169
+ trainable_params = [p for p in model.parameters() if p.requires_grad]
170
+ trainable_params_int8 = [p for p in model.parameters() if hasattr(p, "group_size")]
171
+
172
+ ############ Initialize wandb ############
173
+ run_config = dict(vars(args))
174
+ run_config.update(
175
+ {
176
+ "max_lr": run_config.pop("lr"), # rename lr to max_lr to avoid conflicts with scheduler
177
+ "total_params_M": n_total_params / 1_000_000,
178
+ "dataset": "c4",
179
+ "model": model_config.to_dict(),
180
+ "world_size": world_size,
181
+ "device": str(device),
182
+ }
183
+ )
184
+
185
+ if global_rank == 0:
186
+ if not args.unset_wandb:
187
+ wandb.config.update(run_config, allow_val_change=True)
188
+ wandb.save(os.path.abspath(__file__), policy="now") # save current script
189
+ # fix tqdm visual length to 80 so that the progress bar
190
+ # doesn't jump around when changing from external display to laptop
191
+ pbar = tqdm(total=args.num_training_steps - update_step, desc="Update steps", ncols=80)
192
+
193
+ ############ Initialize optimization ############
194
+ if "galore" in args.optimizer.lower():
195
+ # make parameters with "rank" to a single group, if param_name has "mlp" or "attn"
196
+ lowrank_params = []
197
+ target_modules_list = ["attn", "mlp"]
198
+ for module_name, module in model.named_modules():
199
+ if not (isinstance(module, nn.Linear) or isinstance(module, QScaleLinear) or isinstance(module, QLinear)):
200
+ continue
201
+ if not any(target_key in module_name for target_key in target_modules_list):
202
+ continue
203
+ logger.info(f"Adding {module_name} to GaLore parameters")
204
+ lowrank_params.append(module.weight)
205
+
206
+ id_lowrank_params = [id(p) for p in lowrank_params]
207
+ # make parameters without "rank" to another group
208
+ regular_params = [p for p in model.parameters() if id(p) not in id_lowrank_params]
209
+ # then call low rank optimizer
210
+ param_groups = [
211
+ {"params": regular_params},
212
+ {
213
+ "params": lowrank_params,
214
+ "rank": args.rank,
215
+ "update_proj_gap": args.update_proj_gap,
216
+ "scale": args.galore_scale,
217
+ "proj_type": args.proj_type,
218
+ "quant": args.proj_quant,
219
+ "quant_n_bit": args.proj_bits,
220
+ "quant_group_size": args.proj_group_size,
221
+ "cos_threshold": args.cos_threshold,
222
+ "gamma_proj": args.gamma_proj,
223
+ "queue_size": args.queue_size,
224
+ },
225
+ ]
226
+ elif "apollo" in args.optimizer.lower():
227
+ # make parameters with "rank" to a single group, if param_name has "mlp" or "attn"
228
+ lowrank_params = []
229
+ target_modules_list = ["attn", "mlp"]
230
+ for module_name, module in model.named_modules():
231
+ if not (isinstance(module, nn.Linear) or isinstance(module, QScaleLinear) or isinstance(module, QLinear)):
232
+ continue
233
+ if not any(target_key in module_name for target_key in target_modules_list):
234
+ continue
235
+ logger.info(f"Adding {module_name} to APOLLO parameters")
236
+ lowrank_params.append(module.weight)
237
+
238
+ id_lowrank_params = [id(p) for p in lowrank_params]
239
+ # make parameters without "rank" to another group
240
+ regular_params = [p for p in model.parameters() if id(p) not in id_lowrank_params]
241
+ # then call low rank optimizer
242
+ param_groups = [
243
+ {"params": regular_params},
244
+ {
245
+ "params": lowrank_params,
246
+ "rank": args.rank,
247
+ "update_proj_gap": args.update_proj_gap,
248
+ "scale": args.apollo_scale,
249
+ "proj_type": args.proj_type,
250
+ "proj": args.proj,
251
+ "scale_type": args.scale_type,
252
+ },
253
+ ]
254
+ elif "conda" in args.optimizer.lower():
255
+ # make parameters with "rank" to a single group, if param_name has "mlp" or "attn"
256
+ lowrank_params = []
257
+ target_modules_list = ["attn", "mlp"]
258
+ for module_name, module in model.named_modules():
259
+ if not (isinstance(module, nn.Linear) or isinstance(module, QScaleLinear) or isinstance(module, QLinear)):
260
+ continue
261
+ if not any(target_key in module_name for target_key in target_modules_list):
262
+ continue
263
+ logger.info(f"Adding {module_name} to conda parameters")
264
+ lowrank_params.append(module.weight)
265
+
266
+ id_lowrank_params = [id(p) for p in lowrank_params]
267
+ # make parameters without "rank" to another group
268
+ regular_params = [p for p in model.parameters() if id(p) not in id_lowrank_params]
269
+ # then call low rank optimizer
270
+ param_groups = [
271
+ {"params": regular_params},
272
+ {
273
+ "params": lowrank_params,
274
+ "rank": args.rank,
275
+ "update_proj_gap": args.update_proj_gap,
276
+ "scale": args.apollo_scale,
277
+ "proj_type": args.proj_type,
278
+ "proj": args.proj,
279
+ "scale_type": args.scale_type,
280
+ },
281
+ ]
282
+ else:
283
+ param_groups = None
284
+ id_lowrank_params = None
285
+
286
+ # print params and trainable params
287
+ logger.info(f"\n{model}\n")
288
+ logger.info(f"Total params: {sum(p.numel() for p in model.parameters()) / 1_000_000:.2f}M")
289
+
290
+ if args.simulation:
291
+ num_train_params = sum(p.numel() for p in trainable_params)
292
+ else:
293
+ num_train_params = sum(p.numel() for p in trainable_params) + sum(p.numel() for p in trainable_params_int8)
294
+
295
+ logger.info(f"Trainable params: {num_train_params / 1_000_000:.2f}M")
296
+ if "q_galore" in args.optimizer.lower():
297
+ logger.info(
298
+ f"Trainable params with Q-GaLore enabled: {sum(p.numel() for p in trainable_params_int8) / 1_000_000:.2f}M"
299
+ )
300
+ elif "galore" in args.optimizer.lower():
301
+ logger.info(f"Total params with GaLore enabled: {sum(p.numel() for p in lowrank_params) / 1_000_000:.2f}M")
302
+ elif "q_apollo" in args.optimizer.lower():
303
+ logger.info(
304
+ f"Trainable params with Q-APOLLO enabled: {sum(p.numel() for p in trainable_params_int8) / 1_000_000:.2f}M"
305
+ )
306
+ elif "apollo" in args.optimizer.lower():
307
+ logger.info(f"Total params with APOLLO enabled: {sum(p.numel() for p in lowrank_params) / 1_000_000:.2f}M")
308
+
309
+ logger.info(f"Saving model to {args.save_dir} every {args.save_every} update steps")
310
+
311
+ model, optimizer, scheduler, layer_wise_flag = setup_optimization(
312
+ args, model, trainable_params, param_groups, id_lowrank_params, model_config
313
+ )
314
+
315
+ if layer_wise_flag:
316
+ # will pass optimizer_dict and scheduler_dict out instead of optimizer and scheduler
317
+ optimizer_dict = optimizer
318
+ scheduler_dict = scheduler
319
+
320
+ # Bug-3 fix: wrap with DDP *before* torch.compile per PyTorch recommendation.
321
+ # This ensures gradient reduction hooks are correctly installed on the DDP module,
322
+ # and the compiled graph captures the full DDP+model forward pass.
323
+ # (Issue-5: optimizer.load_state_dict is called after both DDP and compile below.)
324
+ if not args.single_gpu:
325
+ model: LlamaForCausalLM = torch.nn.parallel.DistributedDataParallel(
326
+ model,
327
+ device_ids=[local_rank],
328
+ output_device=local_rank,
329
+ broadcast_buffers=False,
330
+ )
331
+
332
+ # compile the model (after DDP so the compiled graph includes DDP reduction)
333
+ if args.compile:
334
+ print("Compiling the model... (takes a ~minute)")
335
+ unoptimized_model = model
336
+
337
+ # Configure TorchDynamo to suppress errors and fall back to eager mode
338
+ import torch._dynamo
339
+ torch._dynamo.config.suppress_errors = args.dynamo_suppress_errors
340
+ torch._dynamo.config.verbose = False
341
+ # Set cache size limit to prevent memory issues during long training
342
+ torch._dynamo.config.cache_size_limit = args.dynamo_cache_limit
343
+
344
+ model = torch.compile(model) # requires PyTorch 2.0
345
+
346
+ # resume optimizer
347
+ if args.restore_optimizer and args.continue_from is not None:
348
+ logger.info("Restoring optimizer and scheduler from the checkpoint")
349
+ _optimizer_dir = args.continue_from
350
+ optimizer_checkpoint = torch.load(os.path.join(_optimizer_dir, "optimizer.pt"), map_location="cpu")
351
+ optimizer.load_state_dict(optimizer_checkpoint["optimizer"])
352
+ scheduler.load_state_dict(optimizer_checkpoint["scheduler"])
353
+ update_step = optimizer_checkpoint["update_step"]
354
+ beginning_step = update_step
355
+ global_step = optimizer_checkpoint["global_step"]
356
+ logger.info(f"Optimizer and scheduler restored from {_optimizer_dir}")
357
+
358
+ # ##############################
359
+ # TRAINING LOOP
360
+ # we use iterable dataset, so we may never go through all the data
361
+ # ##############################
362
+ # global steps and others are defined above
363
+ pad_idx = tokenizer.pad_token_id
364
+ update_time = time.time()
365
+ local_step = 0 # when continue_from is used, local_step != global_step
366
+ total_svd_count = 0
367
+
368
+ dataloader_iter = iter(dataloader)
369
+
370
+ # Issue-4 fix: accumulate loss across micro-batches so logged loss is the true
371
+ # gradient-accumulation average, not just the last micro-batch.
372
+ accumulated_loss = 0.0
373
+
374
+ # Skip data if resuming from checkpoint
375
+ if update_step != 0:
376
+ skip_batches = args.gradient_accumulation * update_step
377
+ logger.info(f"Skipping {skip_batches} batches to resume from update step {update_step}")
378
+ skipped = 0
379
+ for _ in range(skip_batches):
380
+ # Issue-6 fix: handle StopIteration during skip so all ranks stay aligned
381
+ try:
382
+ next(dataloader_iter)
383
+ except StopIteration:
384
+ logger.warning(
385
+ f"Dataset exhausted during skip at batch {skipped}/{skip_batches}; "
386
+ f"restarting iterator to keep ranks aligned."
387
+ )
388
+ dataloader_iter = iter(dataloader)
389
+ next(dataloader_iter)
390
+ skipped += 1
391
+ logger.info(f"Skipped {skipped} batches successfully")
392
+
393
+ while update_step <= args.num_training_steps:
394
+ try:
395
+ batch = next(dataloader_iter)
396
+ except StopIteration:
397
+ logger.info(f"Dataset completed one epoch. Starting new epoch with reshuffled data.")
398
+ dataloader_iter = iter(dataloader)
399
+ batch = next(dataloader_iter)
400
+
401
+ global_step += 1
402
+ local_step += 1
403
+
404
+ if update_step >= args.num_training_steps:
405
+ logger.info(f"Reached max number of update steps ({args.num_training_steps}). Stopping training.")
406
+ logger.info(f"Rank {global_rank} stopping training.")
407
+ break
408
+
409
+ # forward & backward
410
+ batch = {k: v.to(device) for k, v in batch.items()}
411
+ labels = batch["input_ids"].clone()
412
+ labels[labels == pad_idx] = -100
413
+ tokens_seen += (batch["input_ids"] != pad_idx).sum().item() * world_size
414
+
415
+ loss = model(**batch, labels=labels).loss
416
+
417
+ scaled_loss = loss / args.gradient_accumulation
418
+ scaled_loss.backward()
419
+ accumulated_loss += loss.item() # Issue-4: accumulate before the continue
420
+
421
+ if global_step % args.gradient_accumulation != 0:
422
+ continue
423
+
424
+ # The below code is only executed during the update step
425
+ # Issue-4: compute average loss over all micro-batches in this accumulation window
426
+ avg_loss = accumulated_loss / args.gradient_accumulation
427
+ accumulated_loss = 0.0 # reset for next accumulation window
428
+ # add grad clipping: TODO: add gradient clipping of int8 weight
429
+ if args.grad_clipping != 0.0:
430
+ torch.nn.utils.clip_grad_norm_(trainable_params, args.grad_clipping)
431
+ # Periodic memory cleanup to prevent symbolic tensor issues during long training
432
+ if global_step % args.memory_cleanup_frequency == 0:
433
+ torch.cuda.empty_cache()
434
+ # Clear TorchDynamo cache to prevent memory accumulation
435
+ if args.compile:
436
+ import torch._dynamo
437
+ torch._dynamo.reset()
438
+
439
+ if global_rank == 0:
440
+ pbar.update(1)
441
+ if not layer_wise_flag: # layer-wise updation is done during backward; requires gradient_accumulation equals 1
442
+ optimizer.step()
443
+ scheduler.step()
444
+ optimizer.zero_grad()
445
+
446
+ update_step += 1
447
+ update_time = time.time() - update_time
448
+
449
+ # save checkpoint by save_every
450
+ if local_step > args.gradient_accumulation and update_step % args.save_every == 0 and global_rank == 0:
451
+ current_model_directory = f"{args.save_dir}/model_{update_step}"
452
+ logger.info(f"Saving model and optimizer to {current_model_directory}, update step {update_step}")
453
+ os.makedirs(args.save_dir, exist_ok=True)
454
+ # Bug-1 fix: unwrap DDP/compiled model for saving; works in both single-GPU and multi-GPU modes
455
+ unwrapped_model = model.module if hasattr(model, 'module') else model
456
+ unwrapped_model.save_pretrained(current_model_directory, max_shard_size="500GB", from_pt=True)
457
+ saving_model_weight(unwrapped_model, f"{current_model_directory}/pytorch_model.bin", args)
458
+
459
+ optimizer_checkpoint = {
460
+ "optimizer": optimizer.state_dict(),
461
+ "scheduler": scheduler.state_dict(),
462
+ "update_step": update_step,
463
+ "global_step": global_step,
464
+ "config": run_config,
465
+ "wandb": wandb.run.dir if not args.unset_wandb else None,
466
+ "dtype": args.dtype,
467
+ }
468
+ torch.save(optimizer_checkpoint, f"{current_model_directory}/optimizer.pt")
469
+
470
+ training_state_checkpoint = {
471
+ "global_step": global_step,
472
+ "update_step": update_step,
473
+ "tokens_seen": tokens_seen,
474
+ "tokens_seen_before": tokens_seen_before,
475
+ "update_time": update_time,
476
+ }
477
+ with open(f"{current_model_directory}/training_state.json", "w") as f:
478
+ json.dump(training_state_checkpoint, f, indent=4)
479
+
480
+ # save wandb related info
481
+ if not args.unset_wandb:
482
+ wandb_info = {
483
+ "wandb_id": wandb.run.id,
484
+ }
485
+ with open(f"{args.save_dir}/wandb.json", "w") as f:
486
+ json.dump(wandb_info, f, indent=4)
487
+
488
+ # evaluation
489
+ if update_step % args.eval_every == 0:
490
+ logger.info(f"Performing evaluation at step {update_step}")
491
+ total_loss, evaluated_on_tokens, perplexity = evaluate_model(
492
+ model, tokenizer, pad_idx, global_rank, world_size, device, args
493
+ )
494
+
495
+ if global_rank == 0:
496
+ if not args.unset_wandb:
497
+ wandb.log(
498
+ {
499
+ "eval_loss": total_loss,
500
+ "eval_perplexity": perplexity,
501
+ "eval_tokens": evaluated_on_tokens,
502
+ },
503
+ step=update_step,
504
+ )
505
+ logger.info(f"Eval loss at step {update_step}: {total_loss}, Eval perplexity: {perplexity}")
506
+
507
+ if not layer_wise_flag:
508
+ lr = optimizer.param_groups[0]["lr"]
509
+ else:
510
+ lr = list(optimizer_dict.values())[0].param_groups[0]["lr"]
511
+ tokens_in_update = tokens_seen - tokens_seen_before
512
+ tokens_seen_before = tokens_seen
513
+ batches_in_update = args.gradient_accumulation * world_size
514
+ if not layer_wise_flag:
515
+ total_svd_count = getting_svd_cnt(optimizer)
516
+ else:
517
+ total_svd_count = 0
518
+
519
+ if global_rank == 0:
520
+ if not args.unset_wandb:
521
+ wandb.log(
522
+ {
523
+ "loss": avg_loss,
524
+ "lr": lr,
525
+ "update_step": update_step,
526
+ "tokens_seen": tokens_seen,
527
+ "total_svd_count": total_svd_count,
528
+ "throughput_tokens": tokens_in_update / update_time,
529
+ "throughput_examples": args.total_batch_size / update_time,
530
+ "throughput_batches": batches_in_update / update_time,
531
+ },
532
+ step=update_step,
533
+ )
534
+ update_time = time.time()
535
+
536
+ # ##############################
537
+ # END of training loop
538
+ # ##############################
539
+ logger.info("Training finished")
540
+ if global_rank == 0:
541
+ pbar.close()
542
+
543
+ current_model_directory = f"{args.save_dir}/model_{update_step}"
544
+ if global_rank == 0 and not os.path.exists(current_model_directory):
545
+ logger.info(f"Saving model and optimizer to {current_model_directory}, update step {update_step}")
546
+ os.makedirs(args.save_dir, exist_ok=True)
547
+ # Bug-1 fix: unwrap DDP/compiled model for saving; works in both single-GPU and multi-GPU modes
548
+ unwrapped_model = model.module if hasattr(model, 'module') else model
549
+ unwrapped_model.save_pretrained(current_model_directory, max_shard_size="500GB", from_pt=True)
550
+ saving_model_weight(unwrapped_model, f"{current_model_directory}/pytorch_model.bin", args)
551
+
552
+ optimizer_checkpoint = {
553
+ "optimizer": optimizer.state_dict(),
554
+ "scheduler": scheduler.state_dict(),
555
+ "update_step": update_step,
556
+ "global_step": global_step,
557
+ "config": run_config,
558
+ "wandb": wandb.run.dir if not args.unset_wandb else None,
559
+ "dtype": args.dtype,
560
+ }
561
+ torch.save(optimizer_checkpoint, f"{current_model_directory}/optimizer.pt")
562
+
563
+ training_state_checkpoint = {
564
+ "global_step": global_step,
565
+ "update_step": update_step,
566
+ "tokens_seen": tokens_seen,
567
+ "tokens_seen_before": tokens_seen_before,
568
+ "update_time": update_time,
569
+ }
570
+ with open(f"{current_model_directory}/training_state.json", "w") as f:
571
+ json.dump(training_state_checkpoint, f, indent=4)
572
+
573
+ # Final evaluation
574
+ logger.info("Running final evaluation")
575
+ model.eval()
576
+ del loss, optimizer, scheduler
577
+ import gc
578
+
579
+ gc.collect()
580
+ torch.cuda.empty_cache()
581
+
582
+ total_loss, evaluated_on_tokens, perplexity = evaluate_model(model, tokenizer, pad_idx, global_rank, world_size, device, args)
583
+
584
+ if global_rank == 0:
585
+ if not args.unset_wandb:
586
+ wandb.log(
587
+ {
588
+ "final_eval_loss": total_loss,
589
+ "final_eval_perplexity": perplexity,
590
+ "final_eval_tokens": evaluated_on_tokens,
591
+ },
592
+ step=update_step,
593
+ )
594
+ logger.info(f"Final eval loss: {total_loss}, Final eval perplexity: {perplexity}")
595
+
596
+ logger.info("Script finished successfully")
597
+ print(f"Rank {global_rank} finished successfully")
598
+
599
+
600
+ if __name__ == "__main__":
601
+ print("Starting script")
602
+ args = parse_args(None)
603
+ main(args)
350m/apollo_lr1e_2_b1_0_9_b2_0_99_eps_1e_6_scale_1_rank_256_T_200_A100_ppl_16_4294_20260419_234621/wandb/offline-run-20260419_234717-dhgtud9k/files/requirements.txt ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ aria2==0.0.1b0
2
+ anyio==4.12.0
3
+ setuptools==80.9.0
4
+ torchvision==0.20.1+cu121
5
+ pyarrow==20.0.0
6
+ peft==0.17.1
7
+ conda-pack==0.8.1
8
+ scikit-learn==1.6.1
9
+ pyparsing==3.3.1
10
+ sympy==1.13.1
11
+ typer-slim==0.20.1
12
+ pip==25.1.1
13
+ pip==25.3
14
+ fonttools==4.60.2
15
+ packaging==25.0
16
+ click==8.1.8
17
+ accelerate==1.10.1
18
+ psutil==7.2.0
19
+ wheel==0.45.1
20
+ multidict==6.7.0
21
+ requests==2.32.5
22
+ async-timeout==5.0.1
23
+ triton==3.1.0
24
+ loguru==0.7.3
25
+ aiohappyeyeballs==2.6.1
26
+ sentry-sdk==2.48.0
27
+ annotated-types==0.7.0
28
+ certifi==2025.11.12
29
+ nvidia-curand-cu12==10.3.2.106
30
+ shellingham==1.5.4
31
+ package_name==0.1
32
+ wandb==0.23.0
33
+ nvitop==1.6.1
34
+ nvidia-nccl-cu12==2.21.5
35
+ nvidia-cublas-cu12==12.1.3.1
36
+ tokenizers==0.22.1
37
+ nvidia-cusparse-cu12==12.1.0.106
38
+ scipy==1.13.1
39
+ propcache==0.4.1
40
+ nvidia-ml-py==13.580.82
41
+ typing_extensions==4.15.0
42
+ sac==0.1.0
43
+ torch-optimizer==0.3.0
44
+ aria2==0.0.1b0
45
+ h11==0.16.0
46
+ pillow==11.3.0
47
+ PyYAML==6.0.3
48
+ six==1.17.0
49
+ GitPython==3.1.45
50
+ addict==2.4.0
51
+ seaborn==0.13.2
52
+ filelock==3.19.1
53
+ modelscope==1.33.0
54
+ et_xmlfile==2.0.0
55
+ regex==2025.11.3
56
+ nvidia-cufft-cu12==11.0.2.54
57
+ nvidia-cuda-cupti-cu12==12.1.105
58
+ lion-pytorch==0.2.3
59
+ matplotlib==3.9.4
60
+ pandas==2.3.2
61
+ gitdb==4.0.12
62
+ kiwisolver==1.4.7
63
+ idna==3.11
64
+ numpy==2.0.2
65
+ nvidia-cuda-runtime-cu12==12.1.105
66
+ httpx==0.28.1
67
+ frozenlist==1.8.0
68
+ smmap==5.0.2
69
+ datasets==2.14.0
70
+ yarl==1.22.0
71
+ eval_type_backport==0.3.1
72
+ nvidia-cuda-nvrtc-cu12==12.1.105
73
+ huggingface-hub==0.36.0
74
+ torchaudio==2.5.1+cu121
75
+ aiosignal==1.4.0
76
+ importlib_resources==6.5.2
77
+ nvidia-cusolver-cu12==11.4.5.107
78
+ networkx==3.2.1
79
+ tzdata==2025.3
80
+ bitsandbytes==0.42.0
81
+ cycler==0.12.1
82
+ jq==1.10.0
83
+ mpmath==1.3.0
84
+ pydantic_core==2.41.5
85
+ nvidia-cudnn-cu12==9.1.0.70
86
+ typing-inspection==0.4.2
87
+ httpcore==1.0.9
88
+ nvidia-nvtx-cu12==12.1.105
89
+ platformdirs==4.4.0
90
+ MarkupSafe==2.1.5
91
+ multiprocess==0.70.15
92
+ zipp==3.23.0
93
+ transformers==4.57.3
94
+ nvidia-nvjitlink-cu12==12.9.86
95
+ exceptiongroup==1.3.1
96
+ pydantic==2.12.5
97
+ charset-normalizer==3.4.4
98
+ joblib==1.5.3
99
+ dill==0.3.7
100
+ fsspec==2023.9.2
101
+ torch==2.5.1+cu121
102
+ aiohttp==3.13.2
103
+ urllib3==2.6.2
104
+ apollo-torch==1.0.3
105
+ contourpy==1.3.0
106
+ evaluate==0.4.6
107
+ attrs==25.4.0
108
+ pytz==2025.2
109
+ safetensors==0.7.0
110
+ pytorch-ranger==0.1.1
111
+ threadpoolctl==3.6.0
112
+ Jinja2==3.1.6
113
+ protobuf==6.33.2
114
+ python-dateutil==2.9.0.post0
115
+ xxhash==3.6.0
116
+ openpyxl==3.1.5
117
+ hf-xet==1.2.0
118
+ tqdm==4.67.1
119
+ jaraco.context==5.3.0
120
+ platformdirs==4.2.2
121
+ importlib_metadata==8.0.0
122
+ more-itertools==10.3.0
123
+ typing_extensions==4.12.2
124
+ autocommand==2.2.2
125
+ wheel==0.45.1
126
+ zipp==3.19.2
127
+ packaging==24.2
128
+ backports.tarfile==1.2.0
129
+ inflect==7.3.1
130
+ typeguard==4.3.0
131
+ jaraco.functools==4.0.1
132
+ jaraco.collections==5.1.0
133
+ jaraco.text==3.12.1
134
+ tomli==2.0.1
350m/apollo_lr1e_2_b1_0_9_b2_0_99_eps_1e_6_scale_1_rank_256_T_200_A100_ppl_16_4294_20260419_234621/wandb/offline-run-20260419_234717-dhgtud9k/logs/debug-core.log ADDED
@@ -0,0 +1,196 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2026-04-19T23:47:18.021764833+08:00","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmpo3l032g6/port-98537.txt","pid":98537,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false}
2
+ {"time":"2026-04-19T23:47:18.029666993+08:00","level":"INFO","msg":"server: will exit if parent process dies","ppid":98537}
3
+ {"time":"2026-04-19T23:47:18.029927965+08:00","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/tmp/wandb-98537-99286-355652662/socket","Net":"unix"}}
4
+ {"time":"2026-04-19T23:47:18.048284469+08:00","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"}
5
+ {"time":"2026-04-19T23:47:18.078185493+08:00","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmpnvqgmy5i/port-98542.txt","pid":98542,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false}
6
+ {"time":"2026-04-19T23:47:18.078992412+08:00","level":"INFO","msg":"server: will exit if parent process dies","ppid":98542}
7
+ {"time":"2026-04-19T23:47:18.078970839+08:00","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/tmp/wandb-98542-99304-3007006831/socket","Net":"unix"}}
8
+ {"time":"2026-04-19T23:47:18.080307948+08:00","level":"INFO","msg":"handleInformInit: received","streamId":"21tlsf0i","id":"1(@)"}
9
+ {"time":"2026-04-19T23:47:18.135684998+08:00","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmprq6n2ybz/port-76073.txt","pid":76073,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false}
10
+ {"time":"2026-04-19T23:47:18.137529876+08:00","level":"INFO","msg":"server: will exit if parent process dies","ppid":76073}
11
+ {"time":"2026-04-19T23:47:18.137591396+08:00","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/tmp/wandb-76073-76853-466182552/socket","Net":"unix"}}
12
+ {"time":"2026-04-19T23:47:18.187427529+08:00","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"}
13
+ {"time":"2026-04-19T23:47:18.220860482+08:00","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"}
14
+ {"time":"2026-04-19T23:47:18.209949557+08:00","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmprok4hglu/port-99742.txt","pid":99742,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false}
15
+ {"time":"2026-04-19T23:47:18.209951784+08:00","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmpjiasccxf/port-99748.txt","pid":99748,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false}
16
+ {"time":"2026-04-19T23:47:18.211453276+08:00","level":"INFO","msg":"server: will exit if parent process dies","ppid":99742}
17
+ {"time":"2026-04-19T23:47:18.211457149+08:00","level":"INFO","msg":"server: will exit if parent process dies","ppid":99748}
18
+ {"time":"2026-04-19T23:47:18.211514611+08:00","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/tmp/wandb-99742-101551-3085090515/socket","Net":"unix"}}
19
+ {"time":"2026-04-19T23:47:18.218061611+08:00","level":"INFO","msg":"handleInformInit: received","streamId":"yiy34l67","id":"1(@)"}
20
+ -101562-3832278309/socket","Net":"unix"}}
21
+ {"time":"2026-04-19T23:47:18.269877528+08:00","level":"INFO","msg":"handleInformInit: received","streamId":"fdnbp7cl","id":"1(@)"}
22
+ {"time":"2026-04-19T23:47:18.271090519+08:00","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"}
23
+ {"time":"2026-04-19T23:47:18.283121843+08:00","level":"INFO","msg":"handleInformInit: stream started","streamId":"21tlsf0i","id":"1(@)"}
24
+ {"time":"2026-04-19T23:47:18.286926828+08:00","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmpggxenr18/port-111040.txt","pid":111040,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false}
25
+ {"time":"2026-04-19T23:47:18.28693122+08:00","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmpsnt1dchv/port-111045.txt","pid":111045,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false}
26
+ {"time":"2026-04-19T23:47:18.300789277+08:00","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmpc3ed2r3u/port-111854.txt","pid":111854,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false}
27
+ {"time":"2026-04-19T23:47:18.302607546+08:00","level":"INFO","msg":"server: will exit if parent process dies","ppid":111854}
28
+ {"time":"2026-04-19T23:47:18.302683903+08:00","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/tmp/wandb-111854-112701-4238633932/socket","Net":"unix"}}
29
+ {"time":"2026-04-19T23:47:18.305913132+08:00","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmpmfcwim_m/port-76068.txt","pid":76068,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false}
30
+ {"time":"2026-04-19T23:47:18.306960611+08:00","level":"INFO","msg":"server: will exit if parent process dies","ppid":76068}
31
+ {"time":"2026-04-19T23:47:18.306953648+08:00","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/tmp/wandb-76068-76876-1915591522/socket","Net":"unix"}}
32
+ {"time":"2026-04-19T23:47:18.306095082+08:00","level":"INFO","msg":"handleInformInit: received","streamId":"85n8tjn3","id":"1(@)"}
33
+ {"time":"2026-04-19T23:47:18.314845902+08:00","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"}
34
+ {"time":"2026-04-19T23:47:18.288617729+08:00","level":"INFO","msg":"server: will exit if parent process dies","ppid":111040}
35
+ {"time":"2026-04-19T23:47:18.327282252+08:00","level":"INFO","msg":"server: will exit if parent process dies","ppid":111045}
36
+ {"time":"2026-04-19T23:47:18.288631687+08:00","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/tmp/wandb-111040-112000-1180210445/socket","Net":"unix"}}
37
+ {"time":"2026-04-19T23:47:18.327291275+08:00","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/tmp/wandb-111045-112001-2453585409/socket","Net":"unix"}}
38
+ {"time":"2026-04-19T23:47:18.346654253+08:00","level":"INFO","msg":"handleInformInit: received","streamId":"vrvs25ab","id":"1(@)"}
39
+ {"time":"2026-04-19T23:47:18.381947133+08:00","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"}
40
+ {"time":"2026-04-19T23:47:18.39046355+08:00","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"}
41
+ {"time":"2026-04-19T23:47:18.397039905+08:00","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"}
42
+ {"time":"2026-04-19T23:47:18.401615357+08:00","level":"INFO","msg":"handleInformInit: stream started","streamId":"yiy34l67","id":"1(@)"}
43
+ {"time":"2026-04-19T23:47:18.410483286+08:00","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"}
44
+ {"time":"2026-04-19T23:47:18.415315815+08:00","level":"INFO","msg":"handleInformInit: received","streamId":"22i9c0l8","id":"1(@)"}
45
+ {"time":"2026-04-19T23:47:18.421928169+08:00","level":"INFO","msg":"handleInformInit: received","streamId":"hjmiiwpe","id":"1(@)"}
46
+ {"time":"2026-04-19T23:47:18.42686866+08:00","level":"INFO","msg":"handleInformInit: received","streamId":"dhgtud9k","id":"1(@)"}
47
+ {"time":"2026-04-19T23:47:18.439541955+08:00","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmphtl9f_ni/port-111849.txt","pid":111849,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false}
48
+ {"time":"2026-04-19T23:47:18.440374904+08:00","level":"INFO","msg":"server: will exit if parent process dies","ppid":111849}
49
+ {"time":"2026-04-19T23:47:18.4403598+08:00","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/tmp/wandb-111849-112720-568316116/socket","Net":"unix"}}
50
+ {"time":"2026-04-19T23:47:18.44697126+08:00","level":"INFO","msg":"handleInformInit: received","streamId":"epujs7qw","id":"1(@)"}
51
+ {"time":"2026-04-19T23:47:18.461429221+08:00","level":"INFO","msg":"handleInformInit: stream started","streamId":"fdnbp7cl","id":"1(@)"}
52
+ {"time":"2026-04-19T23:47:18.526899886+08:00","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"}
53
+ {"time":"2026-04-19T23:47:18.559484533+08:00","level":"INFO","msg":"handleInformInit: received","streamId":"pafttcq9","id":"1(@)"}
54
+ {"time":"2026-04-19T23:47:18.581911568+08:00","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmpo4jua6ux/port-98798.txt","pid":98798,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false}
55
+ {"time":"2026-04-19T23:47:18.581921661+08:00","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmp9zkkecpk/port-98792.txt","pid":98792,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false}
56
+ {"time":"2026-04-19T23:47:18.583091474+08:00","level":"INFO","msg":"server: will exit if parent process dies","ppid":98798}
57
+ {"time":"2026-04-19T23:47:18.583314014+08:00","level":"INFO","msg":"server: will exit if parent process dies","ppid":98792}
58
+ {"time":"2026-04-19T23:47:18.583106123+08:00","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/tmp/wandb-98798-99747-3528434782/socket","Net":"unix"}}
59
+ {"time":"2026-04-19T23:47:18.583312585+08:00","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/tmp/wandb-98792-99748-2045978848/socket","Net":"unix"}}
60
+ {"time":"2026-04-19T23:47:18.617541968+08:00","level":"INFO","msg":"handleInformInit: stream started","streamId":"dhgtud9k","id":"1(@)"}
61
+ {"time":"2026-04-19T23:47:18.618998938+08:00","level":"INFO","msg":"handleInformInit: stream started","streamId":"22i9c0l8","id":"1(@)"}
62
+ {"time":"2026-04-19T23:47:18.636792403+08:00","level":"INFO","msg":"handleInformInit: stream started","streamId":"hjmiiwpe","id":"1(@)"}
63
+ {"time":"2026-04-19T23:47:18.641453678+08:00","level":"INFO","msg":"handleInformInit: stream started","streamId":"epujs7qw","id":"1(@)"}
64
+ {"time":"2026-04-19T23:47:18.675611891+08:00","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"}
65
+ {"time":"2026-04-19T23:47:18.680131324+08:00","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmpiw5twogx/port-88639.txt","pid":88639,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false}
66
+ {"time":"2026-04-19T23:47:18.681711769+08:00","level":"INFO","msg":"server: will exit if parent process dies","ppid":88639}
67
+ {"time":"2026-04-19T23:47:18.681746341+08:00","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/tmp/wandb-88639-89639-2998582682/socket","Net":"unix"}}
68
+ {"time":"2026-04-19T23:47:18.685958921+08:00","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"}
69
+ {"time":"2026-04-19T23:47:18.710526608+08:00","level":"INFO","msg":"handleInformInit: received","streamId":"yljaz7hy","id":"1(@)"}
70
+ {"time":"2026-04-19T23:47:18.704563215+08:00","level":"INFO","msg":"handleInformInit: stream started","streamId":"85n8tjn3","id":"1(@)"}
71
+ {"time":"2026-04-19T23:47:18.719501488+08:00","level":"INFO","msg":"handleInformInit: received","streamId":"dujhio3b","id":"1(@)"}
72
+ {"time":"2026-04-19T23:47:18.732251608+08:00","level":"INFO","msg":"handleInformInit: stream started","streamId":"vrvs25ab","id":"1(@)"}
73
+ {"time":"2026-04-19T23:47:18.751633063+08:00","level":"INFO","msg":"handleInformInit: stream started","streamId":"pafttcq9","id":"1(@)"}
74
+ {"time":"2026-04-19T23:47:18.75616279+08:00","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"}
75
+ {"time":"2026-04-19T23:47:18.797761417+08:00","level":"INFO","msg":"handleInformInit: received","streamId":"xqvrxii2","id":"1(@)"}
76
+ {"time":"2026-04-19T23:47:18.834384637+08:00","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmph_rkjoy0/port-88645.txt","pid":88645,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false}
77
+ {"time":"2026-04-19T23:47:18.835396377+08:00","level":"INFO","msg":"server: will exit if parent process dies","ppid":88645}
78
+ {"time":"2026-04-19T23:47:18.835405+08:00","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/tmp/wandb-88645-89660-346707904/socket","Net":"unix"}}
79
+ {"time":"2026-04-19T23:47:18.927501062+08:00","level":"INFO","msg":"handleInformInit: stream started","streamId":"yljaz7hy","id":"1(@)"}
80
+ {"time":"2026-04-19T23:47:18.930167495+08:00","level":"INFO","msg":"handleInformInit: stream started","streamId":"dujhio3b","id":"1(@)"}
81
+ {"time":"2026-04-19T23:47:18.933169858+08:00","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"}
82
+ {"time":"2026-04-19T23:47:18.969120085+08:00","level":"INFO","msg":"handleInformInit: received","streamId":"r75330cz","id":"1(@)"}
83
+ {"time":"2026-04-19T23:47:19.023847579+08:00","level":"INFO","msg":"handleInformInit: stream started","streamId":"xqvrxii2","id":"1(@)"}
84
+ {"time":"2026-04-19T23:47:19.172783549+08:00","level":"INFO","msg":"handleInformInit: stream started","streamId":"r75330cz","id":"1(@)"}
85
+ {"time":"2026-04-20T01:08:49.146604788+08:00","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"}
86
+ {"time":"2026-04-20T01:08:49.147864857+08:00","level":"INFO","msg":"connection: closing","id":"1(@)"}
87
+ {"time":"2026-04-20T01:08:49.148205319+08:00","level":"INFO","msg":"server is shutting down"}
88
+ {"time":"2026-04-20T01:08:49.148618805+08:00","level":"INFO","msg":"connection: closed successfully","id":"1(@)"}
89
+ {"time":"2026-04-20T01:08:49.149735433+08:00","level":"INFO","msg":"server: listener closed","addr":{"Name":"/tmp/wandb-98537-99286-355652662/socket","Net":"unix"}}
90
+ {"time":"2026-04-20T01:08:49.154374103+08:00","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"}
91
+ {"time":"2026-04-20T01:08:49.154860689+08:00","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"}
92
+ {"time":"2026-04-20T01:08:49.155311814+08:00","level":"INFO","msg":"server is closed"}
93
+ {"time":"2026-04-20T01:08:49.253458485+08:00","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"}
94
+ {"time":"2026-04-20T01:08:49.25398645+08:00","level":"INFO","msg":"server is shutting down"}
95
+ {"time":"2026-04-20T01:08:49.253975474+08:00","level":"INFO","msg":"connection: closing","id":"1(@)"}
96
+ {"time":"2026-04-20T01:08:49.254456653+08:00","level":"INFO","msg":"server: listener closed","addr":{"Name":"/tmp/wandb-99742-101551-3085090515/socket","Net":"unix"}}
97
+ {"time":"2026-04-20T01:08:49.254838526+08:00","level":"INFO","msg":"connection: closed successfully","id":"1(@)"}
98
+ {"time":"2026-04-20T01:08:49.314298109+08:00","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"}
99
+ {"time":"2026-04-20T01:08:49.314745275+08:00","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"}
100
+ {"time":"2026-04-20T01:08:49.31510503+08:00","level":"INFO","msg":"server is closed"}
101
+ {"time":"2026-04-20T01:09:04.323031847+08:00","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"}
102
+ {"time":"2026-04-20T01:09:04.323740325+08:00","level":"INFO","msg":"connection: closing","id":"1(@)"}
103
+ {"time":"2026-04-20T01:09:04.324357481+08:00","level":"INFO","msg":"connection: closed successfully","id":"1(@)"}
104
+ {"time":"2026-04-20T01:09:04.323760517+08:00","level":"INFO","msg":"server is shutting down"}
105
+ {"time":"2026-04-20T01:09:04.325195807+08:00","level":"INFO","msg":"server: listener closed","addr":{"Name":"/tmp/wandb-99748-101562-3832278309/socket","Net":"unix"}}
106
+ {"time":"2026-04-20T01:09:04.326737777+08:00","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"}
107
+ {"time":"2026-04-20T01:09:04.327166361+08:00","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"}
108
+ {"time":"2026-04-20T01:09:04.327563129+08:00","level":"INFO","msg":"server is closed"}
109
+ {"time":"2026-04-20T01:09:11.41740719+08:00","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"}
110
+ {"time":"2026-04-20T01:09:11.417884577+08:00","level":"INFO","msg":"connection: closing","id":"1(@)"}
111
+ {"time":"2026-04-20T01:09:11.418297371+08:00","level":"INFO","msg":"connection: closed successfully","id":"1(@)"}
112
+ {"time":"2026-04-20T01:09:11.417905717+08:00","level":"INFO","msg":"server is shutting down"}
113
+ {"time":"2026-04-20T01:09:11.419118842+08:00","level":"INFO","msg":"server: listener closed","addr":{"Name":"/tmp/wandb-98542-99304-3007006831/socket","Net":"unix"}}
114
+ {"time":"2026-04-20T01:09:11.421316849+08:00","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"}
115
+ {"time":"2026-04-20T01:09:11.421698457+08:00","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"}
116
+ {"time":"2026-04-20T01:09:11.42207015+08:00","level":"INFO","msg":"server is closed"}
117
+ {"time":"2026-04-20T01:10:48.070345078+08:00","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"}
118
+ {"time":"2026-04-20T01:10:48.070932583+08:00","level":"INFO","msg":"connection: closing","id":"1(@)"}
119
+ {"time":"2026-04-20T01:10:48.071464299+08:00","level":"INFO","msg":"connection: closed successfully","id":"1(@)"}
120
+ {"time":"2026-04-20T01:10:48.070946329+08:00","level":"INFO","msg":"server is shutting down"}
121
+ {"time":"2026-04-20T01:10:48.072287025+08:00","level":"INFO","msg":"server: listener closed","addr":{"Name":"/tmp/wandb-111849-112720-568316116/socket","Net":"unix"}}
122
+ {"time":"2026-04-20T01:10:48.074076319+08:00","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"}
123
+ {"time":"2026-04-20T01:10:48.074486591+08:00","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"}
124
+ {"time":"2026-04-20T01:10:48.074881194+08:00","level":"INFO","msg":"server is closed"}
125
+ {"time":"2026-04-20T01:10:55.161432161+08:00","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"}
126
+ {"time":"2026-04-20T01:10:55.162122358+08:00","level":"INFO","msg":"server is shutting down"}
127
+ {"time":"2026-04-20T01:10:55.162108411+08:00","level":"INFO","msg":"connection: closing","id":"1(@)"}
128
+ {"time":"2026-04-20T01:10:55.162692909+08:00","level":"INFO","msg":"server: listener closed","addr":{"Name":"/tmp/wandb-111854-112701-4238633932/socket","Net":"unix"}}
129
+ {"time":"2026-04-20T01:10:55.163102651+08:00","level":"INFO","msg":"connection: closed successfully","id":"1(@)"}
130
+ {"time":"2026-04-20T01:10:55.166362966+08:00","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"}
131
+ {"time":"2026-04-20T01:10:55.166838401+08:00","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"}
132
+ {"time":"2026-04-20T01:10:55.167285135+08:00","level":"INFO","msg":"server is closed"}
133
+ {"time":"2026-04-20T01:11:27.283435067+08:00","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"}
134
+ {"time":"2026-04-20T01:11:27.284057323+08:00","level":"INFO","msg":"server is shutting down"}
135
+ {"time":"2026-04-20T01:11:27.28403544+08:00","level":"INFO","msg":"connection: closing","id":"1(@)"}
136
+ {"time":"2026-04-20T01:11:27.284585607+08:00","level":"INFO","msg":"server: listener closed","addr":{"Name":"/tmp/wandb-98792-99748-2045978848/socket","Net":"unix"}}
137
+ {"time":"2026-04-20T01:11:27.28520036+08:00","level":"INFO","msg":"connection: closed successfully","id":"1(@)"}
138
+ {"time":"2026-04-20T01:11:27.287750736+08:00","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"}
139
+ {"time":"2026-04-20T01:11:27.288560815+08:00","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"}
140
+ {"time":"2026-04-20T01:11:27.288970908+08:00","level":"INFO","msg":"server is closed"}
141
+ {"time":"2026-04-20T01:11:42.295715754+08:00","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"}
142
+ {"time":"2026-04-20T01:11:42.296280122+08:00","level":"INFO","msg":"connection: closing","id":"1(@)"}
143
+ {"time":"2026-04-20T01:11:42.296821181+08:00","level":"INFO","msg":"connection: closed successfully","id":"1(@)"}
144
+ {"time":"2026-04-20T01:11:42.296291028+08:00","level":"INFO","msg":"server is shutting down"}
145
+ {"time":"2026-04-20T01:11:42.297786054+08:00","level":"INFO","msg":"server: listener closed","addr":{"Name":"/tmp/wandb-98798-99747-3528434782/socket","Net":"unix"}}
146
+ {"time":"2026-04-20T01:11:42.300652964+08:00","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"}
147
+ {"time":"2026-04-20T01:11:42.301119878+08:00","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"}
148
+ {"time":"2026-04-20T01:11:42.301481215+08:00","level":"INFO","msg":"server is closed"}
149
+ {"time":"2026-04-20T09:55:25.042583189+08:00","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"}
150
+ {"time":"2026-04-20T09:55:25.043601928+08:00","level":"INFO","msg":"server is shutting down"}
151
+ {"time":"2026-04-20T09:55:25.043597224+08:00","level":"INFO","msg":"connection: closing","id":"1(@)"}
152
+ {"time":"2026-04-20T09:55:25.044129205+08:00","level":"INFO","msg":"server: listener closed","addr":{"Name":"/tmp/wandb-111040-112000-1180210445/socket","Net":"unix"}}
153
+ {"time":"2026-04-20T09:55:25.04456338+08:00","level":"INFO","msg":"connection: closed successfully","id":"1(@)"}
154
+ {"time":"2026-04-20T09:55:25.048450045+08:00","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"}
155
+ {"time":"2026-04-20T09:55:25.048937508+08:00","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"}
156
+ {"time":"2026-04-20T09:55:25.049364415+08:00","level":"INFO","msg":"server is closed"}
157
+ {"time":"2026-04-20T09:55:26.509170259+08:00","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"}
158
+ {"time":"2026-04-20T09:55:26.509792575+08:00","level":"INFO","msg":"server is shutting down"}
159
+ {"time":"2026-04-20T09:55:26.509787671+08:00","level":"INFO","msg":"connection: closing","id":"1(@)"}
160
+ {"time":"2026-04-20T09:55:26.510719316+08:00","level":"INFO","msg":"connection: closed successfully","id":"1(@)"}
161
+ {"time":"2026-04-20T09:55:26.510282741+08:00","level":"INFO","msg":"server: listener closed","addr":{"Name":"/tmp/wandb-111045-112001-2453585409/socket","Net":"unix"}}
162
+ {"time":"2026-04-20T09:55:26.513872593+08:00","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"}
163
+ {"time":"2026-04-20T09:55:26.514283106+08:00","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"}
164
+ {"time":"2026-04-20T09:55:26.514665103+08:00","level":"INFO","msg":"server is closed"}
165
+ {"time":"2026-04-20T09:57:18.487273701+08:00","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"}
166
+ {"time":"2026-04-20T09:57:18.48779167+08:00","level":"INFO","msg":"server is shutting down"}
167
+ {"time":"2026-04-20T09:57:18.487782288+08:00","level":"INFO","msg":"connection: closing","id":"1(@)"}
168
+ {"time":"2026-04-20T09:57:18.488283466+08:00","level":"INFO","msg":"server: listener closed","addr":{"Name":"/tmp/wandb-76073-76853-466182552/socket","Net":"unix"}}
169
+ {"time":"2026-04-20T09:57:18.488647164+08:00","level":"INFO","msg":"connection: closed successfully","id":"1(@)"}
170
+ {"time":"2026-04-20T09:57:18.500524213+08:00","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"}
171
+ {"time":"2026-04-20T09:57:18.500969049+08:00","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"}
172
+ {"time":"2026-04-20T09:57:18.501359083+08:00","level":"INFO","msg":"server is closed"}
173
+ {"time":"2026-04-20T09:57:34.947234839+08:00","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"}
174
+ {"time":"2026-04-20T09:57:34.947735354+08:00","level":"INFO","msg":"server is shutting down"}
175
+ {"time":"2026-04-20T09:57:34.947723917+08:00","level":"INFO","msg":"connection: closing","id":"1(@)"}
176
+ {"time":"2026-04-20T09:57:34.94818644+08:00","level":"INFO","msg":"server: listener closed","addr":{"Name":"/tmp/wandb-76068-76876-1915591522/socket","Net":"unix"}}
177
+ {"time":"2026-04-20T09:57:34.94854198+08:00","level":"INFO","msg":"connection: closed successfully","id":"1(@)"}
178
+ {"time":"2026-04-20T09:57:34.950855281+08:00","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"}
179
+ {"time":"2026-04-20T09:57:34.951263279+08:00","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"}
180
+ {"time":"2026-04-20T09:57:34.951650899+08:00","level":"INFO","msg":"server is closed"}
181
+ {"time":"2026-04-20T10:01:41.014666105+08:00","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"}
182
+ {"time":"2026-04-20T10:01:41.0152138+08:00","level":"INFO","msg":"server is shutting down"}
183
+ {"time":"2026-04-20T10:01:41.015205054+08:00","level":"INFO","msg":"connection: closing","id":"1(@)"}
184
+ {"time":"2026-04-20T10:01:41.01569071+08:00","level":"INFO","msg":"server: listener closed","addr":{"Name":"/tmp/wandb-88639-89639-2998582682/socket","Net":"unix"}}
185
+ {"time":"2026-04-20T10:01:41.01608462+08:00","level":"INFO","msg":"connection: closed successfully","id":"1(@)"}
186
+ {"time":"2026-04-20T10:01:41.019915307+08:00","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"}
187
+ {"time":"2026-04-20T10:01:41.020254481+08:00","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"}
188
+ {"time":"2026-04-20T10:01:41.02057197+08:00","level":"INFO","msg":"server is closed"}
189
+ {"time":"2026-04-20T10:02:09.247682946+08:00","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"}
190
+ {"time":"2026-04-20T10:02:09.248225032+08:00","level":"INFO","msg":"connection: closing","id":"1(@)"}
191
+ {"time":"2026-04-20T10:02:09.248235825+08:00","level":"INFO","msg":"server is shutting down"}
192
+ {"time":"2026-04-20T10:02:09.248683569+08:00","level":"INFO","msg":"connection: closed successfully","id":"1(@)"}
193
+ {"time":"2026-04-20T10:02:09.249161406+08:00","level":"INFO","msg":"server: listener closed","addr":{"Name":"/tmp/wandb-88645-89660-346707904/socket","Net":"unix"}}
194
+ {"time":"2026-04-20T10:02:09.251438194+08:00","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"}
195
+ {"time":"2026-04-20T10:02:09.251829309+08:00","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"}
196
+ {"time":"2026-04-20T10:02:09.252180149+08:00","level":"INFO","msg":"server is closed"}
350m/apollo_lr1e_2_b1_0_9_b2_0_99_eps_1e_6_scale_1_rank_256_T_200_A100_ppl_16_4294_20260419_234621/wandb/offline-run-20260419_234717-dhgtud9k/logs/debug-internal.log ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2026-04-19T23:47:18.48078063+08:00","level":"INFO","msg":"stream: starting","core version":"0.23.0"}
2
+ {"time":"2026-04-19T23:47:18.612202918+08:00","level":"WARN","msg":"featurechecker: GraphQL client is nil, skipping feature loading"}
3
+ {"time":"2026-04-19T23:47:18.612265953+08:00","level":"INFO","msg":"stream: created new stream","id":"dhgtud9k"}
4
+ {"time":"2026-04-19T23:47:18.612302459+08:00","level":"INFO","msg":"handler: started","stream_id":"dhgtud9k"}
5
+ {"time":"2026-04-19T23:47:18.617535039+08:00","level":"INFO","msg":"stream: started","id":"dhgtud9k"}
6
+ {"time":"2026-04-19T23:47:18.617543876+08:00","level":"INFO","msg":"writer: started","stream_id":"dhgtud9k"}
7
+ {"time":"2026-04-19T23:47:18.617549943+08:00","level":"INFO","msg":"sender: started","stream_id":"dhgtud9k"}
8
+ {"time":"2026-04-19T23:47:18.618872114+08:00","level":"WARN","msg":"runupserter: server does not expand metric globs but the x_server_side_expand_glob_metrics setting is set; ignoring"}
9
+ {"time":"2026-04-20T09:55:26.509801244+08:00","level":"INFO","msg":"stream: closing","id":"dhgtud9k"}
10
+ {"time":"2026-04-20T09:55:26.510476014+08:00","level":"INFO","msg":"handler: closed","stream_id":"dhgtud9k"}
11
+ {"time":"2026-04-20T09:55:26.511606678+08:00","level":"INFO","msg":"sender: closed","stream_id":"dhgtud9k"}
12
+ {"time":"2026-04-20T09:55:26.511618561+08:00","level":"INFO","msg":"stream: closed","id":"dhgtud9k"}
350m/apollo_lr1e_2_b1_0_9_b2_0_99_eps_1e_6_scale_1_rank_256_T_200_A100_ppl_16_4294_20260419_234621/wandb/offline-run-20260419_234717-dhgtud9k/logs/debug.log ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2026-04-19 23:47:17,983 INFO MainThread:111045 [wandb_setup.py:_flush():80] Current SDK version is 0.23.0
2
+ 2026-04-19 23:47:17,984 INFO MainThread:111045 [wandb_setup.py:_flush():80] Configure stats pid to 111045
3
+ 2026-04-19 23:47:17,984 INFO MainThread:111045 [wandb_setup.py:_flush():80] Loading settings from /mnt/petrelfs/panjiabao/.config/wandb/settings
4
+ 2026-04-19 23:47:17,984 INFO MainThread:111045 [wandb_setup.py:_flush():80] Loading settings from /mnt/petrelfs/panjiabao/Optimizer/SAC/wandb/settings
5
+ 2026-04-19 23:47:17,984 INFO MainThread:111045 [wandb_setup.py:_flush():80] Loading settings from environment variables
6
+ 2026-04-19 23:47:17,984 INFO MainThread:111045 [wandb_init.py:setup_run_log_directory():713] Logging user logs to /mnt/dhwfile/tancheng/panjiabao/Result/SAC_C4/work_dirs/350m/apollo_sweep_lr1e-2_20260419_234621/wandb/offline-run-20260419_234717-dhgtud9k/logs/debug.log
7
+ 2026-04-19 23:47:17,984 INFO MainThread:111045 [wandb_init.py:setup_run_log_directory():714] Logging internal logs to /mnt/dhwfile/tancheng/panjiabao/Result/SAC_C4/work_dirs/350m/apollo_sweep_lr1e-2_20260419_234621/wandb/offline-run-20260419_234717-dhgtud9k/logs/debug-internal.log
8
+ 2026-04-19 23:47:17,984 INFO MainThread:111045 [wandb_init.py:init():840] calling init triggers
9
+ 2026-04-19 23:47:17,984 INFO MainThread:111045 [wandb_init.py:init():845] wandb.init called with sweep_config: {}
10
+ config: {'_wandb': {}}
11
+ 2026-04-19 23:47:17,984 INFO MainThread:111045 [wandb_init.py:init():888] starting backend
12
+ 2026-04-19 23:47:18,397 INFO MainThread:111045 [wandb_init.py:init():891] sending inform_init request
13
+ 2026-04-19 23:47:18,425 INFO MainThread:111045 [wandb_init.py:init():899] backend started and connected
14
+ 2026-04-19 23:47:18,429 INFO MainThread:111045 [wandb_init.py:init():969] updated telemetry
15
+ 2026-04-19 23:47:18,474 INFO MainThread:111045 [wandb_init.py:init():993] communicating run to backend with 90.0 second timeout
16
+ 2026-04-19 23:47:18,620 INFO MainThread:111045 [wandb_init.py:init():1040] starting run threads in backend
17
+ 2026-04-19 23:47:19,015 INFO MainThread:111045 [wandb_run.py:_console_start():2504] atexit reg
18
+ 2026-04-19 23:47:19,015 INFO MainThread:111045 [wandb_run.py:_redirect():2352] redirect: wrap_raw
19
+ 2026-04-19 23:47:19,015 INFO MainThread:111045 [wandb_run.py:_redirect():2421] Wrapping output streams.
20
+ 2026-04-19 23:47:19,015 INFO MainThread:111045 [wandb_run.py:_redirect():2444] Redirects installed.
21
+ 2026-04-19 23:47:19,023 INFO MainThread:111045 [wandb_init.py:init():1080] run started, returning control to user process
22
+ 2026-04-19 23:47:27,035 INFO MainThread:111045 [wandb_run.py:_config_callback():1385] config_cb None None {'model_config': 'configs/llama_350m.json', 'exp_config': 'exp_v2/configs/llama_350m_apollo.json', 'eval_every': 1000, 'save_every': 60000, 'dtype': 'bfloat16', 'seed': 0, 'compile': True, 'dynamo_suppress_errors': True, 'dynamo_cache_limit': 10000, 'memory_cleanup_frequency': 10000, 'resume_step': None, 'restore_optimizer': False, 'continue_from': None, 'single_gpu': False, 'save_dir': '/mnt/dhwfile/tancheng/panjiabao/Result/SAC_C4/work_dirs/350m/apollo_sweep_lr1e-2_20260419_234621', 'use_hf_model': False, 'workers': 12, 'batch_size': 128, 'gradient_accumulation': 1, 'total_batch_size': 512, 'warmup_steps': 6000, 'num_training_steps': 60000, 'max_train_tokens': None, 'optimizer': 'apollo_adamw', 'max_length': 256, 'scheduler': 'cosine', 'min_lr_ratio': 0.1, 'weight_decay': 0.0, 'grad_clipping': 0.0, 'activation_checkpointing': False, 'data_path': '/mnt/dhwfile/tancheng/panjiabao/dataset/C4/en', 'data_name': 'en', 'tags': None, 'name': 'test', 'project': 'test', 'unset_wandb': False, 'entity': None, 'wandb_dir': '/mnt/dhwfile/tancheng/panjiabao/Result/SAC_C4/work_dirs/350m/apollo_sweep_lr1e-2_20260419_234621', 'beta1': 0.9, 'beta2': 0.99, 'beta3': 0.99, 'eps': 1e-06, 'rank': 256, 'update_proj_gap': 200, 'galore_scale': 1.0, 'proj_type': 'std', 'proj_quant': False, 'proj_bits': 8, 'proj_group_size': 256, 'weight_quant': False, 'weight_bits': 8, 'weight_group_size': 256, 'stochastic_round': False, 'simulation': False, 'cos_threshold': 1, 'gamma_proj': 2, 'queue_size': 5, 'proj': 'random', 'scale_type': 'channel', 'apollo_scale': 1, 'scale_front': False, 'n_clusters': 3, 'scale_update_freq': 500, 'scale_level': '1,0,1,1', 'scale_bound': None, 'metric': 'mean', 'align_grad': False, 'dim': 4096, 'n_heads': 32, 'muon_ns_steps': 5, 'muon_momentum': 0.95, 'nproc_per_node': 4, 'max_lr': 0.01, 'total_params_M': 367.96928, 'dataset': 'c4', 'model': {'vocab_size': 32000, 
'max_position_embeddings': 2048, 'hidden_size': 1024, 'intermediate_size': 2736, 'num_hidden_layers': 24, 'num_attention_heads': 16, 'num_key_value_heads': 16, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-06, 'pretraining_tp': 1, 'use_cache': True, 'rope_theta': 10000.0, 'rope_scaling': None, 'attention_bias': False, 'attention_dropout': 0.0, 'mlp_bias': False, 'head_dim': 64, 'return_dict': True, 'output_hidden_states': False, 'torchscript': False, 'dtype': None, 'pruned_heads': {}, 'tie_word_embeddings': False, 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'architectures': ['LLaMAForCausalLM'], 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'task_specific_params': None, 'problem_type': None, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': 0, 'pad_token_id': -1, 'eos_token_id': 1, 'sep_token_id': None, 'decoder_start_token_id': None, 'max_length': 20, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 'num_beams': 1, 'temperature': 1.0, 'top_k': 50, 'top_p': 1.0, 'typical_p': 1.0, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 'encoder_no_repeat_ngram_size': 0, 'bad_words_ids': None, 'num_return_sequences': 1, 'output_scores': False, 'return_dict_in_generate': False, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'suppress_tokens': None, 'begin_suppress_tokens': None, 'num_beam_groups': 1, 'diversity_penalty': 0.0, '_name_or_path': 'configs/llama_350m.json', 'transformers_version': '4.57.3', 'max_sequence_length': 1024, 'model_type': 'llama', 'tf_legacy_loss': False, 'use_bfloat16': False, 'output_attentions': False}, 'world_size': 4, 'device': 'cuda:0'}
23
+ 2026-04-20 09:55:26,509 INFO wandb-AsyncioManager-main:111045 [service_client.py:_forward_responses():80] Reached EOF.
24
+ 2026-04-20 09:55:26,510 INFO wandb-AsyncioManager-main:111045 [mailbox.py:close():137] Closing mailbox, abandoning 0 handles.
350m/apollo_lr1e_2_b1_0_9_b2_0_99_eps_1e_6_scale_1_rank_256_T_200_A100_ppl_16_4294_20260419_234621/wandb/offline-run-20260419_234717-dhgtud9k/run-dhgtud9k.wandb ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:28ea4d8c8e24a0220fddb167b682cecf4d1e93c2cd9fab1a020e785a641a256c
3
+ size 63354972