narySt committed on
Commit
a0cf2a5
·
verified ·
1 Parent(s): 78b7da7

Add files using upload-large-folder tool

Browse files
Files changed (50) hide show
  1. routing_rerun/N_2.5/.hydra/config.yaml +57 -0
  2. routing_rerun/N_2.5/.hydra/hydra.yaml +162 -0
  3. routing_rerun/N_2.5/.hydra/overrides.yaml +2 -0
  4. routing_rerun/N_2.5/eval_results/eval_config.yaml +29 -0
  5. routing_rerun/N_2.5/eval_results/metrics_model_best.txt +17 -0
  6. routing_rerun/N_2.5/eval_results/metrics_model_final.txt +17 -0
  7. routing_rerun/N_2.5/eval_results/metrics_pretrained.txt +17 -0
  8. routing_rerun/N_2.5/eval_results/predictions_model_best.txt +0 -0
  9. routing_rerun/N_2.5/eval_results/predictions_model_final.txt +0 -0
  10. routing_rerun/N_2.5/eval_results/predictions_pretrained.txt +0 -0
  11. routing_rerun/N_2.5/eval_results/summary.txt +8 -0
  12. routing_rerun/N_2.5/train.log +0 -0
  13. routing_rerun/N_2.5/wandb/debug-internal.log +13 -0
  14. routing_rerun/N_2.5/wandb/debug.log +24 -0
  15. routing_rerun/N_2.5/wandb/run-20260523_172658-l6q9x03b/files/code/routing_evolution_exp/train.py +259 -0
  16. routing_rerun/N_2.5/wandb/run-20260523_172658-l6q9x03b/files/config.yaml +132 -0
  17. routing_rerun/N_2.5/wandb/run-20260523_172658-l6q9x03b/files/output.log +108 -0
  18. routing_rerun/N_2.5/wandb/run-20260523_172658-l6q9x03b/files/requirements.txt +245 -0
  19. routing_rerun/N_2.5/wandb/run-20260523_172658-l6q9x03b/files/wandb-metadata.json +44 -0
  20. routing_rerun/N_2.5/wandb/run-20260523_172658-l6q9x03b/files/wandb-summary.json +1 -0
  21. routing_rerun/N_2.5/wandb/run-20260523_172658-l6q9x03b/logs/debug-core.log +14 -0
  22. routing_rerun/N_2.5/wandb/run-20260523_172658-l6q9x03b/logs/debug-internal.log +14 -0
  23. routing_rerun/N_2.5/wandb/run-20260523_172658-l6q9x03b/logs/debug.log +21 -0
  24. routing_rerun/N_2.5/wandb/run-20260523_172658-l6q9x03b/run-l6q9x03b.wandb +0 -0
  25. routing_rerun/N_2.5/wandb/run-20260523_172819-hcp9axm4/files/code/routing_evolution_exp/train.py +259 -0
  26. routing_rerun/N_2.5/wandb/run-20260523_172819-hcp9axm4/files/config.yaml +134 -0
  27. routing_rerun/N_2.5/wandb/run-20260523_172819-hcp9axm4/files/output.log +0 -0
  28. routing_rerun/N_2.5/wandb/run-20260523_172819-hcp9axm4/files/requirements.txt +245 -0
  29. routing_rerun/N_2.5/wandb/run-20260523_172819-hcp9axm4/files/wandb-metadata.json +44 -0
  30. routing_rerun/N_2.5/wandb/run-20260523_172819-hcp9axm4/files/wandb-summary.json +1 -0
  31. routing_rerun/N_2.5/wandb/run-20260523_172819-hcp9axm4/logs/debug-core.log +16 -0
  32. routing_rerun/N_2.5/wandb/run-20260523_172819-hcp9axm4/logs/debug-internal.log +13 -0
  33. routing_rerun/N_2.5/wandb/run-20260523_172819-hcp9axm4/logs/debug.log +24 -0
  34. routing_rerun/N_4.0/.hydra/config.yaml +57 -0
  35. routing_rerun/N_4.0/.hydra/hydra.yaml +162 -0
  36. routing_rerun/N_4.0/.hydra/overrides.yaml +2 -0
  37. routing_rerun/N_4.0/eval_results/eval_config.yaml +29 -0
  38. routing_rerun/N_4.0/eval_results/metrics_model_best.txt +17 -0
  39. routing_rerun/N_4.0/eval_results/metrics_model_final.txt +17 -0
  40. routing_rerun/N_4.0/eval_results/metrics_pretrained.txt +17 -0
  41. routing_rerun/N_4.0/eval_results/predictions_model_best.txt +0 -0
  42. routing_rerun/N_4.0/eval_results/predictions_model_final.txt +0 -0
  43. routing_rerun/N_4.0/eval_results/predictions_pretrained.txt +0 -0
  44. routing_rerun/N_4.0/eval_results/summary.txt +8 -0
  45. routing_rerun/N_4.0/train.log +0 -0
  46. routing_rerun/N_4.0/wandb/debug-internal.log +14 -0
  47. routing_rerun/N_4.0/wandb/debug.log +24 -0
  48. routing_rerun/N_4.0/wandb/run-20260524_000855-j30yot8x/logs/debug.log +24 -0
  49. routing_rerun/N_6.0/train.log +0 -0
  50. routing_rerun/N_8.0/train.log +0 -0
routing_rerun/N_2.5/.hydra/config.yaml ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model:
2
+ config_path: ${oc.env:PROJECT_ROOT}/hnet_project/configs/hnet_2stage_XL_code.json
3
+ checkpoint_path: ${oc.env:PROJECT_ROOT}/hnet_project/checkpoints/hnet_2stage_XL_code.pt
4
+ training:
5
+ epochs: 3
6
+ max_steps: null
7
+ batch_size: 8
8
+ eval_batch_size: 24
9
+ gradient_accumulation_steps: 4
10
+ lr: 0.0001
11
+ weight_decay: 0.1
12
+ betas:
13
+ - 0.9
14
+ - 0.95
15
+ eps: 1.0e-08
16
+ lr_scheduler: wsd
17
+ warmup_ratio: 0.1
18
+ decay_ratio: 0.2
19
+ warmup_steps: 100
20
+ min_lr_ratio: 0.1
21
+ lr_multiplier:
22
+ - 2.0
23
+ - 1.5
24
+ - 1.0
25
+ load_balancing_weight: 0.5
26
+ load_balancing_N: 2.5
27
+ max_grad_norm: 1.0
28
+ use_amp: true
29
+ resume: false
30
+ resume_checkpoint: null
31
+ warmup_model: true
32
+ data:
33
+ path: ${oc.env:PROJECT_ROOT}/code_completion_exp/datasets/data_V4_full
34
+ max_context_len: 4096
35
+ max_target_len: 256
36
+ num_workers: 0
37
+ pin_memory: true
38
+ max_train_samples: 150000
39
+ max_val_samples: null
40
+ logging:
41
+ log_interval: 10
42
+ save_interval: 3000
43
+ eval_interval: 1000
44
+ save_every_epoch: false
45
+ model_only_checkpoints: true
46
+ tracking:
47
+ enabled: true
48
+ backend: wandb
49
+ project: routing-evolution
50
+ run_name: routing_N2.5
51
+ entity: null
52
+ base_url: https://wandb.platun0v.ru
53
+ local_dir: ${paths.output_dir}
54
+ paths:
55
+ output_dir: outputs/${now:%Y-%m-%d}/N_${training.load_balancing_N}
56
+ seed: 42
57
+ device: cuda
routing_rerun/N_2.5/.hydra/hydra.yaml ADDED
@@ -0,0 +1,162 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ hydra:
2
+ run:
3
+ dir: ${paths.output_dir}
4
+ sweep:
5
+ dir: outputs/multirun/${now:%Y-%m-%d}/${now:%H-%M-%S}
6
+ subdir: ${hydra.job.num}
7
+ launcher:
8
+ _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
9
+ sweeper:
10
+ _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
11
+ max_batch_size: null
12
+ params: null
13
+ help:
14
+ app_name: ${hydra.job.name}
15
+ header: '${hydra.help.app_name} is powered by Hydra.
16
+
17
+ '
18
+ footer: 'Powered by Hydra (https://hydra.cc)
19
+
20
+ Use --hydra-help to view Hydra specific help
21
+
22
+ '
23
+ template: '${hydra.help.header}
24
+
25
+ == Configuration groups ==
26
+
27
+ Compose your configuration from those groups (group=option)
28
+
29
+
30
+ $APP_CONFIG_GROUPS
31
+
32
+
33
+ == Config ==
34
+
35
+ Override anything in the config (foo.bar=value)
36
+
37
+
38
+ $CONFIG
39
+
40
+
41
+ ${hydra.help.footer}
42
+
43
+ '
44
+ hydra_help:
45
+ template: 'Hydra (${hydra.runtime.version})
46
+
47
+ See https://hydra.cc for more info.
48
+
49
+
50
+ == Flags ==
51
+
52
+ $FLAGS_HELP
53
+
54
+
55
+ == Configuration groups ==
56
+
57
+ Compose your configuration from those groups (For example, append hydra/job_logging=disabled
58
+ to command line)
59
+
60
+
61
+ $HYDRA_CONFIG_GROUPS
62
+
63
+
64
+ Use ''--cfg hydra'' to Show the Hydra config.
65
+
66
+ '
67
+ hydra_help: ???
68
+ hydra_logging:
69
+ version: 1
70
+ formatters:
71
+ simple:
72
+ format: '[%(asctime)s][HYDRA] %(message)s'
73
+ handlers:
74
+ console:
75
+ class: logging.StreamHandler
76
+ formatter: simple
77
+ stream: ext://sys.stdout
78
+ root:
79
+ level: INFO
80
+ handlers:
81
+ - console
82
+ loggers:
83
+ logging_example:
84
+ level: DEBUG
85
+ disable_existing_loggers: false
86
+ job_logging:
87
+ version: 1
88
+ formatters:
89
+ simple:
90
+ format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
91
+ handlers:
92
+ console:
93
+ class: logging.StreamHandler
94
+ formatter: simple
95
+ stream: ext://sys.stdout
96
+ file:
97
+ class: logging.FileHandler
98
+ formatter: simple
99
+ filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log
100
+ root:
101
+ level: INFO
102
+ handlers:
103
+ - console
104
+ - file
105
+ disable_existing_loggers: false
106
+ env: {}
107
+ mode: RUN
108
+ searchpath: []
109
+ callbacks: {}
110
+ output_subdir: .hydra
111
+ overrides:
112
+ hydra:
113
+ - hydra.mode=RUN
114
+ task:
115
+ - training.load_balancing_N=2.5
116
+ - tracking.run_name=routing_N2.5
117
+ job:
118
+ name: train
119
+ chdir: false
120
+ override_dirname: tracking.run_name=routing_N2.5,training.load_balancing_N=2.5
121
+ id: ???
122
+ num: ???
123
+ config_name: config
124
+ env_set: {}
125
+ env_copy: []
126
+ config:
127
+ override_dirname:
128
+ kv_sep: '='
129
+ item_sep: ','
130
+ exclude_keys: []
131
+ runtime:
132
+ version: 1.3.2
133
+ version_base: '1.3'
134
+ cwd: /workspace/byte-llms-code/routing_evolution_exp
135
+ config_sources:
136
+ - path: hydra.conf
137
+ schema: pkg
138
+ provider: hydra
139
+ - path: /workspace/byte-llms-code/routing_evolution_exp/configs
140
+ schema: file
141
+ provider: main
142
+ - path: ''
143
+ schema: structured
144
+ provider: schema
145
+ output_dir: /workspace/byte-llms-code/routing_evolution_exp/outputs/2026-05-23/N_2.5
146
+ choices:
147
+ paths: default
148
+ tracking: default
149
+ logging: default
150
+ data: default
151
+ training: default
152
+ model: hnet_xl_code
153
+ hydra/env: default
154
+ hydra/callbacks: null
155
+ hydra/job_logging: default
156
+ hydra/hydra_logging: default
157
+ hydra/hydra_help: default
158
+ hydra/help: default
159
+ hydra/sweeper: basic
160
+ hydra/launcher: basic
161
+ hydra/output: default
162
+ verbose: false
routing_rerun/N_2.5/.hydra/overrides.yaml ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ - training.load_balancing_N=2.5
2
+ - tracking.run_name=routing_N2.5
routing_rerun/N_2.5/eval_results/eval_config.yaml ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model:
2
+ config_path: /workspace/byte-llms-code/hnet_project/configs/hnet_2stage_XL_code.json
3
+ checkpoint_path: /workspace/byte-llms-code/hnet_project/checkpoints/hnet_2stage_XL_code.pt
4
+ data:
5
+ path: /workspace/byte-llms-code/code_completion_exp/datasets/data_V4_full
6
+ max_context_len: 4096
7
+ max_target_len: 256
8
+ num_workers: 0
9
+ pin_memory: true
10
+ max_train_samples: 150000
11
+ max_val_samples: null
12
+ evaluation:
13
+ batch_size: 16
14
+ max_samples: 2000
15
+ compute_bpb: true
16
+ bleu_tokenize: none
17
+ use_amp: true
18
+ save_predictions: true
19
+ generation:
20
+ max_length: 256
21
+ temperature: 0.1
22
+ top_k: 0
23
+ top_p: 1.0
24
+ paths:
25
+ run_dir: outputs/2026-05-23/N_2.5
26
+ eval_initial: true
27
+ eval_final: true
28
+ seed: 42
29
+ device: cuda
routing_rerun/N_2.5/eval_results/metrics_model_best.txt ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Checkpoint: model_best (outputs/2026-05-23/N_2.5/model_best.pt)
2
+ ================================================================================
3
+
4
+ exact_match: 0.3755
5
+ token_accuracy: 0.4797733652504865
6
+ bleu: 19.493683848607724
7
+ bpb: 1.328336302401117
8
+ num_samples: 2000
9
+ gen_wall_time_s: 377.6293431438389
10
+ gen_samples_per_s: 5.2961986040322095
11
+ gen_time_per_sample_ms: 188.81467157191946
12
+ gen_chars_per_s: 145.518352844389
13
+ gen_batch_mean_ms: 3021.0347451507114
14
+ gen_batch_p50_ms: 2677.273398032412
15
+ gen_batch_p95_ms: 6516.014387598256
16
+ gen_batch_max_ms: 14755.625324964058
17
+ gen_num_batches: 125
routing_rerun/N_2.5/eval_results/metrics_model_final.txt ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Checkpoint: model_final (outputs/2026-05-23/N_2.5/model_final.pt)
2
+ ================================================================================
3
+
4
+ exact_match: 0.3565
5
+ token_accuracy: 0.475812423579658
6
+ bleu: 18.406700285414495
7
+ bpb: 1.4244987555298723
8
+ num_samples: 2000
9
+ gen_wall_time_s: 461.3162157420302
10
+ gen_samples_per_s: 4.335420979691743
11
+ gen_time_per_sample_ms: 230.6581078710151
12
+ gen_chars_per_s: 122.02909431538349
13
+ gen_batch_mean_ms: 3690.5297259362414
14
+ gen_batch_p50_ms: 3152.6444799965248
15
+ gen_batch_p95_ms: 7729.715668002599
16
+ gen_batch_max_ms: 16093.165102996863
17
+ gen_num_batches: 125
routing_rerun/N_2.5/eval_results/metrics_pretrained.txt ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Checkpoint: pretrained (/workspace/byte-llms-code/hnet_project/checkpoints/hnet_2stage_XL_code.pt)
2
+ ================================================================================
3
+
4
+ exact_match: 0.0
5
+ token_accuracy: 0.46733945270119
6
+ bleu: 1.1096613769578676
7
+ bpb: 1.876943169957491
8
+ num_samples: 2000
9
+ gen_wall_time_s: 1815.3716653165757
10
+ gen_samples_per_s: 1.1017027742642593
11
+ gen_time_per_sample_ms: 907.6858326582878
12
+ gen_chars_per_s: 281.1892516296283
13
+ gen_batch_mean_ms: 14522.973322532605
14
+ gen_batch_p50_ms: 14979.340799036436
15
+ gen_batch_p95_ms: 15534.628321009222
16
+ gen_batch_max_ms: 34090.05431999685
17
+ gen_num_batches: 125
routing_rerun/N_2.5/eval_results/predictions_model_best.txt ADDED
The diff for this file is too large to render. See raw diff
 
routing_rerun/N_2.5/eval_results/predictions_model_final.txt ADDED
The diff for this file is too large to render. See raw diff
 
routing_rerun/N_2.5/eval_results/predictions_pretrained.txt ADDED
The diff for this file is too large to render. See raw diff
 
routing_rerun/N_2.5/eval_results/summary.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ EVALUATION SUMMARY
2
+ =============================================================================
3
+
4
+ Checkpoint Exact Match Token Acc BLEU BPB ms/sample
5
+ -----------------------------------------------------------------------------
6
+ pretrained 0.00% 46.73% 1.11 1.8769 907.7
7
+ model_best 37.55% 47.98% 19.49 1.3283 188.8
8
+ model_final 35.65% 47.58% 18.41 1.4245 230.7
routing_rerun/N_2.5/train.log ADDED
The diff for this file is too large to render. See raw diff
 
routing_rerun/N_2.5/wandb/debug-internal.log ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2026-05-23T17:28:20.034273503Z","level":"INFO","msg":"stream: starting","core version":"0.24.0"}
2
+ {"time":"2026-05-23T17:28:20.495979525Z","level":"INFO","msg":"stream: created new stream","id":"hcp9axm4"}
3
+ {"time":"2026-05-23T17:28:20.496106936Z","level":"INFO","msg":"handler: started","stream_id":"hcp9axm4"}
4
+ {"time":"2026-05-23T17:28:20.496281243Z","level":"INFO","msg":"stream: started","id":"hcp9axm4"}
5
+ {"time":"2026-05-23T17:28:20.496287355Z","level":"INFO","msg":"writer: started","stream_id":"hcp9axm4"}
6
+ {"time":"2026-05-23T17:28:20.496309559Z","level":"INFO","msg":"sender: started","stream_id":"hcp9axm4"}
7
+ {"time":"2026-05-23T17:28:20.630740567Z","level":"ERROR","msg":"git repo not found","error":"repository does not exist"}
8
+ {"time":"2026-05-24T00:08:43.285793193Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
9
+ {"time":"2026-05-24T00:08:43.389899668Z","level":"INFO","msg":"handler: operation stats","stats":{}}
10
+ {"time":"2026-05-24T00:08:43.395126237Z","level":"INFO","msg":"stream: closing","id":"hcp9axm4"}
11
+ {"time":"2026-05-24T00:08:43.395149698Z","level":"INFO","msg":"handler: closed","stream_id":"hcp9axm4"}
12
+ {"time":"2026-05-24T00:08:43.395348295Z","level":"INFO","msg":"sender: closed","stream_id":"hcp9axm4"}
13
+ {"time":"2026-05-24T00:08:43.395364596Z","level":"INFO","msg":"stream: closed","id":"hcp9axm4"}
routing_rerun/N_2.5/wandb/debug.log ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2026-05-23 17:28:19,720 INFO MainThread:15014 [wandb_setup.py:_flush():81] Current SDK version is 0.24.0
2
+ 2026-05-23 17:28:19,721 INFO MainThread:15014 [wandb_setup.py:_flush():81] Configure stats pid to 15014
3
+ 2026-05-23 17:28:19,721 INFO MainThread:15014 [wandb_setup.py:_flush():81] Loading settings from environment variables
4
+ 2026-05-23 17:28:19,721 INFO MainThread:15014 [wandb_init.py:setup_run_log_directory():717] Logging user logs to outputs/2026-05-23/N_2.5/wandb/run-20260523_172819-hcp9axm4/logs/debug.log
5
+ 2026-05-23 17:28:19,721 INFO MainThread:15014 [wandb_init.py:setup_run_log_directory():718] Logging internal logs to outputs/2026-05-23/N_2.5/wandb/run-20260523_172819-hcp9axm4/logs/debug-internal.log
6
+ 2026-05-23 17:28:19,721 INFO MainThread:15014 [wandb_init.py:init():844] calling init triggers
7
+ 2026-05-23 17:28:19,721 INFO MainThread:15014 [wandb_init.py:init():849] wandb.init called with sweep_config: {}
8
+ config: {'model': {'config_path': '/workspace/byte-llms-code/hnet_project/configs/hnet_2stage_XL_code.json', 'checkpoint_path': '/workspace/byte-llms-code/hnet_project/checkpoints/hnet_2stage_XL_code.pt'}, 'training': {'epochs': 3, 'max_steps': None, 'batch_size': 8, 'eval_batch_size': 24, 'gradient_accumulation_steps': 4, 'lr': 0.0001, 'weight_decay': 0.1, 'betas': [0.9, 0.95], 'eps': 1e-08, 'lr_scheduler': 'wsd', 'warmup_ratio': 0.1, 'decay_ratio': 0.2, 'warmup_steps': 100, 'min_lr_ratio': 0.1, 'lr_multiplier': [2.0, 1.5, 1.0], 'load_balancing_weight': 0.5, 'load_balancing_N': 2.5, 'max_grad_norm': 1.0, 'use_amp': True, 'resume': False, 'resume_checkpoint': None, 'warmup_model': True}, 'data': {'path': '/workspace/byte-llms-code/code_completion_exp/datasets/data_V4_full', 'max_context_len': 4096, 'max_target_len': 256, 'num_workers': 0, 'pin_memory': True, 'max_train_samples': 150000, 'max_val_samples': None}, 'logging': {'log_interval': 10, 'save_interval': 3000, 'eval_interval': 1000, 'save_every_epoch': False, 'model_only_checkpoints': True}, 'tracking': {'enabled': True, 'backend': 'wandb', 'project': 'routing-evolution', 'run_name': 'routing_N2.5', 'entity': None, 'base_url': 'https://wandb.platun0v.ru', 'local_dir': 'outputs/2026-05-23/N_2.5'}, 'paths': {'output_dir': 'outputs/2026-05-23/N_2.5'}, 'seed': 42, 'device': 'cuda', '_wandb': {'code_path': 'code/routing_evolution_exp/train.py'}}
9
+ 2026-05-23 17:28:19,721 INFO MainThread:15014 [wandb_init.py:init():892] starting backend
10
+ 2026-05-23 17:28:20,009 INFO MainThread:15014 [wandb_init.py:init():895] sending inform_init request
11
+ 2026-05-23 17:28:20,030 INFO MainThread:15014 [wandb_init.py:init():903] backend started and connected
12
+ 2026-05-23 17:28:20,037 INFO MainThread:15014 [wandb_init.py:init():973] updated telemetry
13
+ 2026-05-23 17:28:20,063 INFO MainThread:15014 [wandb_init.py:init():997] communicating run to backend with 90.0 second timeout
14
+ 2026-05-23 17:28:20,628 INFO MainThread:15014 [wandb_init.py:init():1044] starting run threads in backend
15
+ 2026-05-23 17:28:20,879 INFO MainThread:15014 [wandb_run.py:_console_start():2529] atexit reg
16
+ 2026-05-23 17:28:20,879 INFO MainThread:15014 [wandb_run.py:_redirect():2377] redirect: wrap_raw
17
+ 2026-05-23 17:28:20,879 INFO MainThread:15014 [wandb_run.py:_redirect():2446] Wrapping output streams.
18
+ 2026-05-23 17:28:20,879 INFO MainThread:15014 [wandb_run.py:_redirect():2469] Redirects installed.
19
+ 2026-05-23 17:28:20,883 INFO MainThread:15014 [wandb_init.py:init():1084] run started, returning control to user process
20
+ 2026-05-24 00:08:42,341 INFO MainThread:15014 [wandb_run.py:_finish():2295] finishing run nikita/routing-evolution/hcp9axm4
21
+ 2026-05-24 00:08:42,342 INFO MainThread:15014 [wandb_run.py:_atexit_cleanup():2494] got exitcode: 0
22
+ 2026-05-24 00:08:42,342 INFO MainThread:15014 [wandb_run.py:_restore():2476] restore
23
+ 2026-05-24 00:08:42,342 INFO MainThread:15014 [wandb_run.py:_restore():2482] restore done
24
+ 2026-05-24 00:08:43,394 INFO MainThread:15014 [wandb_run.py:_footer_sync_info():3870] logging synced files
routing_rerun/N_2.5/wandb/run-20260523_172658-l6q9x03b/files/code/routing_evolution_exp/train.py ADDED
@@ -0,0 +1,259 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Routing Evolution Experiment: H-Net fine-tune с разными load_balancing_N.
3
+
4
+ Цель: изучить как routing module (W_q, W_k) эволюционирует при разном давлении сжатия.
5
+
6
+ Использование:
7
+ # Запуск с конкретным N
8
+ accelerate launch --config_file accelerate_config.yaml \
9
+ train.py training.load_balancing_N=6.0 tracking.run_name=routing_N6.0
10
+
11
+ # Запуск всех N через run_all.sh
12
+ bash run_all.sh
13
+ """
14
+
15
+ import math
16
+ import os
17
+ from pathlib import Path
18
+
19
+ import torch
20
+ import hydra
21
+ from hydra.core.hydra_config import HydraConfig
22
+ from omegaconf import DictConfig, OmegaConf
23
+ from accelerate import Accelerator
24
+ from accelerate.utils import set_seed as accelerate_set_seed
25
+
26
+ # HNet imports
27
+ from hnet.load_utils import load_from_pretrained, load_from_config
28
+ from hnet.utils.tokenizers import ByteTokenizer
29
+ from hnet.utils.train import group_params
30
+
31
+ # Ensure repo root is on sys.path (needed when running from subdirectory)
32
+ import sys
33
+ sys.path.insert(0, str(Path(__file__).resolve().parents[1]))
34
+
35
+ # Shared training library
36
+ from training_lib.utils import log_message
37
+ from training_lib.checkpointing import save_checkpoint, load_checkpoint
38
+ from training_lib.schedulers import get_lr_scheduler
39
+ from training_lib.tracking import init_tracking, finish_tracking
40
+ from training_lib.hnet.train_loop import train_epoch
41
+ from training_lib.hnet.data import create_dataloaders
42
+
43
+
44
+ def save_routing_weights(model, step, output_dir):
45
+ """Сохраняет только routing W_q/W_k для быстрого анализа weight evolution."""
46
+ routing_dir = output_dir / "routing_weights"
47
+ routing_dir.mkdir(parents=True, exist_ok=True)
48
+
49
+ state = model.state_dict()
50
+ routing_keys = [k for k in state if "routing_module" in k and "proj_layer" in k]
51
+ routing_weights = {k: state[k].cpu() for k in routing_keys}
52
+ torch.save(routing_weights, routing_dir / f"routing_step_{step}.pt")
53
+
54
+
55
+ def on_checkpoint_callback(model, step, output_dir):
56
+ """Callback для train_epoch: сохраняет routing weights при каждом checkpoint.
57
+
58
+ Модель уже unwrapped в train_loop перед вызовом callback.
59
+ """
60
+ save_routing_weights(model, step, output_dir)
61
+
62
+
63
+ @hydra.main(version_base=None, config_path="configs", config_name="config")
64
+ def main(cfg: DictConfig):
65
+ """Тренировка H-Net с фиксированным числом шагов для анализа routing evolution."""
66
+
67
+ # === Accelerator Setup ===
68
+ mixed_precision = "bf16" if cfg.training.use_amp else "no"
69
+
70
+ accelerator = Accelerator(
71
+ mixed_precision=mixed_precision,
72
+ gradient_accumulation_steps=cfg.training.gradient_accumulation_steps,
73
+ )
74
+
75
+ accelerate_set_seed(cfg.seed)
76
+
77
+ if cfg.paths.output_dir is None:
78
+ cfg.paths.output_dir = HydraConfig.get().runtime.output_dir
79
+
80
+ OmegaConf.resolve(cfg)
81
+
82
+ log_message(
83
+ f"CUDA_VISIBLE_DEVICES: {os.environ.get('CUDA_VISIBLE_DEVICES', 'not set')}",
84
+ cfg,
85
+ accelerator,
86
+ )
87
+ log_message(f"Number of processes: {accelerator.num_processes}", cfg, accelerator)
88
+ log_message(f"Mixed precision: {mixed_precision}", cfg, accelerator)
89
+
90
+ log_message("=" * 60, cfg, accelerator)
91
+ log_message(
92
+ f"Routing Evolution Experiment | N={cfg.training.load_balancing_N}",
93
+ cfg,
94
+ accelerator,
95
+ )
96
+ log_message("=" * 60, cfg, accelerator)
97
+ log_message(f"Config:\n{OmegaConf.to_yaml(cfg)}", cfg, accelerator)
98
+
99
+ # === Experiment Tracking Init ===
100
+ init_tracking(cfg, accelerator)
101
+
102
+ # === Tokenizer ===
103
+ tokenizer = ByteTokenizer()
104
+
105
+ # === Model ===
106
+ log_message("Loading model...", cfg, accelerator)
107
+ if cfg.model.checkpoint_path:
108
+ model = load_from_pretrained(
109
+ model_path=cfg.model.checkpoint_path,
110
+ model_config_path=cfg.model.config_path,
111
+ )
112
+ log_message(f"Loaded pretrained: {cfg.model.checkpoint_path}", cfg, accelerator)
113
+ else:
114
+ model = load_from_config(
115
+ model_config_path=cfg.model.config_path,
116
+ device="cpu",
117
+ )
118
+ model.init_weights()
119
+ log_message("Initialized from scratch", cfg, accelerator)
120
+
121
+ model.train()
122
+
123
+ lr_multiplier = list(cfg.training.lr_multiplier)
124
+ model.apply_lr_multiplier(lr_multiplier)
125
+ log_message(f"Applied LR multipliers: {lr_multiplier}", cfg, accelerator)
126
+
127
+ if cfg.training.warmup_model:
128
+ log_message("Warming up model...", cfg, accelerator)
129
+ model = model.to(accelerator.device)
130
+ model.warmup(verbose=accelerator.is_main_process)
131
+
132
+ # Save initial routing weights (step 0)
133
+ if accelerator.is_main_process:
134
+ output_dir = Path(cfg.paths.output_dir)
135
+ save_routing_weights(model, 0, output_dir)
136
+
137
+ total_params = sum(p.numel() for p in model.parameters())
138
+ trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
139
+ log_message(f"Total params: {total_params:,}", cfg, accelerator)
140
+ log_message(f"Trainable params: {trainable_params:,}", cfg, accelerator)
141
+
142
+ # === Data ===
143
+ log_message("Creating dataloaders...", cfg, accelerator)
144
+ dataloaders = create_dataloaders(cfg, tokenizer)
145
+
146
+ train_dataloader = dataloaders["train"]
147
+ val_dataloader = dataloaders.get("validation", None)
148
+
149
+ log_message(
150
+ f"Train dataset size: {len(train_dataloader.dataset)} "
151
+ f"(max_train_samples={cfg.data.max_train_samples}) | "
152
+ f"Epochs: {cfg.training.epochs}",
153
+ cfg,
154
+ accelerator,
155
+ )
156
+
157
+ # === Optimizer ===
158
+ param_groups = group_params(model)
159
+
160
+ for group in param_groups:
161
+ if "lr" not in group:
162
+ group["lr"] = cfg.training.lr
163
+ else:
164
+ group["lr"] = cfg.training.lr * group.get("lr_multiplier", 1.0)
165
+ if "weight_decay" not in group:
166
+ group["weight_decay"] = cfg.training.weight_decay
167
+
168
+ optimizer = torch.optim.AdamW(
169
+ param_groups,
170
+ lr=cfg.training.lr,
171
+ betas=tuple(cfg.training.betas),
172
+ eps=cfg.training.eps,
173
+ )
174
+
175
+ # === Scheduler ===
176
+ steps_per_epoch = math.ceil(len(train_dataloader) / accelerator.num_processes)
177
+ max_steps = (
178
+ cfg.training.epochs
179
+ * steps_per_epoch
180
+ // cfg.training.gradient_accumulation_steps
181
+ )
182
+ scheduler = get_lr_scheduler(optimizer, cfg, max_steps)
183
+
184
+ log_message(
185
+ f"Max steps: {max_steps}, Steps per epoch: {steps_per_epoch}",
186
+ cfg,
187
+ accelerator,
188
+ )
189
+
190
+ # === Accelerate Prepare ===
191
+ if val_dataloader is not None:
192
+ model, optimizer, train_dataloader, val_dataloader, scheduler = (
193
+ accelerator.prepare(
194
+ model, optimizer, train_dataloader, val_dataloader, scheduler
195
+ )
196
+ )
197
+ else:
198
+ model, optimizer, train_dataloader, scheduler = accelerator.prepare(
199
+ model, optimizer, train_dataloader, scheduler
200
+ )
201
+
202
+ # === Training Loop ===
203
+ log_message("Starting training...", cfg, accelerator)
204
+
205
+ global_step = 0
206
+ best_val_loss = float("inf")
207
+ epoch = 0
208
+
209
+ num_epochs = cfg.training.epochs
210
+
211
+ try:
212
+ for epoch in range(1, num_epochs + 1):
213
+ log_message(f"\n{'=' * 60}", cfg, accelerator)
214
+ log_message(
215
+ f"EPOCH {epoch}/{num_epochs} (step {global_step})", cfg, accelerator
216
+ )
217
+ log_message(f"{'=' * 60}", cfg, accelerator)
218
+
219
+ global_step, best_val_loss = train_epoch(
220
+ model=model,
221
+ dataloader=train_dataloader,
222
+ optimizer=optimizer,
223
+ scheduler=scheduler,
224
+ cfg=cfg,
225
+ epoch=epoch,
226
+ global_step=global_step,
227
+ accelerator=accelerator,
228
+ val_dataloader=val_dataloader,
229
+ best_val_loss=best_val_loss,
230
+ max_steps=max_steps,
231
+ on_checkpoint=on_checkpoint_callback,
232
+ )
233
+
234
+ except KeyboardInterrupt:
235
+ log_message("Training interrupted by user", cfg, accelerator)
236
+ save_checkpoint(
237
+ model, optimizer, scheduler, global_step, epoch, cfg, accelerator,
238
+ model_only=True,
239
+ )
240
+
241
+ # === Final Save ===
242
+ log_message("\nTraining completed!", cfg, accelerator)
243
+
244
+ if accelerator.is_main_process:
245
+ final_model_path = Path(cfg.paths.output_dir) / "model_final.pt"
246
+ unwrapped_model = accelerator.unwrap_model(model)
247
+ torch.save(unwrapped_model.state_dict(), final_model_path)
248
+ log_message(f"Final model: {final_model_path}", cfg, accelerator)
249
+
250
+ # Save final routing weights
251
+ save_routing_weights(unwrapped_model, global_step, Path(cfg.paths.output_dir))
252
+
253
+ accelerator.wait_for_everyone()
254
+ accelerator.end_training()
255
+ finish_tracking()
256
+
257
+
258
+ if __name__ == "__main__":
259
+ main()
routing_rerun/N_2.5/wandb/run-20260523_172658-l6q9x03b/files/config.yaml ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _wandb:
2
+ value:
3
+ cli_version: 0.24.0
4
+ code_path: code/routing_evolution_exp/train.py
5
+ e:
6
+ xele1blpla9o3dqe7kdco4df3hrq09ni:
7
+ args:
8
+ - training.load_balancing_N=2.5
9
+ - tracking.run_name=routing_N2.5
10
+ codePath: routing_evolution_exp/train.py
11
+ codePathLocal: train.py
12
+ cpu_count: 36
13
+ cpu_count_logical: 72
14
+ cudaVersion: "13.2"
15
+ disk:
16
+ /:
17
+ total: "234075717632"
18
+ used: "19049525248"
19
+ email: nikita@local.ru
20
+ executable: /venv/bytellm/bin/python
21
+ git:
22
+ commit: 15d97dd48d00266a0b8f634920ab99cb7547da91
23
+ remote: https://github.com/naryst/byte-llms-code.git
24
+ gpu: NVIDIA A100 80GB PCIe
25
+ gpu_count: 1
26
+ gpu_nvidia:
27
+ - architecture: Ampere
28
+ cudaCores: 6912
29
+ memoryTotal: "85899345920"
30
+ name: NVIDIA A100 80GB PCIe
31
+ uuid: GPU-ec9764c4-224a-9508-002f-de7762d78498
32
+ host: 1a023b226280
33
+ memory:
34
+ total: "404274495488"
35
+ os: Linux-6.8.0-117-generic-x86_64-with-glibc2.39
36
+ program: /workspace/byte-llms-code/routing_evolution_exp/train.py
37
+ python: CPython 3.12.0
38
+ root: outputs/2026-05-23/N_2.5
39
+ startedAt: "2026-05-23T17:26:58.115428Z"
40
+ writerId: xele1blpla9o3dqe7kdco4df3hrq09ni
41
+ m: []
42
+ python_version: 3.12.0
43
+ t:
44
+ "1":
45
+ - 1
46
+ - 11
47
+ - 49
48
+ - 50
49
+ - 51
50
+ - 71
51
+ - 105
52
+ "2":
53
+ - 1
54
+ - 11
55
+ - 49
56
+ - 50
57
+ - 51
58
+ - 71
59
+ - 105
60
+ "3":
61
+ - 13
62
+ - 16
63
+ "4": 3.12.0
64
+ "5": 0.24.0
65
+ "6": 4.57.6
66
+ "12": 0.24.0
67
+ "13": linux-x86_64
68
+ data:
69
+ value:
70
+ max_context_len: 4096
71
+ max_target_len: 256
72
+ max_train_samples: 150000
73
+ max_val_samples: null
74
+ num_workers: 0
75
+ path: /workspace/byte-llms-code/code_completion_exp/datasets/data_V4_full
76
+ pin_memory: true
77
+ device:
78
+ value: cuda
79
+ logging:
80
+ value:
81
+ eval_interval: 1000
82
+ log_interval: 10
83
+ model_only_checkpoints: true
84
+ save_every_epoch: false
85
+ save_interval: 3000
86
+ model:
87
+ value:
88
+ checkpoint_path: /workspace/byte-llms-code/hnet_project/checkpoints/hnet_2stage_XL_code.pt
89
+ config_path: /workspace/byte-llms-code/hnet_project/configs/hnet_2stage_XL_code.json
90
+ paths:
91
+ value:
92
+ output_dir: outputs/2026-05-23/N_2.5
93
+ seed:
94
+ value: 42
95
+ tracking:
96
+ value:
97
+ backend: wandb
98
+ base_url: https://wandb.platun0v.ru
99
+ enabled: true
100
+ entity: null
101
+ local_dir: outputs/2026-05-23/N_2.5
102
+ project: routing-evolution
103
+ run_name: routing_N2.5
104
+ training:
105
+ value:
106
+ batch_size: 8
107
+ betas:
108
+ - 0.9
109
+ - 0.95
110
+ decay_ratio: 0.2
111
+ epochs: 3
112
+ eps: 1e-08
113
+ eval_batch_size: 24
114
+ gradient_accumulation_steps: 4
115
+ load_balancing_N: 2.5
116
+ load_balancing_weight: 0.5
117
+ lr: 0.0001
118
+ lr_multiplier:
119
+ - 2
120
+ - 1.5
121
+ - 1
122
+ lr_scheduler: wsd
123
+ max_grad_norm: 1
124
+ max_steps: null
125
+ min_lr_ratio: 0.1
126
+ resume: false
127
+ resume_checkpoint: null
128
+ use_amp: true
129
+ warmup_model: true
130
+ warmup_ratio: 0.1
131
+ warmup_steps: 100
132
+ weight_decay: 0.1
routing_rerun/N_2.5/wandb/run-20260523_172658-l6q9x03b/files/output.log ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [2026-05-23 17:26:59] Loading model...
2
+ [2026-05-23 17:27:05] Loaded pretrained: /workspace/byte-llms-code/hnet_project/checkpoints/hnet_2stage_XL_code.pt
3
+ [2026-05-23 17:27:05] Applied LR multipliers: [2.0, 1.5, 1.0]
4
+ [2026-05-23 17:27:05] Warming up model...
5
+ [WARMUP] Starting warmup (compiling Triton kernels)...
6
+ Traceback (most recent call last):
7
+ File "/workspace/byte-llms-code/routing_evolution_exp/train.py", line 259, in <module>
8
+ main()
9
+ File "/venv/bytellm/lib/python3.12/site-packages/hydra/main.py", line 94, in decorated_main
10
+ _run_hydra(
11
+ File "/venv/bytellm/lib/python3.12/site-packages/hydra/_internal/utils.py", line 394, in _run_hydra
12
+ _run_app(
13
+ File "/venv/bytellm/lib/python3.12/site-packages/hydra/_internal/utils.py", line 457, in _run_app
14
+ run_and_report(
15
+ File "/venv/bytellm/lib/python3.12/site-packages/hydra/_internal/utils.py", line 220, in run_and_report
16
+ return func()
17
+ ^^^^^^
18
+ File "/venv/bytellm/lib/python3.12/site-packages/hydra/_internal/utils.py", line 458, in <lambda>
19
+ lambda: hydra.run(
20
+ ^^^^^^^^^^
21
+ File "/venv/bytellm/lib/python3.12/site-packages/hydra/_internal/hydra.py", line 119, in run
22
+ ret = run_job(
23
+ ^^^^^^^^
24
+ File "/venv/bytellm/lib/python3.12/site-packages/hydra/core/utils.py", line 186, in run_job
25
+ ret.return_value = task_function(task_cfg)
26
+ ^^^^^^^^^^^^^^^^^^^^^^^
27
+ File "/workspace/byte-llms-code/routing_evolution_exp/train.py", line 130, in main
28
+ model.warmup(verbose=accelerator.is_main_process)
29
+ File "/workspace/byte-llms-code/hnet_project/hnet/models/mixer_seq.py", line 126, in warmup
30
+ output = self.forward(dummy_input)
31
+ ^^^^^^^^^^^^^^^^^^^^^^^^^
32
+ File "/workspace/byte-llms-code/hnet_project/hnet/models/mixer_seq.py", line 181, in forward
33
+ hidden_states, bpred_output = self.backbone(
34
+ ^^^^^^^^^^^^^^
35
+ File "/venv/bytellm/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
36
+ return self._call_impl(*args, **kwargs)
37
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
38
+ File "/venv/bytellm/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
39
+ return forward_call(*args, **kwargs)
40
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
41
+ File "/workspace/byte-llms-code/hnet_project/hnet/models/hnet.py", line 333, in forward
42
+ hidden_states = self.encoder(
43
+ ^^^^^^^^^^^^^
44
+ File "/venv/bytellm/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
45
+ return self._call_impl(*args, **kwargs)
46
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
47
+ File "/venv/bytellm/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
48
+ return forward_call(*args, **kwargs)
49
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
50
+ File "/workspace/byte-llms-code/hnet_project/hnet/modules/isotropic.py", line 180, in forward
51
+ hidden_states, residual = layer(
52
+ ^^^^^^
53
+ File "/venv/bytellm/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
54
+ return self._call_impl(*args, **kwargs)
55
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
56
+ File "/venv/bytellm/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
57
+ return forward_call(*args, **kwargs)
58
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
59
+ File "/workspace/byte-llms-code/hnet_project/hnet/modules/block.py", line 226, in forward
60
+ hidden_states = self.mixer(
61
+ ^^^^^^^^^^^
62
+ File "/venv/bytellm/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
63
+ return self._call_impl(*args, **kwargs)
64
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
65
+ File "/venv/bytellm/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
66
+ return forward_call(*args, **kwargs)
67
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
68
+ File "/venv/bytellm/lib/python3.12/site-packages/mamba_ssm/modules/mamba2.py", line 185, in forward
69
+ out = mamba_split_conv1d_scan_combined(
70
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
71
+ File "/venv/bytellm/lib/python3.12/site-packages/mamba_ssm/ops/triton/ssd_combined.py", line 930, in mamba_split_conv1d_scan_combined
72
+ return MambaSplitConv1dScanCombinedFn.apply(zxbcdt, conv1d_weight, conv1d_bias, dt_bias, A, D, chunk_size, initial_states, seq_idx, dt_limit, return_final_states, activation, rmsnorm_weight, rmsnorm_eps, outproj_weight, outproj_bias, headdim, ngroups, norm_before_gate)
73
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
74
+ File "/venv/bytellm/lib/python3.12/site-packages/torch/autograd/function.py", line 575, in apply
75
+ return super().apply(*args, **kwargs) # type: ignore[misc]
76
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
77
+ File "/venv/bytellm/lib/python3.12/site-packages/torch/amp/autocast_mode.py", line 503, in decorate_fwd
78
+ return fwd(*args, **kwargs)
79
+ ^^^^^^^^^^^^^^^^^^^^
80
+ File "/venv/bytellm/lib/python3.12/site-packages/mamba_ssm/ops/triton/ssd_combined.py", line 795, in forward
81
+ out_x, _, dt_out, dA_cumsum, states, final_states = _mamba_chunk_scan_combined_fwd(x, dt, A, B, C, chunk_size=chunk_size, D=D, z=None, dt_bias=dt_bias, initial_states=initial_states, seq_idx=seq_idx, dt_softplus=True, dt_limit=dt_limit)
82
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
83
+ File "/venv/bytellm/lib/python3.12/site-packages/mamba_ssm/ops/triton/ssd_combined.py", line 324, in _mamba_chunk_scan_combined_fwd
84
+ out, out_x = _chunk_scan_fwd(CB, x, dt, dA_cumsum, C, states, D=D, z=z, seq_idx=seq_idx)
85
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
86
+ File "/venv/bytellm/lib/python3.12/site-packages/mamba_ssm/ops/triton/ssd_chunk_scan.py", line 1257, in _chunk_scan_fwd
87
+ _chunk_scan_fwd_kernel[grid](
88
+ File "/venv/bytellm/lib/python3.12/site-packages/triton/runtime/jit.py", line 330, in <lambda>
89
+ return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)
90
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
91
+ File "/venv/bytellm/lib/python3.12/site-packages/triton/runtime/autotuner.py", line 186, in run
92
+ timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}
93
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
94
+ File "/venv/bytellm/lib/python3.12/site-packages/triton/runtime/autotuner.py", line 166, in _bench
95
+ return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))
96
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
97
+ File "/venv/bytellm/lib/python3.12/site-packages/triton/testing.py", line 140, in do_bench
98
+ fn()
99
+ File "/venv/bytellm/lib/python3.12/site-packages/triton/runtime/autotuner.py", line 152, in kernel_call
100
+ self.fn.run(
101
+ File "/venv/bytellm/lib/python3.12/site-packages/triton/runtime/jit.py", line 580, in run
102
+ bound_args, sig_and_spec, constexpr_vals, non_constexpr_vals, excess_kwargs = self.binder(*args, **kwargs)
103
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
104
+ File "<string>", line 2, in dynamic_func
105
+ File "/venv/bytellm/lib/python3.12/site-packages/triton/runtime/jit.py", line 294, in mangle_type
106
+ def mangle_type(arg, is_const=False):
107
+
108
+ KeyboardInterrupt
routing_rerun/N_2.5/wandb/run-20260523_172658-l6q9x03b/files/requirements.txt ADDED
@@ -0,0 +1,245 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ setuptools==78.1.1
2
+ wheel==0.45.1
3
+ pip==25.2
4
+ webencodings==0.5.1
5
+ triton==3.2.0
6
+ pytz==2025.2
7
+ pydub==0.25.1
8
+ pure_eval==0.2.3
9
+ ptyprocess==0.7.0
10
+ nvidia-ml-py==13.590.48
11
+ nvidia-cusparselt-cu12==0.6.2
12
+ mpmath==1.3.0
13
+ ipython-genutils==0.2.0
14
+ fastjsonschema==2.21.2
15
+ brotli==1.2.0
16
+ antlr4-python3-runtime==4.9.3
17
+ xxhash==3.6.0
18
+ widgetsnbextension==4.0.14
19
+ websocket-client==1.9.0
20
+ webcolors==24.11.1
21
+ wcwidth==0.2.14
22
+ urllib3==2.5.0
23
+ uri-template==1.3.0
24
+ tzdata==2025.2
25
+ typing_extensions==4.15.0
26
+ types-python-dateutil==2.9.0.20251008
27
+ traitlets==5.14.3
28
+ tqdm==4.67.1
29
+ tornado==6.5.2
30
+ tomlkit==0.13.3
31
+ tinycss2==1.4.0
32
+ tabulate==0.9.0
33
+ sympy==1.13.1
34
+ soupsieve==2.8
35
+ sniffio==1.3.1
36
+ smmap==5.0.2
37
+ six==1.17.0
38
+ shellingham==1.5.4
39
+ Send2Trash==1.8.3
40
+ semantic-version==2.10.0
41
+ safetensors==0.6.2
42
+ rpds-py==0.27.1
43
+ rfc3986-validator==0.1.1
44
+ regex==2025.9.18
45
+ pyzmq==27.1.0
46
+ PyYAML==6.0.3
47
+ python-multipart==0.0.22
48
+ python-json-logger==4.0.0
49
+ python-dotenv==1.2.1
50
+ pyparsing==3.2.5
51
+ PyJWT==2.8.0
52
+ Pygments==2.19.2
53
+ pycparser==2.23
54
+ pyarrow==22.0.0
55
+ psutil==7.1.0
56
+ protobuf==6.33.4
57
+ propcache==0.4.1
58
+ prometheus_client==0.23.1
59
+ portalocker==3.2.0
60
+ platformdirs==4.5.0
61
+ pillow==11.3.0
62
+ pexpect==4.9.0
63
+ pathspec==1.0.4
64
+ parso==0.8.5
65
+ pandocfilters==1.5.1
66
+ packaging==25.0
67
+ orjson==3.11.6
68
+ opt_einsum==3.4.0
69
+ nvidia-nvtx-cu12==12.4.127
70
+ nvidia-nvjitlink-cu12==12.4.127
71
+ nvidia-nccl-cu12==2.21.5
72
+ nvidia-curand-cu12==10.3.5.147
73
+ nvidia-cufile-cu12==1.13.1.3
74
+ nvidia-cufft-cu12==11.2.1.3
75
+ nvidia-cuda-runtime-cu12==12.4.127
76
+ nvidia-cuda-nvrtc-cu12==12.4.127
77
+ nvidia-cuda-cupti-cu12==12.4.127
78
+ nvidia-cublas-cu12==12.4.5.8
79
+ numpy==2.3.3
80
+ ninja==1.13.0
81
+ networkx==3.5
82
+ nest-asyncio==1.6.0
83
+ narwhals==2.15.0
84
+ mypy_extensions==1.1.0
85
+ multidict==6.7.0
86
+ mistune==3.1.4
87
+ mdurl==0.1.2
88
+ MarkupSafe==3.0.3
89
+ lxml==6.0.2
90
+ librt==0.8.0
91
+ lark==1.3.0
92
+ kiwisolver==1.4.9
93
+ jupyterlab_widgets==3.0.15
94
+ jupyterlab_pygments==0.3.0
95
+ jsonpointer==3.0.0
96
+ json5==0.12.1
97
+ itsdangerous==2.2.0
98
+ idna==3.10
99
+ hf-xet==1.1.10
100
+ h11==0.16.0
101
+ groovy==0.1.2
102
+ fsspec==2025.9.0
103
+ frozenlist==1.8.0
104
+ fqdn==1.5.1
105
+ fonttools==4.60.1
106
+ filelock==3.19.1
107
+ ffmpy==1.0.0
108
+ executing==2.2.1
109
+ einops==0.8.1
110
+ dill==0.4.0
111
+ defusedxml==0.7.1
112
+ decorator==5.2.1
113
+ debugpy==1.8.17
114
+ dacite==1.9.2
115
+ cycler==0.12.1
116
+ comm==0.2.3
117
+ colorama==0.4.6
118
+ click==8.3.1
119
+ charset-normalizer==3.4.3
120
+ certifi==2025.10.5
121
+ bleach==6.2.0
122
+ babel==2.17.0
123
+ attrs==25.4.0
124
+ async-lru==2.0.5
125
+ asttokens==3.0.0
126
+ annotated-types==0.7.0
127
+ annotated-doc==0.0.4
128
+ aiohappyeyeballs==2.6.1
129
+ aiofiles==24.1.0
130
+ yarl==1.22.0
131
+ uvicorn==0.40.0
132
+ typing-inspection==0.4.2
133
+ terminado==0.18.1
134
+ stack-data==0.6.3
135
+ sentry-sdk==2.50.0
136
+ scipy==1.17.0
137
+ sacrebleu==2.6.0
138
+ rfc3987-syntax==1.1.0
139
+ rfc3339-validator==0.1.4
140
+ requests==2.32.5
141
+ reportlab==4.4.9
142
+ referencing==0.36.2
143
+ python-dateutil==2.9.0.post0
144
+ pydantic_core==2.41.5
145
+ prompt_toolkit==3.0.52
146
+ plotly==6.5.2
147
+ pathlib2==2.3.7.post1
148
+ orderedmultidict==1.0.2
149
+ optree==0.17.0
150
+ omegaconf==2.3.0
151
+ nvidia-cusparse-cu12==12.3.1.170
152
+ nvidia-cudnn-cu12==9.1.0.70
153
+ mypy==1.19.1
154
+ multiprocess==0.70.16
155
+ matplotlib-inline==0.1.7
156
+ markdown-it-py==4.0.0
157
+ jupyter_core==5.8.1
158
+ Jinja2==3.1.6
159
+ jedi==0.19.2
160
+ ipython_pygments_lexers==1.1.1
161
+ httpcore==1.0.9
162
+ gitdb==4.0.12
163
+ ftfy==6.3.1
164
+ contourpy==1.3.3
165
+ cffi==2.0.0
166
+ beautifulsoup4==4.14.2
167
+ anyio==4.11.0
168
+ aiosignal==1.4.0
169
+ starlette==0.50.0
170
+ rich==14.2.0
171
+ pydantic==2.12.5
172
+ pandas==2.3.3
173
+ nvidia-cusolver-cu12==11.6.1.9
174
+ matplotlib==3.10.7
175
+ jupyter_server_terminals==0.5.3
176
+ jupyter_client==8.6.3
177
+ jsonschema-specifications==2025.9.1
178
+ ipython==9.6.0
179
+ hydra-core==1.3.2
180
+ huggingface-hub==0.35.3
181
+ httpx==0.28.1
182
+ GitPython==3.1.46
183
+ furl==2.1.4
184
+ cryptography==46.0.4
185
+ arrow==1.3.0
186
+ argon2-cffi-bindings==25.1.0
187
+ aiohttp==3.13.1
188
+ wandb==0.24.0
189
+ typer==0.21.1
190
+ torch==2.6.0
191
+ tokenizers==0.22.1
192
+ seaborn==0.13.2
193
+ safehttpx==0.1.7
194
+ jsonschema==4.25.1
195
+ joypy==0.2.6
196
+ isoduration==20.11.0
197
+ ipywidgets==8.1.7
198
+ ipykernel==6.30.1
199
+ gradio_client==2.0.3
200
+ fastapi==0.128.0
201
+ Authlib==1.6.6
202
+ argon2-cffi==25.1.0
203
+ transformers==4.57.6
204
+ nbformat==5.10.4
205
+ mlstm_kernels==2.0.2
206
+ jupyter-console==6.6.3
207
+ gradio==6.5.1
208
+ datasets==4.3.0
209
+ clearml==1.16.4
210
+ accelerate==1.10.1
211
+ xlstm==2.0.4
212
+ nbclient==0.10.2
213
+ jupyter-events==0.12.0
214
+ trackio==0.15.0
215
+ nbconvert==7.16.6
216
+ jupyter_server==2.17.0
217
+ notebook_shim==0.2.4
218
+ jupyterlab_server==2.27.3
219
+ jupyter-lsp==2.3.0
220
+ nbclassic==1.3.3
221
+ jupyterlab==4.4.9
222
+ notebook==7.4.7
223
+ jupyter_contrib_core==0.4.2
224
+ jupyter==1.1.1
225
+ jupyter_nbextensions_configurator==0.6.4
226
+ causal-conv1d==1.5.0.post8
227
+ flash_attn==2.7.4.post1
228
+ mamba-ssm==2.2.4
229
+ hnet==0.0.1
230
+ autocommand==2.2.2
231
+ backports.tarfile==1.2.0
232
+ importlib_metadata==8.0.0
233
+ inflect==7.3.1
234
+ jaraco.collections==5.1.0
235
+ jaraco.context==5.3.0
236
+ jaraco.functools==4.0.1
237
+ jaraco.text==3.12.1
238
+ more-itertools==10.3.0
239
+ packaging==24.2
240
+ platformdirs==4.2.2
241
+ tomli==2.0.1
242
+ typeguard==4.3.0
243
+ typing_extensions==4.12.2
244
+ wheel==0.45.1
245
+ zipp==3.19.2
routing_rerun/N_2.5/wandb/run-20260523_172658-l6q9x03b/files/wandb-metadata.json ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Linux-6.8.0-117-generic-x86_64-with-glibc2.39",
3
+ "python": "CPython 3.12.0",
4
+ "startedAt": "2026-05-23T17:26:58.115428Z",
5
+ "args": [
6
+ "training.load_balancing_N=2.5",
7
+ "tracking.run_name=routing_N2.5"
8
+ ],
9
+ "program": "/workspace/byte-llms-code/routing_evolution_exp/train.py",
10
+ "codePath": "routing_evolution_exp/train.py",
11
+ "codePathLocal": "train.py",
12
+ "git": {
13
+ "remote": "https://github.com/naryst/byte-llms-code.git",
14
+ "commit": "15d97dd48d00266a0b8f634920ab99cb7547da91"
15
+ },
16
+ "email": "nikita@local.ru",
17
+ "root": "outputs/2026-05-23/N_2.5",
18
+ "host": "1a023b226280",
19
+ "executable": "/venv/bytellm/bin/python",
20
+ "cpu_count": 36,
21
+ "cpu_count_logical": 72,
22
+ "gpu": "NVIDIA A100 80GB PCIe",
23
+ "gpu_count": 1,
24
+ "disk": {
25
+ "/": {
26
+ "total": "234075717632",
27
+ "used": "19049525248"
28
+ }
29
+ },
30
+ "memory": {
31
+ "total": "404274495488"
32
+ },
33
+ "gpu_nvidia": [
34
+ {
35
+ "name": "NVIDIA A100 80GB PCIe",
36
+ "memoryTotal": "85899345920",
37
+ "cudaCores": 6912,
38
+ "architecture": "Ampere",
39
+ "uuid": "GPU-ec9764c4-224a-9508-002f-de7762d78498"
40
+ }
41
+ ],
42
+ "cudaVersion": "13.2",
43
+ "writerId": "xele1blpla9o3dqe7kdco4df3hrq09ni"
44
+ }
routing_rerun/N_2.5/wandb/run-20260523_172658-l6q9x03b/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"_wandb":{"runtime":43},"_runtime":43}
routing_rerun/N_2.5/wandb/run-20260523_172658-l6q9x03b/logs/debug-core.log ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2026-05-23T17:26:58.232625454Z","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmpf4ecvb2u/port-14607.txt","pid":14607,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false}
2
+ {"time":"2026-05-23T17:26:58.233976439Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":14607}
3
+ {"time":"2026-05-23T17:26:58.233282033Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/tmp/wandb-14607-14617-1099789617/socket","Net":"unix"}}
4
+ {"time":"2026-05-23T17:26:58.419094136Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"}
5
+ {"time":"2026-05-23T17:26:58.44527049Z","level":"INFO","msg":"handleInformInit: received","streamId":"l6q9x03b","id":"1(@)"}
6
+ {"time":"2026-05-23T17:26:58.885396624Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"l6q9x03b","id":"1(@)"}
7
+ {"time":"2026-05-23T17:27:42.947784883Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"}
8
+ {"time":"2026-05-23T17:27:42.947914409Z","level":"INFO","msg":"server is shutting down"}
9
+ {"time":"2026-05-23T17:27:42.947902597Z","level":"INFO","msg":"connection: closing","id":"1(@)"}
10
+ {"time":"2026-05-23T17:27:42.948232086Z","level":"INFO","msg":"connection: closed successfully","id":"1(@)"}
11
+ {"time":"2026-05-23T17:27:42.948255177Z","level":"INFO","msg":"server: listener closed","addr":{"Name":"/tmp/wandb-14607-14617-1099789617/socket","Net":"unix"}}
12
+ {"time":"2026-05-23T17:27:43.504784979Z","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"}
13
+ {"time":"2026-05-23T17:27:43.504842499Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"}
14
+ {"time":"2026-05-23T17:27:43.504876254Z","level":"INFO","msg":"server is closed"}
routing_rerun/N_2.5/wandb/run-20260523_172658-l6q9x03b/logs/debug-internal.log ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2026-05-23T17:26:58.445481693Z","level":"INFO","msg":"stream: starting","core version":"0.24.0"}
2
+ {"time":"2026-05-23T17:26:58.885208159Z","level":"INFO","msg":"stream: created new stream","id":"l6q9x03b"}
3
+ {"time":"2026-05-23T17:26:58.885289947Z","level":"INFO","msg":"handler: started","stream_id":"l6q9x03b"}
4
+ {"time":"2026-05-23T17:26:58.885385406Z","level":"INFO","msg":"stream: started","id":"l6q9x03b"}
5
+ {"time":"2026-05-23T17:26:58.885419994Z","level":"INFO","msg":"writer: started","stream_id":"l6q9x03b"}
6
+ {"time":"2026-05-23T17:26:58.88542392Z","level":"INFO","msg":"sender: started","stream_id":"l6q9x03b"}
7
+ {"time":"2026-05-23T17:26:59.024082436Z","level":"ERROR","msg":"git repo not found","error":"repository does not exist"}
8
+ {"time":"2026-05-23T17:27:42.946648107Z","level":"INFO","msg":"flowcontrol: backed up, offloading to disk","recordNumber":68}
9
+ {"time":"2026-05-23T17:27:42.947906186Z","level":"INFO","msg":"stream: closing","id":"l6q9x03b"}
10
+ {"time":"2026-05-23T17:27:42.955515536Z","level":"INFO","msg":"flowcontrol: unblocked","totalOffloaded":34}
11
+ {"time":"2026-05-23T17:27:43.302861641Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
12
+ {"time":"2026-05-23T17:27:43.500084989Z","level":"INFO","msg":"handler: closed","stream_id":"l6q9x03b"}
13
+ {"time":"2026-05-23T17:27:43.500251865Z","level":"INFO","msg":"sender: closed","stream_id":"l6q9x03b"}
14
+ {"time":"2026-05-23T17:27:43.500269359Z","level":"INFO","msg":"stream: closed","id":"l6q9x03b"}
routing_rerun/N_2.5/wandb/run-20260523_172658-l6q9x03b/logs/debug.log ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2026-05-23 17:26:58,117 INFO MainThread:14607 [wandb_setup.py:_flush():81] Current SDK version is 0.24.0
2
+ 2026-05-23 17:26:58,117 INFO MainThread:14607 [wandb_setup.py:_flush():81] Configure stats pid to 14607
3
+ 2026-05-23 17:26:58,117 INFO MainThread:14607 [wandb_setup.py:_flush():81] Loading settings from environment variables
4
+ 2026-05-23 17:26:58,117 INFO MainThread:14607 [wandb_init.py:setup_run_log_directory():717] Logging user logs to outputs/2026-05-23/N_2.5/wandb/run-20260523_172658-l6q9x03b/logs/debug.log
5
+ 2026-05-23 17:26:58,117 INFO MainThread:14607 [wandb_init.py:setup_run_log_directory():718] Logging internal logs to outputs/2026-05-23/N_2.5/wandb/run-20260523_172658-l6q9x03b/logs/debug-internal.log
6
+ 2026-05-23 17:26:58,117 INFO MainThread:14607 [wandb_init.py:init():844] calling init triggers
7
+ 2026-05-23 17:26:58,118 INFO MainThread:14607 [wandb_init.py:init():849] wandb.init called with sweep_config: {}
8
+ config: {'model': {'config_path': '/workspace/byte-llms-code/hnet_project/configs/hnet_2stage_XL_code.json', 'checkpoint_path': '/workspace/byte-llms-code/hnet_project/checkpoints/hnet_2stage_XL_code.pt'}, 'training': {'epochs': 3, 'max_steps': None, 'batch_size': 8, 'eval_batch_size': 24, 'gradient_accumulation_steps': 4, 'lr': 0.0001, 'weight_decay': 0.1, 'betas': [0.9, 0.95], 'eps': 1e-08, 'lr_scheduler': 'wsd', 'warmup_ratio': 0.1, 'decay_ratio': 0.2, 'warmup_steps': 100, 'min_lr_ratio': 0.1, 'lr_multiplier': [2.0, 1.5, 1.0], 'load_balancing_weight': 0.5, 'load_balancing_N': 2.5, 'max_grad_norm': 1.0, 'use_amp': True, 'resume': False, 'resume_checkpoint': None, 'warmup_model': True}, 'data': {'path': '/workspace/byte-llms-code/code_completion_exp/datasets/data_V4_full', 'max_context_len': 4096, 'max_target_len': 256, 'num_workers': 0, 'pin_memory': True, 'max_train_samples': 150000, 'max_val_samples': None}, 'logging': {'log_interval': 10, 'save_interval': 3000, 'eval_interval': 1000, 'save_every_epoch': False, 'model_only_checkpoints': True}, 'tracking': {'enabled': True, 'backend': 'wandb', 'project': 'routing-evolution', 'run_name': 'routing_N2.5', 'entity': None, 'base_url': 'https://wandb.platun0v.ru', 'local_dir': 'outputs/2026-05-23/N_2.5'}, 'paths': {'output_dir': 'outputs/2026-05-23/N_2.5'}, 'seed': 42, 'device': 'cuda', '_wandb': {'code_path': 'code/routing_evolution_exp/train.py'}}
9
+ 2026-05-23 17:26:58,118 INFO MainThread:14607 [wandb_init.py:init():892] starting backend
10
+ 2026-05-23 17:26:58,419 INFO MainThread:14607 [wandb_init.py:init():895] sending inform_init request
11
+ 2026-05-23 17:26:58,441 INFO MainThread:14607 [wandb_init.py:init():903] backend started and connected
12
+ 2026-05-23 17:26:58,447 INFO MainThread:14607 [wandb_init.py:init():973] updated telemetry
13
+ 2026-05-23 17:26:58,471 INFO MainThread:14607 [wandb_init.py:init():997] communicating run to backend with 90.0 second timeout
14
+ 2026-05-23 17:26:59,022 INFO MainThread:14607 [wandb_init.py:init():1044] starting run threads in backend
15
+ 2026-05-23 17:26:59,270 INFO MainThread:14607 [wandb_run.py:_console_start():2529] atexit reg
16
+ 2026-05-23 17:26:59,270 INFO MainThread:14607 [wandb_run.py:_redirect():2377] redirect: wrap_raw
17
+ 2026-05-23 17:26:59,271 INFO MainThread:14607 [wandb_run.py:_redirect():2446] Wrapping output streams.
18
+ 2026-05-23 17:26:59,271 INFO MainThread:14607 [wandb_run.py:_redirect():2469] Redirects installed.
19
+ 2026-05-23 17:26:59,274 INFO MainThread:14607 [wandb_init.py:init():1084] run started, returning control to user process
20
+ 2026-05-23 17:27:42,948 INFO wandb-AsyncioManager-main:14607 [service_client.py:_forward_responses():80] Reached EOF.
21
+ 2026-05-23 17:27:42,949 INFO wandb-AsyncioManager-main:14607 [mailbox.py:close():137] Closing mailbox, abandoning 1 handles.
routing_rerun/N_2.5/wandb/run-20260523_172658-l6q9x03b/run-l6q9x03b.wandb ADDED
Binary file (17 kB). View file
 
routing_rerun/N_2.5/wandb/run-20260523_172819-hcp9axm4/files/code/routing_evolution_exp/train.py ADDED
@@ -0,0 +1,259 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Routing Evolution Experiment: H-Net fine-tune с разными load_balancing_N.
3
+
4
+ Цель: изучить как routing module (W_q, W_k) эволюционирует при разном давлении сжатия.
5
+
6
+ Использование:
7
+ # Запуск с конкретным N
8
+ accelerate launch --config_file accelerate_config.yaml \
9
+ train.py training.load_balancing_N=6.0 tracking.run_name=routing_N6.0
10
+
11
+ # Запуск всех N через run_all.sh
12
+ bash run_all.sh
13
+ """
14
+
15
+ import math
16
+ import os
17
+ from pathlib import Path
18
+
19
+ import torch
20
+ import hydra
21
+ from hydra.core.hydra_config import HydraConfig
22
+ from omegaconf import DictConfig, OmegaConf
23
+ from accelerate import Accelerator
24
+ from accelerate.utils import set_seed as accelerate_set_seed
25
+
26
+ # HNet imports
27
+ from hnet.load_utils import load_from_pretrained, load_from_config
28
+ from hnet.utils.tokenizers import ByteTokenizer
29
+ from hnet.utils.train import group_params
30
+
31
+ # Ensure repo root is on sys.path (needed when running from subdirectory)
32
+ import sys
33
+ sys.path.insert(0, str(Path(__file__).resolve().parents[1]))
34
+
35
+ # Shared training library
36
+ from training_lib.utils import log_message
37
+ from training_lib.checkpointing import save_checkpoint, load_checkpoint
38
+ from training_lib.schedulers import get_lr_scheduler
39
+ from training_lib.tracking import init_tracking, finish_tracking
40
+ from training_lib.hnet.train_loop import train_epoch
41
+ from training_lib.hnet.data import create_dataloaders
42
+
43
+
44
def save_routing_weights(model, step, output_dir):
    """Save only the routing projection weights for fast weight-evolution analysis.

    The model's ``state_dict()`` is filtered down to the entries whose key
    contains both ``"routing_module"`` and ``"proj_layer"`` (the routing
    W_q/W_k projections, per the original author's note). Those tensors are
    moved to CPU and written to
    ``<output_dir>/routing_weights/routing_step_<step>.pt``.

    Args:
        model: Object exposing ``state_dict()`` (an unwrapped ``nn.Module``).
        step: Training step; used only to build the output file name.
        output_dir: Run output directory. A ``str`` or any ``os.PathLike`` is
            accepted and normalized to ``pathlib.Path``, so callers that pass
            the raw config string no longer fail on the ``/`` operator.
    """
    routing_dir = Path(output_dir) / "routing_weights"
    # The directory may not exist yet on the first (step 0) snapshot.
    routing_dir.mkdir(parents=True, exist_ok=True)

    routing_weights = {
        key: tensor.cpu()
        for key, tensor in model.state_dict().items()
        if "routing_module" in key and "proj_layer" in key
    }
    torch.save(routing_weights, routing_dir / f"routing_step_{step}.pt")
53
+
54
+
55
def on_checkpoint_callback(model, step, output_dir):
    """Checkpoint hook passed to ``train_epoch``: snapshot the routing weights.

    Per the original author's note, the training loop has already unwrapped
    the model before it invokes this callback.
    """
    save_routing_weights(model=model, step=step, output_dir=output_dir)
61
+
62
+
63
@hydra.main(version_base=None, config_path="configs", config_name="config")
def main(cfg: DictConfig):
    """Train H-Net with a fixed number of steps to analyse routing evolution.

    Pipeline: set up the accelerator and seed, resolve the config, start
    experiment tracking, load (or initialise) the model, optionally warm it up,
    snapshot the step-0 routing weights, build the data loaders, optimizer and
    LR scheduler, run the epoch loop, then save the final model and routing
    weights and shut tracking down.
    """

    # === Accelerator Setup ===
    if cfg.training.use_amp:
        mixed_precision = "bf16"
    else:
        mixed_precision = "no"

    accelerator = Accelerator(
        gradient_accumulation_steps=cfg.training.gradient_accumulation_steps,
        mixed_precision=mixed_precision,
    )

    def _log(text):
        # Every message in this function goes through the shared logger with
        # the same (cfg, accelerator) pair.
        log_message(text, cfg, accelerator)

    accelerate_set_seed(cfg.seed)

    # Fall back to Hydra's runtime directory when no output dir is configured.
    if cfg.paths.output_dir is None:
        cfg.paths.output_dir = HydraConfig.get().runtime.output_dir

    OmegaConf.resolve(cfg)

    banner = "=" * 60
    _log(f"CUDA_VISIBLE_DEVICES: {os.environ.get('CUDA_VISIBLE_DEVICES', 'not set')}")
    _log(f"Number of processes: {accelerator.num_processes}")
    _log(f"Mixed precision: {mixed_precision}")

    _log(banner)
    _log(f"Routing Evolution Experiment | N={cfg.training.load_balancing_N}")
    _log(banner)
    _log(f"Config:\n{OmegaConf.to_yaml(cfg)}")

    # === Experiment Tracking Init ===
    init_tracking(cfg, accelerator)

    # === Tokenizer ===
    tokenizer = ByteTokenizer()

    # === Model ===
    _log("Loading model...")
    if not cfg.model.checkpoint_path:
        # No checkpoint configured: build from the config and initialise weights.
        model = load_from_config(
            model_config_path=cfg.model.config_path,
            device="cpu",
        )
        model.init_weights()
        _log("Initialized from scratch")
    else:
        model = load_from_pretrained(
            model_path=cfg.model.checkpoint_path,
            model_config_path=cfg.model.config_path,
        )
        _log(f"Loaded pretrained: {cfg.model.checkpoint_path}")

    model.train()

    multipliers = list(cfg.training.lr_multiplier)
    model.apply_lr_multiplier(multipliers)
    _log(f"Applied LR multipliers: {multipliers}")

    if cfg.training.warmup_model:
        _log("Warming up model...")
        model = model.to(accelerator.device)
        model.warmup(verbose=accelerator.is_main_process)

    # Save initial routing weights (step 0)
    if accelerator.is_main_process:
        save_routing_weights(model, 0, Path(cfg.paths.output_dir))

    total_params = 0
    trainable_params = 0
    for param in model.parameters():
        count = param.numel()
        total_params += count
        if param.requires_grad:
            trainable_params += count
    _log(f"Total params: {total_params:,}")
    _log(f"Trainable params: {trainable_params:,}")

    # === Data ===
    _log("Creating dataloaders...")
    loaders = create_dataloaders(cfg, tokenizer)

    train_dataloader = loaders["train"]
    val_dataloader = loaders.get("validation", None)

    _log(
        f"Train dataset size: {len(train_dataloader.dataset)} "
        f"(max_train_samples={cfg.data.max_train_samples}) | "
        f"Epochs: {cfg.training.epochs}"
    )

    # === Optimizer ===
    param_groups = group_params(model)

    for group in param_groups:
        if "lr" in group:
            # NOTE(review): the multiplier is only applied to groups that
            # already carry an "lr" entry - confirm against group_params.
            group["lr"] = cfg.training.lr * group.get("lr_multiplier", 1.0)
        else:
            group["lr"] = cfg.training.lr
        group.setdefault("weight_decay", cfg.training.weight_decay)

    optimizer = torch.optim.AdamW(
        param_groups,
        lr=cfg.training.lr,
        betas=tuple(cfg.training.betas),
        eps=cfg.training.eps,
    )

    # === Scheduler ===
    steps_per_epoch = math.ceil(len(train_dataloader) / accelerator.num_processes)
    batches_total = cfg.training.epochs * steps_per_epoch
    max_steps = batches_total // cfg.training.gradient_accumulation_steps
    scheduler = get_lr_scheduler(optimizer, cfg, max_steps)

    _log(f"Max steps: {max_steps}, Steps per epoch: {steps_per_epoch}")

    # === Accelerate Prepare ===
    if val_dataloader is None:
        model, optimizer, train_dataloader, scheduler = accelerator.prepare(
            model, optimizer, train_dataloader, scheduler
        )
    else:
        prepared = accelerator.prepare(
            model, optimizer, train_dataloader, val_dataloader, scheduler
        )
        model, optimizer, train_dataloader, val_dataloader, scheduler = prepared

    # === Training Loop ===
    _log("Starting training...")

    global_step = 0
    best_val_loss = float("inf")
    epoch = 0  # defined up front so the interrupt handler can always read it

    num_epochs = cfg.training.epochs

    try:
        for epoch in range(1, num_epochs + 1):
            _log(f"\n{banner}")
            _log(f"EPOCH {epoch}/{num_epochs} (step {global_step})")
            _log(f"{banner}")

            global_step, best_val_loss = train_epoch(
                model=model,
                dataloader=train_dataloader,
                optimizer=optimizer,
                scheduler=scheduler,
                cfg=cfg,
                epoch=epoch,
                global_step=global_step,
                accelerator=accelerator,
                val_dataloader=val_dataloader,
                best_val_loss=best_val_loss,
                max_steps=max_steps,
                on_checkpoint=on_checkpoint_callback,
            )

    except KeyboardInterrupt:
        # Keep a model-only checkpoint of the progress made so far; execution
        # then continues into the final save below.
        _log("Training interrupted by user")
        save_checkpoint(
            model, optimizer, scheduler, global_step, epoch, cfg, accelerator,
            model_only=True,
        )

    # === Final Save ===
    _log("\nTraining completed!")

    if accelerator.is_main_process:
        final_model_path = Path(cfg.paths.output_dir) / "model_final.pt"
        unwrapped_model = accelerator.unwrap_model(model)
        torch.save(unwrapped_model.state_dict(), final_model_path)
        _log(f"Final model: {final_model_path}")

        # Save final routing weights
        save_routing_weights(unwrapped_model, global_step, Path(cfg.paths.output_dir))

    accelerator.wait_for_everyone()
    accelerator.end_training()
    finish_tracking()
255
+ finish_tracking()
256
+
257
+
258
+ if __name__ == "__main__":
259
+ main()
routing_rerun/N_2.5/wandb/run-20260523_172819-hcp9axm4/files/config.yaml ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _wandb:
2
+ value:
3
+ cli_version: 0.24.0
4
+ code_path: code/routing_evolution_exp/train.py
5
+ e:
6
+ bnp9f95qneklhh3kico6ks4t4aw6iad3:
7
+ args:
8
+ - training.load_balancing_N=2.5
9
+ - tracking.run_name=routing_N2.5
10
+ codePath: routing_evolution_exp/train.py
11
+ codePathLocal: train.py
12
+ cpu_count: 36
13
+ cpu_count_logical: 72
14
+ cudaVersion: "13.2"
15
+ disk:
16
+ /:
17
+ total: "234075717632"
18
+ used: "19065532416"
19
+ email: nikita@local.ru
20
+ executable: /venv/bytellm/bin/python
21
+ git:
22
+ commit: 15d97dd48d00266a0b8f634920ab99cb7547da91
23
+ remote: https://github.com/naryst/byte-llms-code.git
24
+ gpu: NVIDIA A100 80GB PCIe
25
+ gpu_count: 1
26
+ gpu_nvidia:
27
+ - architecture: Ampere
28
+ cudaCores: 6912
29
+ memoryTotal: "85899345920"
30
+ name: NVIDIA A100 80GB PCIe
31
+ uuid: GPU-ec9764c4-224a-9508-002f-de7762d78498
32
+ host: 1a023b226280
33
+ memory:
34
+ total: "404274495488"
35
+ os: Linux-6.8.0-117-generic-x86_64-with-glibc2.39
36
+ program: /workspace/byte-llms-code/routing_evolution_exp/train.py
37
+ python: CPython 3.12.0
38
+ root: outputs/2026-05-23/N_2.5
39
+ startedAt: "2026-05-23T17:28:19.719235Z"
40
+ writerId: bnp9f95qneklhh3kico6ks4t4aw6iad3
41
+ m: []
42
+ python_version: 3.12.0
43
+ t:
44
+ "1":
45
+ - 1
46
+ - 11
47
+ - 49
48
+ - 50
49
+ - 51
50
+ - 71
51
+ - 105
52
+ "2":
53
+ - 1
54
+ - 11
55
+ - 49
56
+ - 50
57
+ - 51
58
+ - 71
59
+ - 105
60
+ "3":
61
+ - 2
62
+ - 13
63
+ - 16
64
+ - 61
65
+ "4": 3.12.0
66
+ "5": 0.24.0
67
+ "6": 4.57.6
68
+ "12": 0.24.0
69
+ "13": linux-x86_64
70
+ data:
71
+ value:
72
+ max_context_len: 4096
73
+ max_target_len: 256
74
+ max_train_samples: 150000
75
+ max_val_samples: null
76
+ num_workers: 0
77
+ path: /workspace/byte-llms-code/code_completion_exp/datasets/data_V4_full
78
+ pin_memory: true
79
+ device:
80
+ value: cuda
81
+ logging:
82
+ value:
83
+ eval_interval: 1000
84
+ log_interval: 10
85
+ model_only_checkpoints: true
86
+ save_every_epoch: false
87
+ save_interval: 3000
88
+ model:
89
+ value:
90
+ checkpoint_path: /workspace/byte-llms-code/hnet_project/checkpoints/hnet_2stage_XL_code.pt
91
+ config_path: /workspace/byte-llms-code/hnet_project/configs/hnet_2stage_XL_code.json
92
+ paths:
93
+ value:
94
+ output_dir: outputs/2026-05-23/N_2.5
95
+ seed:
96
+ value: 42
97
+ tracking:
98
+ value:
99
+ backend: wandb
100
+ base_url: https://wandb.platun0v.ru
101
+ enabled: true
102
+ entity: null
103
+ local_dir: outputs/2026-05-23/N_2.5
104
+ project: routing-evolution
105
+ run_name: routing_N2.5
106
+ training:
107
+ value:
108
+ batch_size: 8
109
+ betas:
110
+ - 0.9
111
+ - 0.95
112
+ decay_ratio: 0.2
113
+ epochs: 3
114
+ eps: 1e-08
115
+ eval_batch_size: 24
116
+ gradient_accumulation_steps: 4
117
+ load_balancing_N: 2.5
118
+ load_balancing_weight: 0.5
119
+ lr: 0.0001
120
+ lr_multiplier:
121
+ - 2
122
+ - 1.5
123
+ - 1
124
+ lr_scheduler: wsd
125
+ max_grad_norm: 1
126
+ max_steps: null
127
+ min_lr_ratio: 0.1
128
+ resume: false
129
+ resume_checkpoint: null
130
+ use_amp: true
131
+ warmup_model: true
132
+ warmup_ratio: 0.1
133
+ warmup_steps: 100
134
+ weight_decay: 0.1
routing_rerun/N_2.5/wandb/run-20260523_172819-hcp9axm4/files/output.log ADDED
The diff for this file is too large to render. See raw diff
 
routing_rerun/N_2.5/wandb/run-20260523_172819-hcp9axm4/files/requirements.txt ADDED
@@ -0,0 +1,245 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ setuptools==78.1.1
2
+ wheel==0.45.1
3
+ pip==25.2
4
+ webencodings==0.5.1
5
+ triton==3.2.0
6
+ pytz==2025.2
7
+ pydub==0.25.1
8
+ pure_eval==0.2.3
9
+ ptyprocess==0.7.0
10
+ nvidia-ml-py==13.590.48
11
+ nvidia-cusparselt-cu12==0.6.2
12
+ mpmath==1.3.0
13
+ ipython-genutils==0.2.0
14
+ fastjsonschema==2.21.2
15
+ brotli==1.2.0
16
+ antlr4-python3-runtime==4.9.3
17
+ xxhash==3.6.0
18
+ widgetsnbextension==4.0.14
19
+ websocket-client==1.9.0
20
+ webcolors==24.11.1
21
+ wcwidth==0.2.14
22
+ urllib3==2.5.0
23
+ uri-template==1.3.0
24
+ tzdata==2025.2
25
+ typing_extensions==4.15.0
26
+ types-python-dateutil==2.9.0.20251008
27
+ traitlets==5.14.3
28
+ tqdm==4.67.1
29
+ tornado==6.5.2
30
+ tomlkit==0.13.3
31
+ tinycss2==1.4.0
32
+ tabulate==0.9.0
33
+ sympy==1.13.1
34
+ soupsieve==2.8
35
+ sniffio==1.3.1
36
+ smmap==5.0.2
37
+ six==1.17.0
38
+ shellingham==1.5.4
39
+ Send2Trash==1.8.3
40
+ semantic-version==2.10.0
41
+ safetensors==0.6.2
42
+ rpds-py==0.27.1
43
+ rfc3986-validator==0.1.1
44
+ regex==2025.9.18
45
+ pyzmq==27.1.0
46
+ PyYAML==6.0.3
47
+ python-multipart==0.0.22
48
+ python-json-logger==4.0.0
49
+ python-dotenv==1.2.1
50
+ pyparsing==3.2.5
51
+ PyJWT==2.8.0
52
+ Pygments==2.19.2
53
+ pycparser==2.23
54
+ pyarrow==22.0.0
55
+ psutil==7.1.0
56
+ protobuf==6.33.4
57
+ propcache==0.4.1
58
+ prometheus_client==0.23.1
59
+ portalocker==3.2.0
60
+ platformdirs==4.5.0
61
+ pillow==11.3.0
62
+ pexpect==4.9.0
63
+ pathspec==1.0.4
64
+ parso==0.8.5
65
+ pandocfilters==1.5.1
66
+ packaging==25.0
67
+ orjson==3.11.6
68
+ opt_einsum==3.4.0
69
+ nvidia-nvtx-cu12==12.4.127
70
+ nvidia-nvjitlink-cu12==12.4.127
71
+ nvidia-nccl-cu12==2.21.5
72
+ nvidia-curand-cu12==10.3.5.147
73
+ nvidia-cufile-cu12==1.13.1.3
74
+ nvidia-cufft-cu12==11.2.1.3
75
+ nvidia-cuda-runtime-cu12==12.4.127
76
+ nvidia-cuda-nvrtc-cu12==12.4.127
77
+ nvidia-cuda-cupti-cu12==12.4.127
78
+ nvidia-cublas-cu12==12.4.5.8
79
+ numpy==2.3.3
80
+ ninja==1.13.0
81
+ networkx==3.5
82
+ nest-asyncio==1.6.0
83
+ narwhals==2.15.0
84
+ mypy_extensions==1.1.0
85
+ multidict==6.7.0
86
+ mistune==3.1.4
87
+ mdurl==0.1.2
88
+ MarkupSafe==3.0.3
89
+ lxml==6.0.2
90
+ librt==0.8.0
91
+ lark==1.3.0
92
+ kiwisolver==1.4.9
93
+ jupyterlab_widgets==3.0.15
94
+ jupyterlab_pygments==0.3.0
95
+ jsonpointer==3.0.0
96
+ json5==0.12.1
97
+ itsdangerous==2.2.0
98
+ idna==3.10
99
+ hf-xet==1.1.10
100
+ h11==0.16.0
101
+ groovy==0.1.2
102
+ fsspec==2025.9.0
103
+ frozenlist==1.8.0
104
+ fqdn==1.5.1
105
+ fonttools==4.60.1
106
+ filelock==3.19.1
107
+ ffmpy==1.0.0
108
+ executing==2.2.1
109
+ einops==0.8.1
110
+ dill==0.4.0
111
+ defusedxml==0.7.1
112
+ decorator==5.2.1
113
+ debugpy==1.8.17
114
+ dacite==1.9.2
115
+ cycler==0.12.1
116
+ comm==0.2.3
117
+ colorama==0.4.6
118
+ click==8.3.1
119
+ charset-normalizer==3.4.3
120
+ certifi==2025.10.5
121
+ bleach==6.2.0
122
+ babel==2.17.0
123
+ attrs==25.4.0
124
+ async-lru==2.0.5
125
+ asttokens==3.0.0
126
+ annotated-types==0.7.0
127
+ annotated-doc==0.0.4
128
+ aiohappyeyeballs==2.6.1
129
+ aiofiles==24.1.0
130
+ yarl==1.22.0
131
+ uvicorn==0.40.0
132
+ typing-inspection==0.4.2
133
+ terminado==0.18.1
134
+ stack-data==0.6.3
135
+ sentry-sdk==2.50.0
136
+ scipy==1.17.0
137
+ sacrebleu==2.6.0
138
+ rfc3987-syntax==1.1.0
139
+ rfc3339-validator==0.1.4
140
+ requests==2.32.5
141
+ reportlab==4.4.9
142
+ referencing==0.36.2
143
+ python-dateutil==2.9.0.post0
144
+ pydantic_core==2.41.5
145
+ prompt_toolkit==3.0.52
146
+ plotly==6.5.2
147
+ pathlib2==2.3.7.post1
148
+ orderedmultidict==1.0.2
149
+ optree==0.17.0
150
+ omegaconf==2.3.0
151
+ nvidia-cusparse-cu12==12.3.1.170
152
+ nvidia-cudnn-cu12==9.1.0.70
153
+ mypy==1.19.1
154
+ multiprocess==0.70.16
155
+ matplotlib-inline==0.1.7
156
+ markdown-it-py==4.0.0
157
+ jupyter_core==5.8.1
158
+ Jinja2==3.1.6
159
+ jedi==0.19.2
160
+ ipython_pygments_lexers==1.1.1
161
+ httpcore==1.0.9
162
+ gitdb==4.0.12
163
+ ftfy==6.3.1
164
+ contourpy==1.3.3
165
+ cffi==2.0.0
166
+ beautifulsoup4==4.14.2
167
+ anyio==4.11.0
168
+ aiosignal==1.4.0
169
+ starlette==0.50.0
170
+ rich==14.2.0
171
+ pydantic==2.12.5
172
+ pandas==2.3.3
173
+ nvidia-cusolver-cu12==11.6.1.9
174
+ matplotlib==3.10.7
175
+ jupyter_server_terminals==0.5.3
176
+ jupyter_client==8.6.3
177
+ jsonschema-specifications==2025.9.1
178
+ ipython==9.6.0
179
+ hydra-core==1.3.2
180
+ huggingface-hub==0.35.3
181
+ httpx==0.28.1
182
+ GitPython==3.1.46
183
+ furl==2.1.4
184
+ cryptography==46.0.4
185
+ arrow==1.3.0
186
+ argon2-cffi-bindings==25.1.0
187
+ aiohttp==3.13.1
188
+ wandb==0.24.0
189
+ typer==0.21.1
190
+ torch==2.6.0
191
+ tokenizers==0.22.1
192
+ seaborn==0.13.2
193
+ safehttpx==0.1.7
194
+ jsonschema==4.25.1
195
+ joypy==0.2.6
196
+ isoduration==20.11.0
197
+ ipywidgets==8.1.7
198
+ ipykernel==6.30.1
199
+ gradio_client==2.0.3
200
+ fastapi==0.128.0
201
+ Authlib==1.6.6
202
+ argon2-cffi==25.1.0
203
+ transformers==4.57.6
204
+ nbformat==5.10.4
205
+ mlstm_kernels==2.0.2
206
+ jupyter-console==6.6.3
207
+ gradio==6.5.1
208
+ datasets==4.3.0
209
+ clearml==1.16.4
210
+ accelerate==1.10.1
211
+ xlstm==2.0.4
212
+ nbclient==0.10.2
213
+ jupyter-events==0.12.0
214
+ trackio==0.15.0
215
+ nbconvert==7.16.6
216
+ jupyter_server==2.17.0
217
+ notebook_shim==0.2.4
218
+ jupyterlab_server==2.27.3
219
+ jupyter-lsp==2.3.0
220
+ nbclassic==1.3.3
221
+ jupyterlab==4.4.9
222
+ notebook==7.4.7
223
+ jupyter_contrib_core==0.4.2
224
+ jupyter==1.1.1
225
+ jupyter_nbextensions_configurator==0.6.4
226
+ causal-conv1d==1.5.0.post8
227
+ flash_attn==2.7.4.post1
228
+ mamba-ssm==2.2.4
229
+ hnet==0.0.1
230
+ autocommand==2.2.2
231
+ backports.tarfile==1.2.0
232
+ importlib_metadata==8.0.0
233
+ inflect==7.3.1
234
+ jaraco.collections==5.1.0
235
+ jaraco.context==5.3.0
236
+ jaraco.functools==4.0.1
237
+ jaraco.text==3.12.1
238
+ more-itertools==10.3.0
239
+ packaging==24.2
240
+ platformdirs==4.2.2
241
+ tomli==2.0.1
242
+ typeguard==4.3.0
243
+ typing_extensions==4.12.2
244
+ wheel==0.45.1
245
+ zipp==3.19.2
routing_rerun/N_2.5/wandb/run-20260523_172819-hcp9axm4/files/wandb-metadata.json ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Linux-6.8.0-117-generic-x86_64-with-glibc2.39",
3
+ "python": "CPython 3.12.0",
4
+ "startedAt": "2026-05-23T17:28:19.719235Z",
5
+ "args": [
6
+ "training.load_balancing_N=2.5",
7
+ "tracking.run_name=routing_N2.5"
8
+ ],
9
+ "program": "/workspace/byte-llms-code/routing_evolution_exp/train.py",
10
+ "codePath": "routing_evolution_exp/train.py",
11
+ "codePathLocal": "train.py",
12
+ "git": {
13
+ "remote": "https://github.com/naryst/byte-llms-code.git",
14
+ "commit": "15d97dd48d00266a0b8f634920ab99cb7547da91"
15
+ },
16
+ "email": "nikita@local.ru",
17
+ "root": "outputs/2026-05-23/N_2.5",
18
+ "host": "1a023b226280",
19
+ "executable": "/venv/bytellm/bin/python",
20
+ "cpu_count": 36,
21
+ "cpu_count_logical": 72,
22
+ "gpu": "NVIDIA A100 80GB PCIe",
23
+ "gpu_count": 1,
24
+ "disk": {
25
+ "/": {
26
+ "total": "234075717632",
27
+ "used": "19065532416"
28
+ }
29
+ },
30
+ "memory": {
31
+ "total": "404274495488"
32
+ },
33
+ "gpu_nvidia": [
34
+ {
35
+ "name": "NVIDIA A100 80GB PCIe",
36
+ "memoryTotal": "85899345920",
37
+ "cudaCores": 6912,
38
+ "architecture": "Ampere",
39
+ "uuid": "GPU-ec9764c4-224a-9508-002f-de7762d78498"
40
+ }
41
+ ],
42
+ "cudaVersion": "13.2",
43
+ "writerId": "bnp9f95qneklhh3kico6ks4t4aw6iad3"
44
+ }
routing_rerun/N_2.5/wandb/run-20260523_172819-hcp9axm4/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"_runtime":24021,"epoch/soft_boundary_ratio_stage1":0.39492675367323404,"val/perplexity":1.4265099806664625,"epoch/time":8044.9603972435,"_timestamp":1.7795813175301356e+09,"train/epoch":3,"val/loss":0.8547116691308508,"_step":14060,"best/val_perplexity":1.3300726554660505,"train/loss":0.678175687789917,"best/val_loss":0.7847824677543916,"train/soft_boundary_ratio_stage1":0.3949258402291829,"train/chunk_len_stage1":2.4967957973233275,"epoch/loss":0.62571974257963,"epoch/hard_boundary_ratio_stage1":0.4015104655865258,"train/hard_boundary_ratio_stage0":0.4012647159477883,"epoch/lm_loss":0.12546251496211033,"epoch/lb_loss":1.0011933930957395,"train/step_time":1.351996898651123,"train/lb_loss":0.999517560005188,"epoch/hard_boundary_ratio_stage0":0.4012691360589079,"train/hard_boundary_ratio_stage1":0.40151479100268767,"train/loss_avg":0.6257273778828147,"val/time":442.93458580970764,"epoch/chunk_len_stage0":2.4979428233741783,"train/soft_boundary_ratio_stage0":0.43987519941498743,"epoch/soft_boundary_ratio_stage0":0.4398768389750085,"train/lr":1.0000112334056358e-05,"train/chunk_len_stage0":2.497970175538727,"best/step":4000,"epoch/chunk_len_stage1":2.496822725646825,"val/lb_loss":1.0005453147061618,"_wandb":{"runtime":24021},"val/lm_loss":0.3544390111086654,"train/lm_loss":0.18418139219284058}
routing_rerun/N_2.5/wandb/run-20260523_172819-hcp9axm4/logs/debug-core.log ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2026-05-23T17:28:19.822583715Z","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmphin61grg/port-15014.txt","pid":15014,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false}
2
+ {"time":"2026-05-23T17:28:19.823218456Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":15014}
3
+ {"time":"2026-05-23T17:28:19.823201848Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/tmp/wandb-15014-15024-2200562283/socket","Net":"unix"}}
4
+ {"time":"2026-05-23T17:28:20.009397582Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"}
5
+ {"time":"2026-05-23T17:28:20.034058805Z","level":"INFO","msg":"handleInformInit: received","streamId":"hcp9axm4","id":"1(@)"}
6
+ {"time":"2026-05-23T17:28:20.496307068Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"hcp9axm4","id":"1(@)"}
7
+ {"time":"2026-05-24T00:08:43.395067065Z","level":"INFO","msg":"handleInformFinish: finish message received","streamId":"hcp9axm4","id":"1(@)"}
8
+ {"time":"2026-05-24T00:08:43.401754007Z","level":"INFO","msg":"handleInformFinish: stream closed","streamId":"hcp9axm4","id":"1(@)"}
9
+ {"time":"2026-05-24T00:08:43.413724477Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"}
10
+ {"time":"2026-05-24T00:08:43.413783027Z","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"}
11
+ {"time":"2026-05-24T00:08:43.413805752Z","level":"INFO","msg":"server is shutting down"}
12
+ {"time":"2026-05-24T00:08:43.413800194Z","level":"INFO","msg":"connection: closing","id":"1(@)"}
13
+ {"time":"2026-05-24T00:08:43.413876609Z","level":"INFO","msg":"connection: closed successfully","id":"1(@)"}
14
+ {"time":"2026-05-24T00:08:43.413890255Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"}
15
+ {"time":"2026-05-24T00:08:43.413946146Z","level":"INFO","msg":"server: listener closed","addr":{"Name":"/tmp/wandb-15014-15024-2200562283/socket","Net":"unix"}}
16
+ {"time":"2026-05-24T00:08:43.413995537Z","level":"INFO","msg":"server is closed"}
routing_rerun/N_2.5/wandb/run-20260523_172819-hcp9axm4/logs/debug-internal.log ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2026-05-23T17:28:20.034273503Z","level":"INFO","msg":"stream: starting","core version":"0.24.0"}
2
+ {"time":"2026-05-23T17:28:20.495979525Z","level":"INFO","msg":"stream: created new stream","id":"hcp9axm4"}
3
+ {"time":"2026-05-23T17:28:20.496106936Z","level":"INFO","msg":"handler: started","stream_id":"hcp9axm4"}
4
+ {"time":"2026-05-23T17:28:20.496281243Z","level":"INFO","msg":"stream: started","id":"hcp9axm4"}
5
+ {"time":"2026-05-23T17:28:20.496287355Z","level":"INFO","msg":"writer: started","stream_id":"hcp9axm4"}
6
+ {"time":"2026-05-23T17:28:20.496309559Z","level":"INFO","msg":"sender: started","stream_id":"hcp9axm4"}
7
+ {"time":"2026-05-23T17:28:20.630740567Z","level":"ERROR","msg":"git repo not found","error":"repository does not exist"}
8
+ {"time":"2026-05-24T00:08:43.285793193Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
9
+ {"time":"2026-05-24T00:08:43.389899668Z","level":"INFO","msg":"handler: operation stats","stats":{}}
10
+ {"time":"2026-05-24T00:08:43.395126237Z","level":"INFO","msg":"stream: closing","id":"hcp9axm4"}
11
+ {"time":"2026-05-24T00:08:43.395149698Z","level":"INFO","msg":"handler: closed","stream_id":"hcp9axm4"}
12
+ {"time":"2026-05-24T00:08:43.395348295Z","level":"INFO","msg":"sender: closed","stream_id":"hcp9axm4"}
13
+ {"time":"2026-05-24T00:08:43.395364596Z","level":"INFO","msg":"stream: closed","id":"hcp9axm4"}
routing_rerun/N_2.5/wandb/run-20260523_172819-hcp9axm4/logs/debug.log ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2026-05-23 17:28:19,720 INFO MainThread:15014 [wandb_setup.py:_flush():81] Current SDK version is 0.24.0
2
+ 2026-05-23 17:28:19,721 INFO MainThread:15014 [wandb_setup.py:_flush():81] Configure stats pid to 15014
3
+ 2026-05-23 17:28:19,721 INFO MainThread:15014 [wandb_setup.py:_flush():81] Loading settings from environment variables
4
+ 2026-05-23 17:28:19,721 INFO MainThread:15014 [wandb_init.py:setup_run_log_directory():717] Logging user logs to outputs/2026-05-23/N_2.5/wandb/run-20260523_172819-hcp9axm4/logs/debug.log
5
+ 2026-05-23 17:28:19,721 INFO MainThread:15014 [wandb_init.py:setup_run_log_directory():718] Logging internal logs to outputs/2026-05-23/N_2.5/wandb/run-20260523_172819-hcp9axm4/logs/debug-internal.log
6
+ 2026-05-23 17:28:19,721 INFO MainThread:15014 [wandb_init.py:init():844] calling init triggers
7
+ 2026-05-23 17:28:19,721 INFO MainThread:15014 [wandb_init.py:init():849] wandb.init called with sweep_config: {}
8
+ config: {'model': {'config_path': '/workspace/byte-llms-code/hnet_project/configs/hnet_2stage_XL_code.json', 'checkpoint_path': '/workspace/byte-llms-code/hnet_project/checkpoints/hnet_2stage_XL_code.pt'}, 'training': {'epochs': 3, 'max_steps': None, 'batch_size': 8, 'eval_batch_size': 24, 'gradient_accumulation_steps': 4, 'lr': 0.0001, 'weight_decay': 0.1, 'betas': [0.9, 0.95], 'eps': 1e-08, 'lr_scheduler': 'wsd', 'warmup_ratio': 0.1, 'decay_ratio': 0.2, 'warmup_steps': 100, 'min_lr_ratio': 0.1, 'lr_multiplier': [2.0, 1.5, 1.0], 'load_balancing_weight': 0.5, 'load_balancing_N': 2.5, 'max_grad_norm': 1.0, 'use_amp': True, 'resume': False, 'resume_checkpoint': None, 'warmup_model': True}, 'data': {'path': '/workspace/byte-llms-code/code_completion_exp/datasets/data_V4_full', 'max_context_len': 4096, 'max_target_len': 256, 'num_workers': 0, 'pin_memory': True, 'max_train_samples': 150000, 'max_val_samples': None}, 'logging': {'log_interval': 10, 'save_interval': 3000, 'eval_interval': 1000, 'save_every_epoch': False, 'model_only_checkpoints': True}, 'tracking': {'enabled': True, 'backend': 'wandb', 'project': 'routing-evolution', 'run_name': 'routing_N2.5', 'entity': None, 'base_url': 'https://wandb.platun0v.ru', 'local_dir': 'outputs/2026-05-23/N_2.5'}, 'paths': {'output_dir': 'outputs/2026-05-23/N_2.5'}, 'seed': 42, 'device': 'cuda', '_wandb': {'code_path': 'code/routing_evolution_exp/train.py'}}
9
+ 2026-05-23 17:28:19,721 INFO MainThread:15014 [wandb_init.py:init():892] starting backend
10
+ 2026-05-23 17:28:20,009 INFO MainThread:15014 [wandb_init.py:init():895] sending inform_init request
11
+ 2026-05-23 17:28:20,030 INFO MainThread:15014 [wandb_init.py:init():903] backend started and connected
12
+ 2026-05-23 17:28:20,037 INFO MainThread:15014 [wandb_init.py:init():973] updated telemetry
13
+ 2026-05-23 17:28:20,063 INFO MainThread:15014 [wandb_init.py:init():997] communicating run to backend with 90.0 second timeout
14
+ 2026-05-23 17:28:20,628 INFO MainThread:15014 [wandb_init.py:init():1044] starting run threads in backend
15
+ 2026-05-23 17:28:20,879 INFO MainThread:15014 [wandb_run.py:_console_start():2529] atexit reg
16
+ 2026-05-23 17:28:20,879 INFO MainThread:15014 [wandb_run.py:_redirect():2377] redirect: wrap_raw
17
+ 2026-05-23 17:28:20,879 INFO MainThread:15014 [wandb_run.py:_redirect():2446] Wrapping output streams.
18
+ 2026-05-23 17:28:20,879 INFO MainThread:15014 [wandb_run.py:_redirect():2469] Redirects installed.
19
+ 2026-05-23 17:28:20,883 INFO MainThread:15014 [wandb_init.py:init():1084] run started, returning control to user process
20
+ 2026-05-24 00:08:42,341 INFO MainThread:15014 [wandb_run.py:_finish():2295] finishing run nikita/routing-evolution/hcp9axm4
21
+ 2026-05-24 00:08:42,342 INFO MainThread:15014 [wandb_run.py:_atexit_cleanup():2494] got exitcode: 0
22
+ 2026-05-24 00:08:42,342 INFO MainThread:15014 [wandb_run.py:_restore():2476] restore
23
+ 2026-05-24 00:08:42,342 INFO MainThread:15014 [wandb_run.py:_restore():2482] restore done
24
+ 2026-05-24 00:08:43,394 INFO MainThread:15014 [wandb_run.py:_footer_sync_info():3870] logging synced files
routing_rerun/N_4.0/.hydra/config.yaml ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model:
2
+ config_path: ${oc.env:PROJECT_ROOT}/hnet_project/configs/hnet_2stage_XL_code.json
3
+ checkpoint_path: ${oc.env:PROJECT_ROOT}/hnet_project/checkpoints/hnet_2stage_XL_code.pt
4
+ training:
5
+ epochs: 3
6
+ max_steps: null
7
+ batch_size: 8
8
+ eval_batch_size: 24
9
+ gradient_accumulation_steps: 4
10
+ lr: 0.0001
11
+ weight_decay: 0.1
12
+ betas:
13
+ - 0.9
14
+ - 0.95
15
+ eps: 1.0e-08
16
+ lr_scheduler: wsd
17
+ warmup_ratio: 0.1
18
+ decay_ratio: 0.2
19
+ warmup_steps: 100
20
+ min_lr_ratio: 0.1
21
+ lr_multiplier:
22
+ - 2.0
23
+ - 1.5
24
+ - 1.0
25
+ load_balancing_weight: 0.5
26
+ load_balancing_N: 4.0
27
+ max_grad_norm: 1.0
28
+ use_amp: true
29
+ resume: false
30
+ resume_checkpoint: null
31
+ warmup_model: true
32
+ data:
33
+ path: ${oc.env:PROJECT_ROOT}/code_completion_exp/datasets/data_V4_full
34
+ max_context_len: 4096
35
+ max_target_len: 256
36
+ num_workers: 0
37
+ pin_memory: true
38
+ max_train_samples: 150000
39
+ max_val_samples: null
40
+ logging:
41
+ log_interval: 10
42
+ save_interval: 3000
43
+ eval_interval: 1000
44
+ save_every_epoch: false
45
+ model_only_checkpoints: true
46
+ tracking:
47
+ enabled: true
48
+ backend: wandb
49
+ project: routing-evolution
50
+ run_name: routing_N4.0
51
+ entity: null
52
+ base_url: https://wandb.platun0v.ru
53
+ local_dir: ${paths.output_dir}
54
+ paths:
55
+ output_dir: outputs/${now:%Y-%m-%d}/N_${training.load_balancing_N}
56
+ seed: 42
57
+ device: cuda
routing_rerun/N_4.0/.hydra/hydra.yaml ADDED
@@ -0,0 +1,162 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ hydra:
2
+ run:
3
+ dir: ${paths.output_dir}
4
+ sweep:
5
+ dir: outputs/multirun/${now:%Y-%m-%d}/${now:%H-%M-%S}
6
+ subdir: ${hydra.job.num}
7
+ launcher:
8
+ _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
9
+ sweeper:
10
+ _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
11
+ max_batch_size: null
12
+ params: null
13
+ help:
14
+ app_name: ${hydra.job.name}
15
+ header: '${hydra.help.app_name} is powered by Hydra.
16
+
17
+ '
18
+ footer: 'Powered by Hydra (https://hydra.cc)
19
+
20
+ Use --hydra-help to view Hydra specific help
21
+
22
+ '
23
+ template: '${hydra.help.header}
24
+
25
+ == Configuration groups ==
26
+
27
+ Compose your configuration from those groups (group=option)
28
+
29
+
30
+ $APP_CONFIG_GROUPS
31
+
32
+
33
+ == Config ==
34
+
35
+ Override anything in the config (foo.bar=value)
36
+
37
+
38
+ $CONFIG
39
+
40
+
41
+ ${hydra.help.footer}
42
+
43
+ '
44
+ hydra_help:
45
+ template: 'Hydra (${hydra.runtime.version})
46
+
47
+ See https://hydra.cc for more info.
48
+
49
+
50
+ == Flags ==
51
+
52
+ $FLAGS_HELP
53
+
54
+
55
+ == Configuration groups ==
56
+
57
+ Compose your configuration from those groups (For example, append hydra/job_logging=disabled
58
+ to command line)
59
+
60
+
61
+ $HYDRA_CONFIG_GROUPS
62
+
63
+
64
+ Use ''--cfg hydra'' to Show the Hydra config.
65
+
66
+ '
67
+ hydra_help: ???
68
+ hydra_logging:
69
+ version: 1
70
+ formatters:
71
+ simple:
72
+ format: '[%(asctime)s][HYDRA] %(message)s'
73
+ handlers:
74
+ console:
75
+ class: logging.StreamHandler
76
+ formatter: simple
77
+ stream: ext://sys.stdout
78
+ root:
79
+ level: INFO
80
+ handlers:
81
+ - console
82
+ loggers:
83
+ logging_example:
84
+ level: DEBUG
85
+ disable_existing_loggers: false
86
+ job_logging:
87
+ version: 1
88
+ formatters:
89
+ simple:
90
+ format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
91
+ handlers:
92
+ console:
93
+ class: logging.StreamHandler
94
+ formatter: simple
95
+ stream: ext://sys.stdout
96
+ file:
97
+ class: logging.FileHandler
98
+ formatter: simple
99
+ filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log
100
+ root:
101
+ level: INFO
102
+ handlers:
103
+ - console
104
+ - file
105
+ disable_existing_loggers: false
106
+ env: {}
107
+ mode: RUN
108
+ searchpath: []
109
+ callbacks: {}
110
+ output_subdir: .hydra
111
+ overrides:
112
+ hydra:
113
+ - hydra.mode=RUN
114
+ task:
115
+ - training.load_balancing_N=4.0
116
+ - tracking.run_name=routing_N4.0
117
+ job:
118
+ name: train
119
+ chdir: false
120
+ override_dirname: tracking.run_name=routing_N4.0,training.load_balancing_N=4.0
121
+ id: ???
122
+ num: ???
123
+ config_name: config
124
+ env_set: {}
125
+ env_copy: []
126
+ config:
127
+ override_dirname:
128
+ kv_sep: '='
129
+ item_sep: ','
130
+ exclude_keys: []
131
+ runtime:
132
+ version: 1.3.2
133
+ version_base: '1.3'
134
+ cwd: /workspace/byte-llms-code/routing_evolution_exp
135
+ config_sources:
136
+ - path: hydra.conf
137
+ schema: pkg
138
+ provider: hydra
139
+ - path: /workspace/byte-llms-code/routing_evolution_exp/configs
140
+ schema: file
141
+ provider: main
142
+ - path: ''
143
+ schema: structured
144
+ provider: schema
145
+ output_dir: /workspace/byte-llms-code/routing_evolution_exp/outputs/2026-05-24/N_4.0
146
+ choices:
147
+ paths: default
148
+ tracking: default
149
+ logging: default
150
+ data: default
151
+ training: default
152
+ model: hnet_xl_code
153
+ hydra/env: default
154
+ hydra/callbacks: null
155
+ hydra/job_logging: default
156
+ hydra/hydra_logging: default
157
+ hydra/hydra_help: default
158
+ hydra/help: default
159
+ hydra/sweeper: basic
160
+ hydra/launcher: basic
161
+ hydra/output: default
162
+ verbose: false
routing_rerun/N_4.0/.hydra/overrides.yaml ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ - training.load_balancing_N=4.0
2
+ - tracking.run_name=routing_N4.0
routing_rerun/N_4.0/eval_results/eval_config.yaml ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model:
2
+ config_path: /workspace/byte-llms-code/hnet_project/configs/hnet_2stage_XL_code.json
3
+ checkpoint_path: /workspace/byte-llms-code/hnet_project/checkpoints/hnet_2stage_XL_code.pt
4
+ data:
5
+ path: /workspace/byte-llms-code/code_completion_exp/datasets/data_V4_full
6
+ max_context_len: 4096
7
+ max_target_len: 256
8
+ num_workers: 0
9
+ pin_memory: true
10
+ max_train_samples: 150000
11
+ max_val_samples: null
12
+ evaluation:
13
+ batch_size: 16
14
+ max_samples: 2000
15
+ compute_bpb: true
16
+ bleu_tokenize: none
17
+ use_amp: true
18
+ save_predictions: true
19
+ generation:
20
+ max_length: 256
21
+ temperature: 0.1
22
+ top_k: 0
23
+ top_p: 1.0
24
+ paths:
25
+ run_dir: outputs/2026-05-24/N_4.0
26
+ eval_initial: true
27
+ eval_final: true
28
+ seed: 42
29
+ device: cuda
routing_rerun/N_4.0/eval_results/metrics_model_best.txt ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Checkpoint: model_best (outputs/2026-05-24/N_4.0/model_best.pt)
2
+ ================================================================================
3
+
4
+ exact_match: 0.343
5
+ token_accuracy: 0.4532350560559354
6
+ bleu: 17.77479978063419
7
+ bpb: 1.3583921951858478
8
+ num_samples: 2000
9
+ gen_wall_time_s: 335.00358938152203
10
+ gen_samples_per_s: 5.970085286824437
11
+ gen_time_per_sample_ms: 167.50179469076102
12
+ gen_chars_per_s: 161.1206617208179
13
+ gen_batch_mean_ms: 2680.0287150521763
14
+ gen_batch_p50_ms: 2478.22286299197
15
+ gen_batch_p95_ms: 5159.001234790774
16
+ gen_batch_max_ms: 7598.14075101167
17
+ gen_num_batches: 125
routing_rerun/N_4.0/eval_results/metrics_model_final.txt ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Checkpoint: model_final (outputs/2026-05-24/N_4.0/model_final.pt)
2
+ ================================================================================
3
+
4
+ exact_match: 0.3135
5
+ token_accuracy: 0.44612258253396936
6
+ bleu: 16.075966963029884
7
+ bpb: 1.4377865095442746
8
+ num_samples: 2000
9
+ gen_wall_time_s: 352.92306854663184
10
+ gen_samples_per_s: 5.666957414362783
11
+ gen_time_per_sample_ms: 176.46153427331592
12
+ gen_chars_per_s: 162.49433689943845
13
+ gen_batch_mean_ms: 2823.3845483730547
14
+ gen_batch_p50_ms: 2503.8293699617498
15
+ gen_batch_p95_ms: 6290.377801237628
16
+ gen_batch_max_ms: 9379.81953396229
17
+ gen_num_batches: 125
routing_rerun/N_4.0/eval_results/metrics_pretrained.txt ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Checkpoint: pretrained (/workspace/byte-llms-code/hnet_project/checkpoints/hnet_2stage_XL_code.pt)
2
+ ================================================================================
3
+
4
+ exact_match: 0.0
5
+ token_accuracy: 0.46752888904196876
6
+ bleu: 1.0909540935626696
7
+ bpb: 1.8769745278639474
8
+ num_samples: 2000
9
+ gen_wall_time_s: 1711.1607243906474
10
+ gen_samples_per_s: 1.1687972798184751
11
+ gen_time_per_sample_ms: 855.5803621953237
12
+ gen_chars_per_s: 298.4588137866866
13
+ gen_batch_mean_ms: 13689.285795125179
14
+ gen_batch_p50_ms: 13090.146724018268
15
+ gen_batch_p95_ms: 15469.248386588879
16
+ gen_batch_max_ms: 33782.6227840269
17
+ gen_num_batches: 125
routing_rerun/N_4.0/eval_results/predictions_model_best.txt ADDED
The diff for this file is too large to render. See raw diff
 
routing_rerun/N_4.0/eval_results/predictions_model_final.txt ADDED
The diff for this file is too large to render. See raw diff
 
routing_rerun/N_4.0/eval_results/predictions_pretrained.txt ADDED
The diff for this file is too large to render. See raw diff
 
routing_rerun/N_4.0/eval_results/summary.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ EVALUATION SUMMARY
2
+ =============================================================================
3
+
4
+ Checkpoint Exact Match Token Acc BLEU BPB ms/sample
5
+ -----------------------------------------------------------------------------
6
+ pretrained 0.00% 46.75% 1.09 1.8770 855.6
7
+ model_best 34.30% 45.32% 17.77 1.3584 167.5
8
+ model_final 31.35% 44.61% 16.08 1.4378 176.5
routing_rerun/N_4.0/train.log ADDED
The diff for this file is too large to render. See raw diff
 
routing_rerun/N_4.0/wandb/debug-internal.log ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2026-05-24T00:08:56.234024021Z","level":"INFO","msg":"stream: starting","core version":"0.24.0"}
2
+ {"time":"2026-05-24T00:08:56.701391294Z","level":"INFO","msg":"stream: created new stream","id":"j30yot8x"}
3
+ {"time":"2026-05-24T00:08:56.701517641Z","level":"INFO","msg":"handler: started","stream_id":"j30yot8x"}
4
+ {"time":"2026-05-24T00:08:56.701810168Z","level":"INFO","msg":"stream: started","id":"j30yot8x"}
5
+ {"time":"2026-05-24T00:08:56.701847254Z","level":"INFO","msg":"sender: started","stream_id":"j30yot8x"}
6
+ {"time":"2026-05-24T00:08:56.701851256Z","level":"INFO","msg":"writer: started","stream_id":"j30yot8x"}
7
+ {"time":"2026-05-24T00:08:56.8332083Z","level":"ERROR","msg":"git repo not found","error":"repository does not exist"}
8
+ {"time":"2026-05-24T01:16:01.045615831Z","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://wandb.platun0v.ru/files/nikita/routing-evolution/j30yot8x/file_stream","body":"Bad Gateway"}
9
+ {"time":"2026-05-24T05:52:09.949995651Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
10
+ {"time":"2026-05-24T05:52:10.063200014Z","level":"INFO","msg":"handler: operation stats","stats":{}}
11
+ {"time":"2026-05-24T05:52:10.069795312Z","level":"INFO","msg":"stream: closing","id":"j30yot8x"}
12
+ {"time":"2026-05-24T05:52:10.069837927Z","level":"INFO","msg":"handler: closed","stream_id":"j30yot8x"}
13
+ {"time":"2026-05-24T05:52:10.070129075Z","level":"INFO","msg":"sender: closed","stream_id":"j30yot8x"}
14
+ {"time":"2026-05-24T05:52:10.070155803Z","level":"INFO","msg":"stream: closed","id":"j30yot8x"}
routing_rerun/N_4.0/wandb/debug.log ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2026-05-24 00:08:55,908 INFO MainThread:72724 [wandb_setup.py:_flush():81] Current SDK version is 0.24.0
2
+ 2026-05-24 00:08:55,908 INFO MainThread:72724 [wandb_setup.py:_flush():81] Configure stats pid to 72724
3
+ 2026-05-24 00:08:55,908 INFO MainThread:72724 [wandb_setup.py:_flush():81] Loading settings from environment variables
4
+ 2026-05-24 00:08:55,908 INFO MainThread:72724 [wandb_init.py:setup_run_log_directory():717] Logging user logs to outputs/2026-05-24/N_4.0/wandb/run-20260524_000855-j30yot8x/logs/debug.log
5
+ 2026-05-24 00:08:55,908 INFO MainThread:72724 [wandb_init.py:setup_run_log_directory():718] Logging internal logs to outputs/2026-05-24/N_4.0/wandb/run-20260524_000855-j30yot8x/logs/debug-internal.log
6
+ 2026-05-24 00:08:55,908 INFO MainThread:72724 [wandb_init.py:init():844] calling init triggers
7
+ 2026-05-24 00:08:55,908 INFO MainThread:72724 [wandb_init.py:init():849] wandb.init called with sweep_config: {}
8
+ config: {'model': {'config_path': '/workspace/byte-llms-code/hnet_project/configs/hnet_2stage_XL_code.json', 'checkpoint_path': '/workspace/byte-llms-code/hnet_project/checkpoints/hnet_2stage_XL_code.pt'}, 'training': {'epochs': 3, 'max_steps': None, 'batch_size': 8, 'eval_batch_size': 24, 'gradient_accumulation_steps': 4, 'lr': 0.0001, 'weight_decay': 0.1, 'betas': [0.9, 0.95], 'eps': 1e-08, 'lr_scheduler': 'wsd', 'warmup_ratio': 0.1, 'decay_ratio': 0.2, 'warmup_steps': 100, 'min_lr_ratio': 0.1, 'lr_multiplier': [2.0, 1.5, 1.0], 'load_balancing_weight': 0.5, 'load_balancing_N': 4.0, 'max_grad_norm': 1.0, 'use_amp': True, 'resume': False, 'resume_checkpoint': None, 'warmup_model': True}, 'data': {'path': '/workspace/byte-llms-code/code_completion_exp/datasets/data_V4_full', 'max_context_len': 4096, 'max_target_len': 256, 'num_workers': 0, 'pin_memory': True, 'max_train_samples': 150000, 'max_val_samples': None}, 'logging': {'log_interval': 10, 'save_interval': 3000, 'eval_interval': 1000, 'save_every_epoch': False, 'model_only_checkpoints': True}, 'tracking': {'enabled': True, 'backend': 'wandb', 'project': 'routing-evolution', 'run_name': 'routing_N4.0', 'entity': None, 'base_url': 'https://wandb.platun0v.ru', 'local_dir': 'outputs/2026-05-24/N_4.0'}, 'paths': {'output_dir': 'outputs/2026-05-24/N_4.0'}, 'seed': 42, 'device': 'cuda', '_wandb': {'code_path': 'code/routing_evolution_exp/train.py'}}
9
+ 2026-05-24 00:08:55,908 INFO MainThread:72724 [wandb_init.py:init():892] starting backend
10
+ 2026-05-24 00:08:56,207 INFO MainThread:72724 [wandb_init.py:init():895] sending inform_init request
11
+ 2026-05-24 00:08:56,229 INFO MainThread:72724 [wandb_init.py:init():903] backend started and connected
12
+ 2026-05-24 00:08:56,235 INFO MainThread:72724 [wandb_init.py:init():973] updated telemetry
13
+ 2026-05-24 00:08:56,260 INFO MainThread:72724 [wandb_init.py:init():997] communicating run to backend with 90.0 second timeout
14
+ 2026-05-24 00:08:56,831 INFO MainThread:72724 [wandb_init.py:init():1044] starting run threads in backend
15
+ 2026-05-24 00:08:57,079 INFO MainThread:72724 [wandb_run.py:_console_start():2529] atexit reg
16
+ 2026-05-24 00:08:57,079 INFO MainThread:72724 [wandb_run.py:_redirect():2377] redirect: wrap_raw
17
+ 2026-05-24 00:08:57,079 INFO MainThread:72724 [wandb_run.py:_redirect():2446] Wrapping output streams.
18
+ 2026-05-24 00:08:57,079 INFO MainThread:72724 [wandb_run.py:_redirect():2469] Redirects installed.
19
+ 2026-05-24 00:08:57,085 INFO MainThread:72724 [wandb_init.py:init():1084] run started, returning control to user process
20
+ 2026-05-24 05:52:08,772 INFO MainThread:72724 [wandb_run.py:_finish():2295] finishing run nikita/routing-evolution/j30yot8x
21
+ 2026-05-24 05:52:08,772 INFO MainThread:72724 [wandb_run.py:_atexit_cleanup():2494] got exitcode: 0
22
+ 2026-05-24 05:52:08,772 INFO MainThread:72724 [wandb_run.py:_restore():2476] restore
23
+ 2026-05-24 05:52:08,773 INFO MainThread:72724 [wandb_run.py:_restore():2482] restore done
24
+ 2026-05-24 05:52:10,068 INFO MainThread:72724 [wandb_run.py:_footer_sync_info():3870] logging synced files
routing_rerun/N_4.0/wandb/run-20260524_000855-j30yot8x/logs/debug.log ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2026-05-24 00:08:55,908 INFO MainThread:72724 [wandb_setup.py:_flush():81] Current SDK version is 0.24.0
2
+ 2026-05-24 00:08:55,908 INFO MainThread:72724 [wandb_setup.py:_flush():81] Configure stats pid to 72724
3
+ 2026-05-24 00:08:55,908 INFO MainThread:72724 [wandb_setup.py:_flush():81] Loading settings from environment variables
4
+ 2026-05-24 00:08:55,908 INFO MainThread:72724 [wandb_init.py:setup_run_log_directory():717] Logging user logs to outputs/2026-05-24/N_4.0/wandb/run-20260524_000855-j30yot8x/logs/debug.log
5
+ 2026-05-24 00:08:55,908 INFO MainThread:72724 [wandb_init.py:setup_run_log_directory():718] Logging internal logs to outputs/2026-05-24/N_4.0/wandb/run-20260524_000855-j30yot8x/logs/debug-internal.log
6
+ 2026-05-24 00:08:55,908 INFO MainThread:72724 [wandb_init.py:init():844] calling init triggers
7
+ 2026-05-24 00:08:55,908 INFO MainThread:72724 [wandb_init.py:init():849] wandb.init called with sweep_config: {}
8
+ config: {'model': {'config_path': '/workspace/byte-llms-code/hnet_project/configs/hnet_2stage_XL_code.json', 'checkpoint_path': '/workspace/byte-llms-code/hnet_project/checkpoints/hnet_2stage_XL_code.pt'}, 'training': {'epochs': 3, 'max_steps': None, 'batch_size': 8, 'eval_batch_size': 24, 'gradient_accumulation_steps': 4, 'lr': 0.0001, 'weight_decay': 0.1, 'betas': [0.9, 0.95], 'eps': 1e-08, 'lr_scheduler': 'wsd', 'warmup_ratio': 0.1, 'decay_ratio': 0.2, 'warmup_steps': 100, 'min_lr_ratio': 0.1, 'lr_multiplier': [2.0, 1.5, 1.0], 'load_balancing_weight': 0.5, 'load_balancing_N': 4.0, 'max_grad_norm': 1.0, 'use_amp': True, 'resume': False, 'resume_checkpoint': None, 'warmup_model': True}, 'data': {'path': '/workspace/byte-llms-code/code_completion_exp/datasets/data_V4_full', 'max_context_len': 4096, 'max_target_len': 256, 'num_workers': 0, 'pin_memory': True, 'max_train_samples': 150000, 'max_val_samples': None}, 'logging': {'log_interval': 10, 'save_interval': 3000, 'eval_interval': 1000, 'save_every_epoch': False, 'model_only_checkpoints': True}, 'tracking': {'enabled': True, 'backend': 'wandb', 'project': 'routing-evolution', 'run_name': 'routing_N4.0', 'entity': None, 'base_url': 'https://wandb.platun0v.ru', 'local_dir': 'outputs/2026-05-24/N_4.0'}, 'paths': {'output_dir': 'outputs/2026-05-24/N_4.0'}, 'seed': 42, 'device': 'cuda', '_wandb': {'code_path': 'code/routing_evolution_exp/train.py'}}
9
+ 2026-05-24 00:08:55,908 INFO MainThread:72724 [wandb_init.py:init():892] starting backend
10
+ 2026-05-24 00:08:56,207 INFO MainThread:72724 [wandb_init.py:init():895] sending inform_init request
11
+ 2026-05-24 00:08:56,229 INFO MainThread:72724 [wandb_init.py:init():903] backend started and connected
12
+ 2026-05-24 00:08:56,235 INFO MainThread:72724 [wandb_init.py:init():973] updated telemetry
13
+ 2026-05-24 00:08:56,260 INFO MainThread:72724 [wandb_init.py:init():997] communicating run to backend with 90.0 second timeout
14
+ 2026-05-24 00:08:56,831 INFO MainThread:72724 [wandb_init.py:init():1044] starting run threads in backend
15
+ 2026-05-24 00:08:57,079 INFO MainThread:72724 [wandb_run.py:_console_start():2529] atexit reg
16
+ 2026-05-24 00:08:57,079 INFO MainThread:72724 [wandb_run.py:_redirect():2377] redirect: wrap_raw
17
+ 2026-05-24 00:08:57,079 INFO MainThread:72724 [wandb_run.py:_redirect():2446] Wrapping output streams.
18
+ 2026-05-24 00:08:57,079 INFO MainThread:72724 [wandb_run.py:_redirect():2469] Redirects installed.
19
+ 2026-05-24 00:08:57,085 INFO MainThread:72724 [wandb_init.py:init():1084] run started, returning control to user process
20
+ 2026-05-24 05:52:08,772 INFO MainThread:72724 [wandb_run.py:_finish():2295] finishing run nikita/routing-evolution/j30yot8x
21
+ 2026-05-24 05:52:08,772 INFO MainThread:72724 [wandb_run.py:_atexit_cleanup():2494] got exitcode: 0
22
+ 2026-05-24 05:52:08,772 INFO MainThread:72724 [wandb_run.py:_restore():2476] restore
23
+ 2026-05-24 05:52:08,773 INFO MainThread:72724 [wandb_run.py:_restore():2482] restore done
24
+ 2026-05-24 05:52:10,068 INFO MainThread:72724 [wandb_run.py:_footer_sync_info():3870] logging synced files
routing_rerun/N_6.0/train.log ADDED
The diff for this file is too large to render. See raw diff
 
routing_rerun/N_8.0/train.log ADDED
The diff for this file is too large to render. See raw diff