narySt commited on
Commit
a0fc652
·
verified ·
1 Parent(s): b6b4484

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +2 -0
  2. hnet_train_rerun/21-40-57/.hydra/config.yaml +55 -0
  3. hnet_train_rerun/21-40-57/.hydra/hydra.yaml +160 -0
  4. hnet_train_rerun/21-40-57/.hydra/overrides.yaml +1 -0
  5. hnet_train_rerun/21-40-57/eval_results/.ipynb_checkpoints/metrics_checkpoint_latest-checkpoint.txt +17 -0
  6. hnet_train_rerun/21-40-57/eval_results/.ipynb_checkpoints/metrics_checkpoint_step_27000-checkpoint.txt +17 -0
  7. hnet_train_rerun/21-40-57/eval_results/.ipynb_checkpoints/metrics_checkpoint_step_3000-checkpoint.txt +17 -0
  8. hnet_train_rerun/21-40-57/eval_results/.ipynb_checkpoints/metrics_model_best-checkpoint.txt +17 -0
  9. hnet_train_rerun/21-40-57/eval_results/eval_config.yaml +29 -0
  10. hnet_train_rerun/21-40-57/eval_results/metrics_checkpoint_latest.txt +17 -0
  11. hnet_train_rerun/21-40-57/eval_results/metrics_checkpoint_step_12000.txt +17 -0
  12. hnet_train_rerun/21-40-57/eval_results/metrics_checkpoint_step_15000.txt +17 -0
  13. hnet_train_rerun/21-40-57/eval_results/metrics_checkpoint_step_18000.txt +17 -0
  14. hnet_train_rerun/21-40-57/eval_results/metrics_checkpoint_step_21000.txt +17 -0
  15. hnet_train_rerun/21-40-57/eval_results/metrics_checkpoint_step_24000.txt +17 -0
  16. hnet_train_rerun/21-40-57/eval_results/metrics_checkpoint_step_27000.txt +17 -0
  17. hnet_train_rerun/21-40-57/eval_results/metrics_checkpoint_step_3000.txt +17 -0
  18. hnet_train_rerun/21-40-57/eval_results/metrics_checkpoint_step_6000.txt +17 -0
  19. hnet_train_rerun/21-40-57/eval_results/metrics_checkpoint_step_9000.txt +17 -0
  20. hnet_train_rerun/21-40-57/eval_results/metrics_initial_checkpoint.txt +17 -0
  21. hnet_train_rerun/21-40-57/eval_results/metrics_model_best.txt +17 -0
  22. hnet_train_rerun/21-40-57/eval_results/metrics_model_final.txt +17 -0
  23. hnet_train_rerun/21-40-57/eval_results/predictions_checkpoint_latest.txt +0 -0
  24. hnet_train_rerun/21-40-57/eval_results/predictions_checkpoint_step_12000.txt +0 -0
  25. hnet_train_rerun/21-40-57/eval_results/predictions_checkpoint_step_15000.txt +0 -0
  26. hnet_train_rerun/21-40-57/eval_results/predictions_checkpoint_step_18000.txt +0 -0
  27. hnet_train_rerun/21-40-57/eval_results/predictions_checkpoint_step_21000.txt +0 -0
  28. hnet_train_rerun/21-40-57/eval_results/predictions_checkpoint_step_24000.txt +0 -0
  29. hnet_train_rerun/21-40-57/eval_results/predictions_checkpoint_step_27000.txt +0 -0
  30. hnet_train_rerun/21-40-57/eval_results/predictions_checkpoint_step_3000.txt +0 -0
  31. hnet_train_rerun/21-40-57/eval_results/predictions_checkpoint_step_6000.txt +0 -0
  32. hnet_train_rerun/21-40-57/eval_results/predictions_checkpoint_step_9000.txt +0 -0
  33. hnet_train_rerun/21-40-57/eval_results/predictions_initial_checkpoint.txt +3 -0
  34. hnet_train_rerun/21-40-57/eval_results/predictions_model_best.txt +0 -0
  35. hnet_train_rerun/21-40-57/eval_results/predictions_model_final.txt +0 -0
  36. hnet_train_rerun/21-40-57/eval_results/summary.txt +18 -0
  37. hnet_train_rerun/21-40-57/model_best.pt +3 -0
  38. hnet_train_rerun/21-40-57/model_final.pt +3 -0
  39. hnet_train_rerun/21-40-57/train.log +0 -0
  40. hnet_train_rerun/21-40-57/wandb/debug-internal.log +13 -0
  41. hnet_train_rerun/21-40-57/wandb/debug.log +24 -0
  42. hnet_train_rerun/21-40-57/wandb/run-20260524_214058-3nxcy117/files/code/code_completion_exp/train_hnet/train.py +284 -0
  43. hnet_train_rerun/21-40-57/wandb/run-20260524_214058-3nxcy117/files/config.yaml +129 -0
  44. hnet_train_rerun/21-40-57/wandb/run-20260524_214058-3nxcy117/files/output.log +0 -0
  45. hnet_train_rerun/21-40-57/wandb/run-20260524_214058-3nxcy117/files/requirements.txt +245 -0
  46. hnet_train_rerun/21-40-57/wandb/run-20260524_214058-3nxcy117/files/wandb-metadata.json +40 -0
  47. hnet_train_rerun/21-40-57/wandb/run-20260524_214058-3nxcy117/files/wandb-summary.json +1 -0
  48. hnet_train_rerun/21-40-57/wandb/run-20260524_214058-3nxcy117/logs/debug-core.log +16 -0
  49. hnet_train_rerun/21-40-57/wandb/run-20260524_214058-3nxcy117/logs/debug-internal.log +13 -0
  50. hnet_train_rerun/21-40-57/wandb/run-20260524_214058-3nxcy117/logs/debug.log +24 -0
.gitattributes CHANGED
@@ -54,3 +54,5 @@ routing_rerun/N_8.0/wandb/run-20260524_111710-wj0poul7/run-wj0poul7.wandb filter
54
  routing_rerun/N_2.5/wandb/run-20260523_172819-hcp9axm4/run-hcp9axm4.wandb filter=lfs diff=lfs merge=lfs -text
55
  routing_rerun/N_4.0/wandb/run-20260524_000855-j30yot8x/run-j30yot8x.wandb filter=lfs diff=lfs merge=lfs -text
56
  routing_rerun/N_6.0/wandb/run-20260524_055222-tjovd8fs/run-tjovd8fs.wandb filter=lfs diff=lfs merge=lfs -text
 
 
 
54
  routing_rerun/N_2.5/wandb/run-20260523_172819-hcp9axm4/run-hcp9axm4.wandb filter=lfs diff=lfs merge=lfs -text
55
  routing_rerun/N_4.0/wandb/run-20260524_000855-j30yot8x/run-j30yot8x.wandb filter=lfs diff=lfs merge=lfs -text
56
  routing_rerun/N_6.0/wandb/run-20260524_055222-tjovd8fs/run-tjovd8fs.wandb filter=lfs diff=lfs merge=lfs -text
57
+ hnet_train_rerun/21-40-57/wandb/run-20260524_214058-3nxcy117/run-3nxcy117.wandb filter=lfs diff=lfs merge=lfs -text
58
+ hnet_train_rerun/21-40-57/eval_results/predictions_initial_checkpoint.txt filter=lfs diff=lfs merge=lfs -text
hnet_train_rerun/21-40-57/.hydra/config.yaml ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model:
2
+ config_path: ${oc.env:PROJECT_ROOT}/hnet_project/configs/hnet_2stage_XL_code.json
3
+ checkpoint_path: ${oc.env:PROJECT_ROOT}/hnet_project/checkpoints/hnet_2stage_XL_code.pt
4
+ training:
5
+ epochs: 3
6
+ batch_size: 8
7
+ eval_batch_size: 24
8
+ gradient_accumulation_steps: 4
9
+ lr: 0.0001
10
+ weight_decay: 0.1
11
+ betas:
12
+ - 0.9
13
+ - 0.95
14
+ eps: 1.0e-08
15
+ lr_scheduler: wsd
16
+ warmup_ratio: 0.1
17
+ decay_ratio: 0.2
18
+ warmup_steps: 100
19
+ min_lr_ratio: 0.1
20
+ lr_multiplier:
21
+ - 2.0
22
+ - 1.5
23
+ - 1.0
24
+ load_balancing_weight: 0.01
25
+ load_balancing_N: 4.0
26
+ max_grad_norm: 1.0
27
+ use_amp: true
28
+ resume: false
29
+ resume_checkpoint: null
30
+ warmup_model: true
31
+ data:
32
+ path: ${oc.env:PROJECT_ROOT}/code_completion_exp/datasets/data_V4_full
33
+ max_context_len: 4096
34
+ max_target_len: 256
35
+ num_workers: 0
36
+ pin_memory: true
37
+ max_train_samples: null
38
+ max_val_samples: null
39
+ logging:
40
+ log_interval: 10
41
+ save_interval: 3000
42
+ eval_interval: 2000
43
+ save_every_epoch: false
44
+ tracking:
45
+ enabled: true
46
+ backend: wandb
47
+ project: hnet_rerun
48
+ run_name: hnet_train
49
+ entity: null
50
+ base_url: https://wandb.platun0v.ru
51
+ local_dir: ${paths.output_dir}
52
+ paths:
53
+ output_dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S}
54
+ seed: 42
55
+ device: cuda
hnet_train_rerun/21-40-57/.hydra/hydra.yaml ADDED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ hydra:
2
+ run:
3
+ dir: ${paths.output_dir}
4
+ sweep:
5
+ dir: outputs/multirun/${now:%Y-%m-%d}/${now:%H-%M-%S}
6
+ subdir: ${hydra.job.num}
7
+ launcher:
8
+ _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
9
+ sweeper:
10
+ _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
11
+ max_batch_size: null
12
+ params: null
13
+ help:
14
+ app_name: ${hydra.job.name}
15
+ header: '${hydra.help.app_name} is powered by Hydra.
16
+
17
+ '
18
+ footer: 'Powered by Hydra (https://hydra.cc)
19
+
20
+ Use --hydra-help to view Hydra specific help
21
+
22
+ '
23
+ template: '${hydra.help.header}
24
+
25
+ == Configuration groups ==
26
+
27
+ Compose your configuration from those groups (group=option)
28
+
29
+
30
+ $APP_CONFIG_GROUPS
31
+
32
+
33
+ == Config ==
34
+
35
+ Override anything in the config (foo.bar=value)
36
+
37
+
38
+ $CONFIG
39
+
40
+
41
+ ${hydra.help.footer}
42
+
43
+ '
44
+ hydra_help:
45
+ template: 'Hydra (${hydra.runtime.version})
46
+
47
+ See https://hydra.cc for more info.
48
+
49
+
50
+ == Flags ==
51
+
52
+ $FLAGS_HELP
53
+
54
+
55
+ == Configuration groups ==
56
+
57
+ Compose your configuration from those groups (For example, append hydra/job_logging=disabled
58
+ to command line)
59
+
60
+
61
+ $HYDRA_CONFIG_GROUPS
62
+
63
+
64
+ Use ''--cfg hydra'' to Show the Hydra config.
65
+
66
+ '
67
+ hydra_help: ???
68
+ hydra_logging:
69
+ version: 1
70
+ formatters:
71
+ simple:
72
+ format: '[%(asctime)s][HYDRA] %(message)s'
73
+ handlers:
74
+ console:
75
+ class: logging.StreamHandler
76
+ formatter: simple
77
+ stream: ext://sys.stdout
78
+ root:
79
+ level: INFO
80
+ handlers:
81
+ - console
82
+ loggers:
83
+ logging_example:
84
+ level: DEBUG
85
+ disable_existing_loggers: false
86
+ job_logging:
87
+ version: 1
88
+ formatters:
89
+ simple:
90
+ format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
91
+ handlers:
92
+ console:
93
+ class: logging.StreamHandler
94
+ formatter: simple
95
+ stream: ext://sys.stdout
96
+ file:
97
+ class: logging.FileHandler
98
+ formatter: simple
99
+ filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log
100
+ root:
101
+ level: INFO
102
+ handlers:
103
+ - console
104
+ - file
105
+ disable_existing_loggers: false
106
+ env: {}
107
+ mode: RUN
108
+ searchpath: []
109
+ callbacks: {}
110
+ output_subdir: .hydra
111
+ overrides:
112
+ hydra:
113
+ - hydra.mode=RUN
114
+ task: []
115
+ job:
116
+ name: train
117
+ chdir: false
118
+ override_dirname: ''
119
+ id: ???
120
+ num: ???
121
+ config_name: config
122
+ env_set: {}
123
+ env_copy: []
124
+ config:
125
+ override_dirname:
126
+ kv_sep: '='
127
+ item_sep: ','
128
+ exclude_keys: []
129
+ runtime:
130
+ version: 1.3.2
131
+ version_base: '1.3'
132
+ cwd: /workspace/byte-llms-code/code_completion_exp/train_hnet
133
+ config_sources:
134
+ - path: hydra.conf
135
+ schema: pkg
136
+ provider: hydra
137
+ - path: /workspace/byte-llms-code/code_completion_exp/train_hnet/configs
138
+ schema: file
139
+ provider: main
140
+ - path: ''
141
+ schema: structured
142
+ provider: schema
143
+ output_dir: /workspace/byte-llms-code/code_completion_exp/train_hnet/outputs/2026-05-24/21-40-57
144
+ choices:
145
+ paths: default
146
+ tracking: wandb
147
+ logging: default
148
+ data: default
149
+ training: default
150
+ model: hnet_xl_code
151
+ hydra/env: default
152
+ hydra/callbacks: null
153
+ hydra/job_logging: default
154
+ hydra/hydra_logging: default
155
+ hydra/hydra_help: default
156
+ hydra/help: default
157
+ hydra/sweeper: basic
158
+ hydra/launcher: basic
159
+ hydra/output: default
160
+ verbose: false
hnet_train_rerun/21-40-57/.hydra/overrides.yaml ADDED
@@ -0,0 +1 @@
 
 
1
+ []
hnet_train_rerun/21-40-57/eval_results/.ipynb_checkpoints/metrics_checkpoint_latest-checkpoint.txt ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Checkpoint: checkpoint_latest.pt
2
+ ================================================================================
3
+
4
+ exact_match: 0.360191463901077
5
+ token_accuracy: 0.47747439249588164
6
+ bleu: 19.726981132809208
7
+ bpb: 1.3918926184016422
8
+ num_samples: 35098
9
+ gen_wall_time_s: 6738.63248833298
10
+ gen_samples_per_s: 5.208475170706725
11
+ gen_time_per_sample_ms: 191.99477144945524
12
+ gen_chars_per_s: 151.54513942822675
13
+ gen_batch_mean_ms: 3071.391289121686
14
+ gen_batch_p50_ms: 2684.8264584841672
15
+ gen_batch_p95_ms: 7208.787969502737
16
+ gen_batch_max_ms: 12667.352943972219
17
+ gen_num_batches: 2194
hnet_train_rerun/21-40-57/eval_results/.ipynb_checkpoints/metrics_checkpoint_step_27000-checkpoint.txt ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Checkpoint: checkpoint_step_27000.pt
2
+ ================================================================================
3
+
4
+ exact_match: 0.36064733033221263
5
+ token_accuracy: 0.4773234898368026
6
+ bleu: 19.665145938230435
7
+ bpb: 1.3918926184016422
8
+ num_samples: 35098
9
+ gen_wall_time_s: 6899.235165899969
10
+ gen_samples_per_s: 5.087230563392696
11
+ gen_time_per_sample_ms: 196.57060704028632
12
+ gen_chars_per_s: 148.02785750045956
13
+ gen_batch_mean_ms: 3144.5921448951544
14
+ gen_batch_p50_ms: 2730.471632006811
15
+ gen_batch_p95_ms: 7366.315296370885
16
+ gen_batch_max_ms: 12734.633811982349
17
+ gen_num_batches: 2194
hnet_train_rerun/21-40-57/eval_results/.ipynb_checkpoints/metrics_checkpoint_step_3000-checkpoint.txt ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Checkpoint: checkpoint_step_3000.pt
2
+ ================================================================================
3
+
4
+ exact_match: 0.36452219499686594
5
+ token_accuracy: 0.4787851175154458
6
+ bleu: 20.091589434631988
7
+ bpb: 1.3343184788590108
8
+ num_samples: 35098
9
+ gen_wall_time_s: 6209.05275386211
10
+ gen_samples_per_s: 5.652714092043846
11
+ gen_time_per_sample_ms: 176.90616997726679
12
+ gen_chars_per_s: 156.16971516256734
13
+ gen_batch_mean_ms: 2830.014928834143
14
+ gen_batch_p50_ms: 2555.72874849895
15
+ gen_batch_p95_ms: 6390.771690043039
16
+ gen_batch_max_ms: 9818.601018982008
17
+ gen_num_batches: 2194
hnet_train_rerun/21-40-57/eval_results/.ipynb_checkpoints/metrics_model_best-checkpoint.txt ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Checkpoint: model_best.pt
2
+ ================================================================================
3
+
4
+ exact_match: 0.3734400820559576
5
+ token_accuracy: 0.4819705178766448
6
+ bleu: 20.14870477803318
7
+ bpb: 1.3264302330352213
8
+ num_samples: 35098
9
+ gen_wall_time_s: 6241.421260637813
10
+ gen_samples_per_s: 5.623398667439622
11
+ gen_time_per_sample_ms: 177.82840220633122
12
+ gen_chars_per_s: 156.0005260565445
13
+ gen_batch_mean_ms: 2844.76812244203
14
+ gen_batch_p50_ms: 2527.919982007006
15
+ gen_batch_p95_ms: 6982.412858091992
16
+ gen_batch_max_ms: 10548.128249007277
17
+ gen_num_batches: 2194
hnet_train_rerun/21-40-57/eval_results/eval_config.yaml ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model:
2
+ config_path: /workspace/byte-llms-code/hnet_project/configs/hnet_2stage_XL_code.json
3
+ checkpoint_path: /workspace/byte-llms-code/hnet_project/checkpoints/hnet_2stage_XL_code.pt
4
+ data:
5
+ path: /workspace/byte-llms-code/code_completion_exp/datasets/data_V4_full
6
+ max_context_len: 4096
7
+ max_target_len: 256
8
+ num_workers: 0
9
+ pin_memory: true
10
+ max_train_samples: null
11
+ max_val_samples: null
12
+ paths:
13
+ checkpoints_dir: outputs/2026-05-24/21-40-57
14
+ initial_checkpoint: auto
15
+ output_dir: outputs/2026-05-24/21-40-57/eval_results
16
+ evaluation:
17
+ batch_size: 16
18
+ max_samples: null
19
+ compute_bpb: true
20
+ bleu_tokenize: none
21
+ use_amp: true
22
+ save_predictions: true
23
+ generation:
24
+ max_length: 256
25
+ temperature: 0.1
26
+ top_k: 0
27
+ top_p: 1.0
28
+ seed: 42
29
+ device: cuda
hnet_train_rerun/21-40-57/eval_results/metrics_checkpoint_latest.txt ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Checkpoint: checkpoint_latest.pt
2
+ ================================================================================
3
+
4
+ exact_match: 0.360191463901077
5
+ token_accuracy: 0.47747439249588164
6
+ bleu: 19.726981132809208
7
+ bpb: 1.3918926184016422
8
+ num_samples: 35098
9
+ gen_wall_time_s: 6738.63248833298
10
+ gen_samples_per_s: 5.208475170706725
11
+ gen_time_per_sample_ms: 191.99477144945524
12
+ gen_chars_per_s: 151.54513942822675
13
+ gen_batch_mean_ms: 3071.391289121686
14
+ gen_batch_p50_ms: 2684.8264584841672
15
+ gen_batch_p95_ms: 7208.787969502737
16
+ gen_batch_max_ms: 12667.352943972219
17
+ gen_num_batches: 2194
hnet_train_rerun/21-40-57/eval_results/metrics_checkpoint_step_12000.txt ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Checkpoint: checkpoint_step_12000.pt
2
+ ================================================================================
3
+
4
+ exact_match: 0.3741523733546071
5
+ token_accuracy: 0.48474383789958975
6
+ bleu: 20.866813431088616
7
+ bpb: 1.335828999773514
8
+ num_samples: 35098
9
+ gen_wall_time_s: 6534.712194961612
10
+ gen_samples_per_s: 5.37100930429059
11
+ gen_time_per_sample_ms: 186.18474542599614
12
+ gen_chars_per_s: 152.32575977370146
13
+ gen_batch_mean_ms: 2978.44676160511
14
+ gen_batch_p50_ms: 2646.6345914814156
15
+ gen_batch_p95_ms: 7165.557659140904
16
+ gen_batch_max_ms: 10396.650541981217
17
+ gen_num_batches: 2194
hnet_train_rerun/21-40-57/eval_results/metrics_checkpoint_step_15000.txt ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Checkpoint: checkpoint_step_15000.pt
2
+ ================================================================================
3
+
4
+ exact_match: 0.374294831614337
5
+ token_accuracy: 0.48255671666768235
6
+ bleu: 20.82318093842354
7
+ bpb: 1.3339916854689853
8
+ num_samples: 35098
9
+ gen_wall_time_s: 6575.688067426148
10
+ gen_samples_per_s: 5.3375402908578105
11
+ gen_time_per_sample_ms: 187.35221572243853
12
+ gen_chars_per_s: 149.28567017386504
13
+ gen_batch_mean_ms: 2997.1230936308793
14
+ gen_batch_p50_ms: 2574.5291139755864
15
+ gen_batch_p95_ms: 7272.849028115161
16
+ gen_batch_max_ms: 10864.763604011387
17
+ gen_num_batches: 2194
hnet_train_rerun/21-40-57/eval_results/metrics_checkpoint_step_18000.txt ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Checkpoint: checkpoint_step_18000.pt
2
+ ================================================================================
3
+
4
+ exact_match: 0.3744087982221209
5
+ token_accuracy: 0.4840512333874068
6
+ bleu: 21.172392577542894
7
+ bpb: 1.3298998572417937
8
+ num_samples: 35098
9
+ gen_wall_time_s: 6599.062013073475
10
+ gen_samples_per_s: 5.318634668149347
11
+ gen_time_per_sample_ms: 188.0181780464264
12
+ gen_chars_per_s: 149.91266910965837
13
+ gen_batch_mean_ms: 3007.7766695868163
14
+ gen_batch_p50_ms: 2595.0252109905705
15
+ gen_batch_p95_ms: 7258.518203350831
16
+ gen_batch_max_ms: 9577.27041101316
17
+ gen_num_batches: 2194
hnet_train_rerun/21-40-57/eval_results/metrics_checkpoint_step_21000.txt ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Checkpoint: checkpoint_step_21000.pt
2
+ ================================================================================
3
+
4
+ exact_match: 0.3584819647843182
5
+ token_accuracy: 0.47367861022520286
6
+ bleu: 19.489348058114224
7
+ bpb: 1.4021340103652056
8
+ num_samples: 35098
9
+ gen_wall_time_s: 6887.526788596122
10
+ gen_samples_per_s: 5.095878546434516
11
+ gen_time_per_sample_ms: 196.23701602929287
12
+ gen_chars_per_s: 147.68871776792565
13
+ gen_batch_mean_ms: 3139.255601000967
14
+ gen_batch_p50_ms: 2698.8774175115395
15
+ gen_batch_p95_ms: 7320.19043576729
16
+ gen_batch_max_ms: 12885.085268993862
17
+ gen_num_batches: 2194
hnet_train_rerun/21-40-57/eval_results/metrics_checkpoint_step_24000.txt ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Checkpoint: checkpoint_step_24000.pt
2
+ ================================================================================
3
+
4
+ exact_match: 0.3584819647843182
5
+ token_accuracy: 0.4737385843589394
6
+ bleu: 19.275525429511855
7
+ bpb: 1.390391807400738
8
+ num_samples: 35098
9
+ gen_wall_time_s: 6441.2251053551445
10
+ gen_samples_per_s: 5.44896342325003
11
+ gen_time_per_sample_ms: 183.52114380748603
12
+ gen_chars_per_s: 154.68423843340665
13
+ gen_batch_mean_ms: 2935.836419943092
14
+ gen_batch_p50_ms: 2630.608787992969
15
+ gen_batch_p95_ms: 6906.270672706885
16
+ gen_batch_max_ms: 12607.093763013836
17
+ gen_num_batches: 2194
hnet_train_rerun/21-40-57/eval_results/metrics_checkpoint_step_27000.txt ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Checkpoint: checkpoint_step_27000.pt
2
+ ================================================================================
3
+
4
+ exact_match: 0.36064733033221263
5
+ token_accuracy: 0.4773234898368026
6
+ bleu: 19.665145938230435
7
+ bpb: 1.3918926184016422
8
+ num_samples: 35098
9
+ gen_wall_time_s: 6899.235165899969
10
+ gen_samples_per_s: 5.087230563392696
11
+ gen_time_per_sample_ms: 196.57060704028632
12
+ gen_chars_per_s: 148.02785750045956
13
+ gen_batch_mean_ms: 3144.5921448951544
14
+ gen_batch_p50_ms: 2730.471632006811
15
+ gen_batch_p95_ms: 7366.315296370885
16
+ gen_batch_max_ms: 12734.633811982349
17
+ gen_num_batches: 2194
hnet_train_rerun/21-40-57/eval_results/metrics_checkpoint_step_3000.txt ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Checkpoint: checkpoint_step_3000.pt
2
+ ================================================================================
3
+
4
+ exact_match: 0.36452219499686594
5
+ token_accuracy: 0.4787851175154458
6
+ bleu: 20.091589434631988
7
+ bpb: 1.3343184788590108
8
+ num_samples: 35098
9
+ gen_wall_time_s: 6209.05275386211
10
+ gen_samples_per_s: 5.652714092043846
11
+ gen_time_per_sample_ms: 176.90616997726679
12
+ gen_chars_per_s: 156.16971516256734
13
+ gen_batch_mean_ms: 2830.014928834143
14
+ gen_batch_p50_ms: 2555.72874849895
15
+ gen_batch_p95_ms: 6390.771690043039
16
+ gen_batch_max_ms: 9818.601018982008
17
+ gen_num_batches: 2194
hnet_train_rerun/21-40-57/eval_results/metrics_checkpoint_step_6000.txt ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Checkpoint: checkpoint_step_6000.pt
2
+ ================================================================================
3
+
4
+ exact_match: 0.37295572397287596
5
+ token_accuracy: 0.4793123094974845
6
+ bleu: 20.604051625274728
7
+ bpb: 1.3291212170475468
8
+ num_samples: 35098
9
+ gen_wall_time_s: 6369.97152560309
10
+ gen_samples_per_s: 5.509914739638813
11
+ gen_time_per_sample_ms: 181.49101161328537
12
+ gen_chars_per_s: 152.10209278106132
13
+ gen_batch_mean_ms: 2903.3598567014997
14
+ gen_batch_p50_ms: 2524.245548993349
15
+ gen_batch_p95_ms: 7080.392475571716
16
+ gen_batch_max_ms: 9737.063668959308
17
+ gen_num_batches: 2194
hnet_train_rerun/21-40-57/eval_results/metrics_checkpoint_step_9000.txt ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Checkpoint: checkpoint_step_9000.pt
2
+ ================================================================================
3
+
4
+ exact_match: 0.37175907459114477
5
+ token_accuracy: 0.48313711151029376
6
+ bleu: 20.088935445782226
7
+ bpb: 1.3244859882122255
8
+ num_samples: 35098
9
+ gen_wall_time_s: 6331.07599858538
10
+ gen_samples_per_s: 5.543765389618182
11
+ gen_time_per_sample_ms: 180.38281379524133
12
+ gen_chars_per_s: 155.2655504719327
13
+ gen_batch_mean_ms: 2885.631722235816
14
+ gen_batch_p50_ms: 2578.9438789943233
15
+ gen_batch_p95_ms: 6757.976525239064
16
+ gen_batch_max_ms: 10634.354435955174
17
+ gen_num_batches: 2194
hnet_train_rerun/21-40-57/eval_results/metrics_initial_checkpoint.txt ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Checkpoint: hnet_2stage_XL_code.pt
2
+ ================================================================================
3
+
4
+ exact_match: 0.0
5
+ token_accuracy: 0.47427544958835494
6
+ bleu: 1.153252081120271
7
+ bpb: 1.8748032956517047
8
+ num_samples: 35098
9
+ gen_wall_time_s: 28995.99217226659
10
+ gen_samples_per_s: 1.2104431464693841
11
+ gen_time_per_sample_ms: 826.1437168005751
12
+ gen_chars_per_s: 308.8898957755469
13
+ gen_batch_mean_ms: 13216.040187906377
14
+ gen_batch_p50_ms: 13210.045973974047
15
+ gen_batch_p95_ms: 13516.947992166388
16
+ gen_batch_max_ms: 34002.42500199238
17
+ gen_num_batches: 2194
hnet_train_rerun/21-40-57/eval_results/metrics_model_best.txt ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Checkpoint: model_best.pt
2
+ ================================================================================
3
+
4
+ exact_match: 0.3734400820559576
5
+ token_accuracy: 0.4819705178766448
6
+ bleu: 20.14870477803318
7
+ bpb: 1.3264302330352213
8
+ num_samples: 35098
9
+ gen_wall_time_s: 6241.421260637813
10
+ gen_samples_per_s: 5.623398667439622
11
+ gen_time_per_sample_ms: 177.82840220633122
12
+ gen_chars_per_s: 156.0005260565445
13
+ gen_batch_mean_ms: 2844.76812244203
14
+ gen_batch_p50_ms: 2527.919982007006
15
+ gen_batch_p95_ms: 6982.412858091992
16
+ gen_batch_max_ms: 10548.128249007277
17
+ gen_num_batches: 2194
hnet_train_rerun/21-40-57/eval_results/metrics_model_final.txt ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Checkpoint: model_final.pt
2
+ ================================================================================
3
+
4
+ exact_match: 0.3617015214542139
5
+ token_accuracy: 0.4779309697720693
6
+ bleu: 19.715273255435186
7
+ bpb: 1.392667822078947
8
+ num_samples: 35098
9
+ gen_wall_time_s: 6712.276489993092
10
+ gen_samples_per_s: 5.228926438344038
11
+ gen_time_per_sample_ms: 191.2438455180663
12
+ gen_chars_per_s: 151.35192978337005
13
+ gen_batch_mean_ms: 3059.3785277999505
14
+ gen_batch_p50_ms: 2652.1117465163115
15
+ gen_batch_p95_ms: 7237.394248164492
16
+ gen_batch_max_ms: 12913.40813797433
17
+ gen_num_batches: 2194
hnet_train_rerun/21-40-57/eval_results/predictions_checkpoint_latest.txt ADDED
The diff for this file is too large to render. See raw diff
 
hnet_train_rerun/21-40-57/eval_results/predictions_checkpoint_step_12000.txt ADDED
The diff for this file is too large to render. See raw diff
 
hnet_train_rerun/21-40-57/eval_results/predictions_checkpoint_step_15000.txt ADDED
The diff for this file is too large to render. See raw diff
 
hnet_train_rerun/21-40-57/eval_results/predictions_checkpoint_step_18000.txt ADDED
The diff for this file is too large to render. See raw diff
 
hnet_train_rerun/21-40-57/eval_results/predictions_checkpoint_step_21000.txt ADDED
The diff for this file is too large to render. See raw diff
 
hnet_train_rerun/21-40-57/eval_results/predictions_checkpoint_step_24000.txt ADDED
The diff for this file is too large to render. See raw diff
 
hnet_train_rerun/21-40-57/eval_results/predictions_checkpoint_step_27000.txt ADDED
The diff for this file is too large to render. See raw diff
 
hnet_train_rerun/21-40-57/eval_results/predictions_checkpoint_step_3000.txt ADDED
The diff for this file is too large to render. See raw diff
 
hnet_train_rerun/21-40-57/eval_results/predictions_checkpoint_step_6000.txt ADDED
The diff for this file is too large to render. See raw diff
 
hnet_train_rerun/21-40-57/eval_results/predictions_checkpoint_step_9000.txt ADDED
The diff for this file is too large to render. See raw diff
 
hnet_train_rerun/21-40-57/eval_results/predictions_initial_checkpoint.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9ab78a591f30ed69ed1b65b60d94f0483695a8d7ed94fe1716ae5f4d755fe002
3
+ size 17000214
hnet_train_rerun/21-40-57/eval_results/predictions_model_best.txt ADDED
The diff for this file is too large to render. See raw diff
 
hnet_train_rerun/21-40-57/eval_results/predictions_model_final.txt ADDED
The diff for this file is too large to render. See raw diff
 
hnet_train_rerun/21-40-57/eval_results/summary.txt ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ EVALUATION SUMMARY
2
+ ==================================================================================================
3
+
4
+ Checkpoint Exact Match Token Acc BLEU BPB ms/sample samp/s
5
+ --------------------------------------------------------------------------------------------------
6
+ initial_checkpoint 0.00% 47.43% 1.15 1.87 826.1 1.21
7
+ checkpoint_latest 36.02% 47.75% 19.73 1.39 192.0 5.21
8
+ checkpoint_step_12000 37.42% 48.47% 20.87 1.34 186.2 5.37
9
+ checkpoint_step_15000 37.43% 48.26% 20.82 1.33 187.4 5.34
10
+ checkpoint_step_18000 37.44% 48.41% 21.17 1.33 188.0 5.32
11
+ checkpoint_step_21000 35.85% 47.37% 19.49 1.40 196.2 5.10
12
+ checkpoint_step_24000 35.85% 47.37% 19.28 1.39 183.5 5.45
13
+ checkpoint_step_27000 36.06% 47.73% 19.67 1.39 196.6 5.09
14
+ checkpoint_step_3000 36.45% 47.88% 20.09 1.33 176.9 5.65
15
+ checkpoint_step_6000 37.30% 47.93% 20.60 1.33 181.5 5.51
16
+ checkpoint_step_9000 37.18% 48.31% 20.09 1.32 180.4 5.54
17
+ model_best 37.34% 48.20% 20.15 1.33 177.8 5.62
18
+ model_final 36.17% 47.79% 19.72 1.39 191.2 5.23
hnet_train_rerun/21-40-57/model_best.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bf1579f27c0c45243bada34e891daf065c4d2a608dd49f80f8b9e1a0addd7479
3
+ size 3315165139
hnet_train_rerun/21-40-57/model_final.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:293b790edc4e3ba0f5c7ae637cea1e677509621fd740389cf3f37b59b6baaf0b
3
+ size 3315165484
hnet_train_rerun/21-40-57/train.log ADDED
The diff for this file is too large to render. See raw diff
 
hnet_train_rerun/21-40-57/wandb/debug-internal.log ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2026-05-24T21:40:59.042920084Z","level":"INFO","msg":"stream: starting","core version":"0.24.0"}
2
+ {"time":"2026-05-24T21:40:59.47364371Z","level":"INFO","msg":"stream: created new stream","id":"3nxcy117"}
3
+ {"time":"2026-05-24T21:40:59.473752145Z","level":"INFO","msg":"handler: started","stream_id":"3nxcy117"}
4
+ {"time":"2026-05-24T21:40:59.473916706Z","level":"INFO","msg":"stream: started","id":"3nxcy117"}
5
+ {"time":"2026-05-24T21:40:59.473946539Z","level":"INFO","msg":"sender: started","stream_id":"3nxcy117"}
6
+ {"time":"2026-05-24T21:40:59.473947844Z","level":"INFO","msg":"writer: started","stream_id":"3nxcy117"}
7
+ {"time":"2026-05-24T21:40:59.608257445Z","level":"ERROR","msg":"git repo not found","error":"repository does not exist"}
8
+ {"time":"2026-05-25T09:11:44.25084313Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
9
+ {"time":"2026-05-25T09:11:44.382625783Z","level":"INFO","msg":"handler: operation stats","stats":{}}
10
+ {"time":"2026-05-25T09:11:44.387865173Z","level":"INFO","msg":"stream: closing","id":"3nxcy117"}
11
+ {"time":"2026-05-25T09:11:44.387891658Z","level":"INFO","msg":"handler: closed","stream_id":"3nxcy117"}
12
+ {"time":"2026-05-25T09:11:44.388086651Z","level":"INFO","msg":"sender: closed","stream_id":"3nxcy117"}
13
+ {"time":"2026-05-25T09:11:44.388103059Z","level":"INFO","msg":"stream: closed","id":"3nxcy117"}
hnet_train_rerun/21-40-57/wandb/debug.log ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2026-05-24 21:40:58,725 INFO MainThread:83034 [wandb_setup.py:_flush():81] Current SDK version is 0.24.0
2
+ 2026-05-24 21:40:58,725 INFO MainThread:83034 [wandb_setup.py:_flush():81] Configure stats pid to 83034
3
+ 2026-05-24 21:40:58,725 INFO MainThread:83034 [wandb_setup.py:_flush():81] Loading settings from environment variables
4
+ 2026-05-24 21:40:58,725 INFO MainThread:83034 [wandb_init.py:setup_run_log_directory():717] Logging user logs to outputs/2026-05-24/21-40-57/wandb/run-20260524_214058-3nxcy117/logs/debug.log
5
+ 2026-05-24 21:40:58,725 INFO MainThread:83034 [wandb_init.py:setup_run_log_directory():718] Logging internal logs to outputs/2026-05-24/21-40-57/wandb/run-20260524_214058-3nxcy117/logs/debug-internal.log
6
+ 2026-05-24 21:40:58,725 INFO MainThread:83034 [wandb_init.py:init():844] calling init triggers
7
+ 2026-05-24 21:40:58,725 INFO MainThread:83034 [wandb_init.py:init():849] wandb.init called with sweep_config: {}
8
+ config: {'model': {'config_path': '/workspace/byte-llms-code/hnet_project/configs/hnet_2stage_XL_code.json', 'checkpoint_path': '/workspace/byte-llms-code/hnet_project/checkpoints/hnet_2stage_XL_code.pt'}, 'training': {'epochs': 3, 'batch_size': 8, 'eval_batch_size': 24, 'gradient_accumulation_steps': 4, 'lr': 0.0001, 'weight_decay': 0.1, 'betas': [0.9, 0.95], 'eps': 1e-08, 'lr_scheduler': 'wsd', 'warmup_ratio': 0.1, 'decay_ratio': 0.2, 'warmup_steps': 100, 'min_lr_ratio': 0.1, 'lr_multiplier': [2.0, 1.5, 1.0], 'load_balancing_weight': 0.01, 'load_balancing_N': 4.0, 'max_grad_norm': 1.0, 'use_amp': True, 'resume': False, 'resume_checkpoint': None, 'warmup_model': True}, 'data': {'path': '/workspace/byte-llms-code/code_completion_exp/datasets/data_V4_full', 'max_context_len': 4096, 'max_target_len': 256, 'num_workers': 0, 'pin_memory': True, 'max_train_samples': None, 'max_val_samples': None}, 'logging': {'log_interval': 10, 'save_interval': 3000, 'eval_interval': 2000, 'save_every_epoch': False}, 'tracking': {'enabled': True, 'backend': 'wandb', 'project': 'hnet_rerun', 'run_name': 'hnet_train', 'entity': None, 'base_url': 'https://wandb.platun0v.ru', 'local_dir': 'outputs/2026-05-24/21-40-57'}, 'paths': {'output_dir': 'outputs/2026-05-24/21-40-57'}, 'seed': 42, 'device': 'cuda', '_wandb': {'code_path': 'code/code_completion_exp/train_hnet/train.py'}}
9
+ 2026-05-24 21:40:58,725 INFO MainThread:83034 [wandb_init.py:init():892] starting backend
10
+ 2026-05-24 21:40:59,018 INFO MainThread:83034 [wandb_init.py:init():895] sending inform_init request
11
+ 2026-05-24 21:40:59,039 INFO MainThread:83034 [wandb_init.py:init():903] backend started and connected
12
+ 2026-05-24 21:40:59,047 INFO MainThread:83034 [wandb_init.py:init():973] updated telemetry
13
+ 2026-05-24 21:40:59,077 INFO MainThread:83034 [wandb_init.py:init():997] communicating run to backend with 90.0 second timeout
14
+ 2026-05-24 21:40:59,606 INFO MainThread:83034 [wandb_init.py:init():1044] starting run threads in backend
15
+ 2026-05-24 21:40:59,860 INFO MainThread:83034 [wandb_run.py:_console_start():2529] atexit reg
16
+ 2026-05-24 21:40:59,860 INFO MainThread:83034 [wandb_run.py:_redirect():2377] redirect: wrap_raw
17
+ 2026-05-24 21:40:59,860 INFO MainThread:83034 [wandb_run.py:_redirect():2446] Wrapping output streams.
18
+ 2026-05-24 21:40:59,861 INFO MainThread:83034 [wandb_run.py:_redirect():2469] Redirects installed.
19
+ 2026-05-24 21:40:59,865 INFO MainThread:83034 [wandb_init.py:init():1084] run started, returning control to user process
20
+ 2026-05-25 09:11:42,798 INFO MainThread:83034 [wandb_run.py:_finish():2295] finishing run nikita/hnet_rerun/3nxcy117
21
+ 2026-05-25 09:11:42,798 INFO MainThread:83034 [wandb_run.py:_atexit_cleanup():2494] got exitcode: 0
22
+ 2026-05-25 09:11:42,799 INFO MainThread:83034 [wandb_run.py:_restore():2476] restore
23
+ 2026-05-25 09:11:42,799 INFO MainThread:83034 [wandb_run.py:_restore():2482] restore done
24
+ 2026-05-25 09:11:44,386 INFO MainThread:83034 [wandb_run.py:_footer_sync_info():3870] logging synced files
hnet_train_rerun/21-40-57/wandb/run-20260524_214058-3nxcy117/files/code/code_completion_exp/train_hnet/train.py ADDED
@@ -0,0 +1,284 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Training Pipeline для HNet модели на задаче Code Completion.
3
+
4
+ Конфигурация через Hydra + OmegaConf, логирование в Trackio.
5
+ Поддержка DDP через Accelerate для multi-GPU тренировки.
6
+
7
+ Использование:
8
+ # Базовый запуск (single GPU)
9
+ python train.py
10
+
11
+ # Multi-GPU с Accelerate
12
+ accelerate launch train.py
13
+
14
+ # Multi-GPU с указанием количества GPU
15
+ accelerate launch --num_processes=4 train.py
16
+
17
+ # Переопределение параметров через CLI
18
+ python train.py training.lr=1e-4 training.epochs=5
19
+
20
+ # Выбор другого конфига модели
21
+ python train.py model=hnet_small
22
+
23
+ # Multirun (sweep)
24
+ python train.py --multirun training.lr=1e-4,3e-4,1e-3
25
+
26
+ # Без логирования
27
+ python train.py tracking.enabled=false
28
+ """
29
+
30
+ import os
31
+ import math
32
+ from pathlib import Path
33
+
34
+ import torch
35
+ import hydra
36
+ from hydra.core.hydra_config import HydraConfig
37
+ from omegaconf import DictConfig, OmegaConf
38
+ from accelerate import Accelerator
39
+ from accelerate.utils import set_seed as accelerate_set_seed
40
+
41
+ # HNet imports
42
+ from hnet.load_utils import load_from_pretrained, load_from_config
43
+ from hnet.utils.tokenizers import ByteTokenizer
44
+ from hnet.utils.train import group_params
45
+
46
+ # Ensure repo root is on sys.path (needed when running from subdirectory)
47
+ import sys
48
+ sys.path.insert(0, str(Path(__file__).resolve().parents[2]))
49
+
50
+ # Shared training library
51
+ from training_lib.utils import log_message
52
+ from training_lib.checkpointing import save_checkpoint, load_checkpoint
53
+ from training_lib.schedulers import get_lr_scheduler
54
+ from training_lib.tracking import init_tracking, finish_tracking
55
+ from training_lib.hnet.train_loop import train_epoch
56
+ from training_lib.hnet.data import create_dataloaders
57
+
58
+
59
@hydra.main(version_base=None, config_path="configs", config_name="config")
def main(cfg: DictConfig):
    """Main training entry point with DDP support through Accelerate.

    Steps, in order:
      1. Build the ``Accelerator`` (bf16 mixed precision when
         ``cfg.training.use_amp`` is set) and seed all processes.
      2. Fill in ``cfg.paths.output_dir`` from Hydra when it is ``None`` and
         resolve the config.
      3. Start experiment tracking, create the byte tokenizer and load the
         model (from a pretrained checkpoint when ``cfg.model.checkpoint_path``
         is set, otherwise from the config with freshly initialized weights).
      4. Apply per-stage LR multipliers, optionally warm the model up, build
         dataloaders, the AdamW optimizer and the LR scheduler.
      5. Hand everything to ``accelerator.prepare`` and optionally resume from
         ``cfg.training.resume_checkpoint``.
      6. Run ``train_epoch`` for every remaining epoch, then save the final
         ``state_dict`` to ``<output_dir>/model_final.pt`` on the main process.

    Args:
        cfg: Hydra/OmegaConf configuration. The sections read here are
            ``training``, ``model``, ``paths``, ``logging`` and ``seed``; the
            whole object is also forwarded to the helper functions.
    """

    # === Accelerator Setup ===
    mixed_precision = "bf16" if cfg.training.use_amp else "no"

    accelerator = Accelerator(
        mixed_precision=mixed_precision,
        gradient_accumulation_steps=cfg.training.gradient_accumulation_steps,
    )

    # === Setup ===
    accelerate_set_seed(cfg.seed)

    if cfg.paths.output_dir is None:
        cfg.paths.output_dir = HydraConfig.get().runtime.output_dir

    OmegaConf.resolve(cfg)

    log_message(
        f"CUDA_VISIBLE_DEVICES: {os.environ.get('CUDA_VISIBLE_DEVICES', 'not set')}",
        cfg,
        accelerator,
    )
    log_message(f"Number of processes: {accelerator.num_processes}", cfg, accelerator)
    log_message(f"Process index: {accelerator.process_index}", cfg, accelerator)
    log_message(f"Mixed precision: {mixed_precision}", cfg, accelerator)

    log_message("=" * 60, cfg, accelerator)
    log_message(
        "HNet Training Pipeline (Hydra + Trackio + Accelerate)", cfg, accelerator
    )
    log_message("=" * 60, cfg, accelerator)
    log_message(f"Config:\n{OmegaConf.to_yaml(cfg)}", cfg, accelerator)

    # === Experiment Tracking Init ===
    init_tracking(cfg, accelerator)

    # === Tokenizer ===
    log_message("Initializing tokenizer...", cfg, accelerator)
    tokenizer = ByteTokenizer()

    # === Model ===
    log_message("Loading model...", cfg, accelerator)
    if cfg.model.checkpoint_path:
        model = load_from_pretrained(
            model_path=cfg.model.checkpoint_path,
            model_config_path=cfg.model.config_path,
        )
        log_message(f"Loaded pretrained: {cfg.model.checkpoint_path}", cfg, accelerator)
    else:
        model = load_from_config(
            model_config_path=cfg.model.config_path,
            device="cpu",
        )
        model.init_weights()
        log_message("Initialized from scratch", cfg, accelerator)

    model.train()

    # Per-stage LR multipliers (must be applied before accelerator.prepare!)
    lr_multiplier = list(cfg.training.lr_multiplier)
    model.apply_lr_multiplier(lr_multiplier)
    log_message(f"Applied LR multipliers: {lr_multiplier}", cfg, accelerator)

    # Warm up the Triton kernels
    if cfg.training.warmup_model:
        log_message("Warming up model...", cfg, accelerator)
        model = model.to(accelerator.device)
        model.warmup(verbose=accelerator.is_main_process)

    # Log model info
    total_params = sum(p.numel() for p in model.parameters())
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    log_message(f"Total params: {total_params:,}", cfg, accelerator)
    log_message(f"Trainable params: {trainable_params:,}", cfg, accelerator)

    # === Data ===
    log_message("Creating dataloaders...", cfg, accelerator)
    dataloaders = create_dataloaders(cfg, tokenizer)

    train_dataloader = dataloaders["train"]
    val_dataloader = dataloaders.get("validation", None)

    log_message(
        f"Train dataset size: {len(train_dataloader.dataset)}", cfg, accelerator
    )
    log_message(
        f"Train batches per epoch (before DDP split): {len(train_dataloader)}",
        cfg,
        accelerator,
    )

    if val_dataloader:
        log_message(
            f"Validation dataset size: {len(val_dataloader.dataset)}", cfg, accelerator
        )
        log_message(f"Validation batches: {len(val_dataloader)}", cfg, accelerator)
    else:
        log_message("No validation dataset found", cfg, accelerator)

    # === Optimizer ===
    log_message("Creating optimizer...", cfg, accelerator)
    param_groups = group_params(model)

    for group in param_groups:
        # Always derive the group's LR from the base LR and its multiplier.
        # The previous version only multiplied when the group already carried
        # an "lr" key, so a group holding just "lr_multiplier" silently fell
        # back to the unscaled base LR.
        # NOTE(review): assumes group_params() exposes the multiplier under
        # "lr_multiplier" and that the scheduler does not apply it again -- confirm.
        multiplier = group.get("lr_multiplier")
        if multiplier is None:
            multiplier = 1.0
        group["lr"] = cfg.training.lr * multiplier
        if "weight_decay" not in group:
            group["weight_decay"] = cfg.training.weight_decay

    optimizer = torch.optim.AdamW(
        param_groups,
        lr=cfg.training.lr,
        betas=tuple(cfg.training.betas),
        eps=cfg.training.eps,
    )

    # === Scheduler ===
    # The dataloader is not sharded yet, so divide by the process count to get
    # the per-process batch count, then convert batches to optimizer steps.
    steps_per_epoch = math.ceil(len(train_dataloader) / accelerator.num_processes)
    total_steps = (
        cfg.training.epochs
        * steps_per_epoch
        // cfg.training.gradient_accumulation_steps
    )
    scheduler = get_lr_scheduler(optimizer, cfg, total_steps)

    log_message(
        f"Total steps: {total_steps}, Steps per epoch: {steps_per_epoch}",
        cfg,
        accelerator,
    )

    # === Accelerate Prepare ===
    log_message(
        "Preparing model, optimizer, and dataloaders with Accelerate...",
        cfg,
        accelerator,
    )

    if val_dataloader is not None:
        model, optimizer, train_dataloader, val_dataloader, scheduler = (
            accelerator.prepare(
                model, optimizer, train_dataloader, val_dataloader, scheduler
            )
        )
    else:
        model, optimizer, train_dataloader, scheduler = accelerator.prepare(
            model, optimizer, train_dataloader, scheduler
        )

    log_message(
        f"Train batches per epoch (after DDP split): {len(train_dataloader)}",
        cfg,
        accelerator,
    )

    # === Resume ===
    global_step = 0
    start_epoch = 1

    if cfg.training.resume and cfg.training.resume_checkpoint:
        global_step, start_epoch = load_checkpoint(
            model,
            optimizer,
            scheduler,
            cfg.training.resume_checkpoint,
            cfg,
            accelerator,
        )
        # Continue with the epoch after the one stored in the checkpoint.
        start_epoch += 1

    # === Training Loop ===
    log_message("Starting training...", cfg, accelerator)

    best_val_loss = float("inf")

    # Keep `epoch` bound for the KeyboardInterrupt handler even if the
    # interrupt arrives before the loop assigns it.
    epoch = start_epoch - 1

    try:
        for epoch in range(start_epoch, cfg.training.epochs + 1):
            log_message(f"\n{'=' * 60}", cfg, accelerator)
            log_message(f"EPOCH {epoch}/{cfg.training.epochs}", cfg, accelerator)
            log_message(f"{'=' * 60}", cfg, accelerator)

            global_step, best_val_loss = train_epoch(
                model=model,
                dataloader=train_dataloader,
                optimizer=optimizer,
                scheduler=scheduler,
                cfg=cfg,
                epoch=epoch,
                global_step=global_step,
                accelerator=accelerator,
                val_dataloader=val_dataloader,
                best_val_loss=best_val_loss,
            )

            if cfg.logging.save_every_epoch:
                save_checkpoint(
                    model, optimizer, scheduler, global_step, epoch, cfg, accelerator
                )

    except KeyboardInterrupt:
        log_message("Training interrupted by user", cfg, accelerator)
        save_checkpoint(
            model, optimizer, scheduler, global_step, epoch, cfg, accelerator
        )

    # === Final Save ===
    log_message("\nTraining completed!", cfg, accelerator)

    if accelerator.is_main_process:
        final_model_path = Path(cfg.paths.output_dir) / "model_final.pt"
        # Make sure the target directory exists before writing the weights.
        final_model_path.parent.mkdir(parents=True, exist_ok=True)
        unwrapped_model = accelerator.unwrap_model(model)
        torch.save(unwrapped_model.state_dict(), final_model_path)
        log_message(f"Final model: {final_model_path}", cfg, accelerator)

    accelerator.wait_for_everyone()
    accelerator.end_training()
    finish_tracking()
281
+
282
+
283
+ if __name__ == "__main__":
284
+ main()
hnet_train_rerun/21-40-57/wandb/run-20260524_214058-3nxcy117/files/config.yaml ADDED
@@ -0,0 +1,129 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _wandb:
2
+ value:
3
+ cli_version: 0.24.0
4
+ code_path: code/code_completion_exp/train_hnet/train.py
5
+ e:
6
+ 8tbm13mz63ggpgppnq4dj4hkvc961ndv:
7
+ codePath: code_completion_exp/train_hnet/train.py
8
+ codePathLocal: train.py
9
+ cpu_count: 36
10
+ cpu_count_logical: 72
11
+ cudaVersion: "13.2"
12
+ disk:
13
+ /:
14
+ total: "234075717632"
15
+ used: "19460378624"
16
+ email: nikita@local.ru
17
+ executable: /venv/bytellm/bin/python
18
+ git:
19
+ commit: b57d04cd9f885094fe832cedecaaa9b128098ca3
20
+ remote: https://github.com/naryst/byte-llms-code.git
21
+ gpu: NVIDIA A100 80GB PCIe
22
+ gpu_count: 1
23
+ gpu_nvidia:
24
+ - architecture: Ampere
25
+ cudaCores: 6912
26
+ memoryTotal: "85899345920"
27
+ name: NVIDIA A100 80GB PCIe
28
+ uuid: GPU-ec9764c4-224a-9508-002f-de7762d78498
29
+ host: 1a023b226280
30
+ memory:
31
+ total: "404274495488"
32
+ os: Linux-6.8.0-117-generic-x86_64-with-glibc2.39
33
+ program: /workspace/byte-llms-code/code_completion_exp/train_hnet/train.py
34
+ python: CPython 3.12.0
35
+ root: outputs/2026-05-24/21-40-57
36
+ startedAt: "2026-05-24T21:40:58.723386Z"
37
+ writerId: 8tbm13mz63ggpgppnq4dj4hkvc961ndv
38
+ m: []
39
+ python_version: 3.12.0
40
+ t:
41
+ "1":
42
+ - 1
43
+ - 11
44
+ - 49
45
+ - 50
46
+ - 51
47
+ - 71
48
+ - 105
49
+ "2":
50
+ - 1
51
+ - 11
52
+ - 49
53
+ - 50
54
+ - 51
55
+ - 71
56
+ - 105
57
+ "3":
58
+ - 2
59
+ - 13
60
+ - 16
61
+ - 61
62
+ "4": 3.12.0
63
+ "5": 0.24.0
64
+ "6": 4.57.6
65
+ "12": 0.24.0
66
+ "13": linux-x86_64
67
+ data:
68
+ value:
69
+ max_context_len: 4096
70
+ max_target_len: 256
71
+ max_train_samples: null
72
+ max_val_samples: null
73
+ num_workers: 0
74
+ path: /workspace/byte-llms-code/code_completion_exp/datasets/data_V4_full
75
+ pin_memory: true
76
+ device:
77
+ value: cuda
78
+ logging:
79
+ value:
80
+ eval_interval: 2000
81
+ log_interval: 10
82
+ save_every_epoch: false
83
+ save_interval: 3000
84
+ model:
85
+ value:
86
+ checkpoint_path: /workspace/byte-llms-code/hnet_project/checkpoints/hnet_2stage_XL_code.pt
87
+ config_path: /workspace/byte-llms-code/hnet_project/configs/hnet_2stage_XL_code.json
88
+ paths:
89
+ value:
90
+ output_dir: outputs/2026-05-24/21-40-57
91
+ seed:
92
+ value: 42
93
+ tracking:
94
+ value:
95
+ backend: wandb
96
+ base_url: https://wandb.platun0v.ru
97
+ enabled: true
98
+ entity: null
99
+ local_dir: outputs/2026-05-24/21-40-57
100
+ project: hnet_rerun
101
+ run_name: hnet_train
102
+ training:
103
+ value:
104
+ batch_size: 8
105
+ betas:
106
+ - 0.9
107
+ - 0.95
108
+ decay_ratio: 0.2
109
+ epochs: 3
110
+ eps: 1e-08
111
+ eval_batch_size: 24
112
+ gradient_accumulation_steps: 4
113
+ load_balancing_N: 4
114
+ load_balancing_weight: 0.01
115
+ lr: 0.0001
116
+ lr_multiplier:
117
+ - 2
118
+ - 1.5
119
+ - 1
120
+ lr_scheduler: wsd
121
+ max_grad_norm: 1
122
+ min_lr_ratio: 0.1
123
+ resume: false
124
+ resume_checkpoint: null
125
+ use_amp: true
126
+ warmup_model: true
127
+ warmup_ratio: 0.1
128
+ warmup_steps: 100
129
+ weight_decay: 0.1
hnet_train_rerun/21-40-57/wandb/run-20260524_214058-3nxcy117/files/output.log ADDED
The diff for this file is too large to render. See raw diff
 
hnet_train_rerun/21-40-57/wandb/run-20260524_214058-3nxcy117/files/requirements.txt ADDED
@@ -0,0 +1,245 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ setuptools==78.1.1
2
+ wheel==0.45.1
3
+ pip==25.2
4
+ webencodings==0.5.1
5
+ triton==3.2.0
6
+ pytz==2025.2
7
+ pydub==0.25.1
8
+ pure_eval==0.2.3
9
+ ptyprocess==0.7.0
10
+ nvidia-ml-py==13.590.48
11
+ nvidia-cusparselt-cu12==0.6.2
12
+ mpmath==1.3.0
13
+ ipython-genutils==0.2.0
14
+ fastjsonschema==2.21.2
15
+ brotli==1.2.0
16
+ antlr4-python3-runtime==4.9.3
17
+ xxhash==3.6.0
18
+ widgetsnbextension==4.0.14
19
+ websocket-client==1.9.0
20
+ webcolors==24.11.1
21
+ wcwidth==0.2.14
22
+ urllib3==2.5.0
23
+ uri-template==1.3.0
24
+ tzdata==2025.2
25
+ typing_extensions==4.15.0
26
+ types-python-dateutil==2.9.0.20251008
27
+ traitlets==5.14.3
28
+ tqdm==4.67.1
29
+ tornado==6.5.2
30
+ tomlkit==0.13.3
31
+ tinycss2==1.4.0
32
+ tabulate==0.9.0
33
+ sympy==1.13.1
34
+ soupsieve==2.8
35
+ sniffio==1.3.1
36
+ smmap==5.0.2
37
+ six==1.17.0
38
+ shellingham==1.5.4
39
+ Send2Trash==1.8.3
40
+ semantic-version==2.10.0
41
+ safetensors==0.6.2
42
+ rpds-py==0.27.1
43
+ rfc3986-validator==0.1.1
44
+ regex==2025.9.18
45
+ pyzmq==27.1.0
46
+ PyYAML==6.0.3
47
+ python-multipart==0.0.22
48
+ python-json-logger==4.0.0
49
+ python-dotenv==1.2.1
50
+ pyparsing==3.2.5
51
+ PyJWT==2.8.0
52
+ Pygments==2.19.2
53
+ pycparser==2.23
54
+ pyarrow==22.0.0
55
+ psutil==7.1.0
56
+ protobuf==6.33.4
57
+ propcache==0.4.1
58
+ prometheus_client==0.23.1
59
+ portalocker==3.2.0
60
+ platformdirs==4.5.0
61
+ pillow==11.3.0
62
+ pexpect==4.9.0
63
+ pathspec==1.0.4
64
+ parso==0.8.5
65
+ pandocfilters==1.5.1
66
+ packaging==25.0
67
+ orjson==3.11.6
68
+ opt_einsum==3.4.0
69
+ nvidia-nvtx-cu12==12.4.127
70
+ nvidia-nvjitlink-cu12==12.4.127
71
+ nvidia-nccl-cu12==2.21.5
72
+ nvidia-curand-cu12==10.3.5.147
73
+ nvidia-cufile-cu12==1.13.1.3
74
+ nvidia-cufft-cu12==11.2.1.3
75
+ nvidia-cuda-runtime-cu12==12.4.127
76
+ nvidia-cuda-nvrtc-cu12==12.4.127
77
+ nvidia-cuda-cupti-cu12==12.4.127
78
+ nvidia-cublas-cu12==12.4.5.8
79
+ numpy==2.3.3
80
+ ninja==1.13.0
81
+ networkx==3.5
82
+ nest-asyncio==1.6.0
83
+ narwhals==2.15.0
84
+ mypy_extensions==1.1.0
85
+ multidict==6.7.0
86
+ mistune==3.1.4
87
+ mdurl==0.1.2
88
+ MarkupSafe==3.0.3
89
+ lxml==6.0.2
90
+ librt==0.8.0
91
+ lark==1.3.0
92
+ kiwisolver==1.4.9
93
+ jupyterlab_widgets==3.0.15
94
+ jupyterlab_pygments==0.3.0
95
+ jsonpointer==3.0.0
96
+ json5==0.12.1
97
+ itsdangerous==2.2.0
98
+ idna==3.10
99
+ hf-xet==1.1.10
100
+ h11==0.16.0
101
+ groovy==0.1.2
102
+ fsspec==2025.9.0
103
+ frozenlist==1.8.0
104
+ fqdn==1.5.1
105
+ fonttools==4.60.1
106
+ filelock==3.19.1
107
+ ffmpy==1.0.0
108
+ executing==2.2.1
109
+ einops==0.8.1
110
+ dill==0.4.0
111
+ defusedxml==0.7.1
112
+ decorator==5.2.1
113
+ debugpy==1.8.17
114
+ dacite==1.9.2
115
+ cycler==0.12.1
116
+ comm==0.2.3
117
+ colorama==0.4.6
118
+ click==8.3.1
119
+ charset-normalizer==3.4.3
120
+ certifi==2025.10.5
121
+ bleach==6.2.0
122
+ babel==2.17.0
123
+ attrs==25.4.0
124
+ async-lru==2.0.5
125
+ asttokens==3.0.0
126
+ annotated-types==0.7.0
127
+ annotated-doc==0.0.4
128
+ aiohappyeyeballs==2.6.1
129
+ aiofiles==24.1.0
130
+ yarl==1.22.0
131
+ uvicorn==0.40.0
132
+ typing-inspection==0.4.2
133
+ terminado==0.18.1
134
+ stack-data==0.6.3
135
+ sentry-sdk==2.50.0
136
+ scipy==1.17.0
137
+ sacrebleu==2.6.0
138
+ rfc3987-syntax==1.1.0
139
+ rfc3339-validator==0.1.4
140
+ requests==2.32.5
141
+ reportlab==4.4.9
142
+ referencing==0.36.2
143
+ python-dateutil==2.9.0.post0
144
+ pydantic_core==2.41.5
145
+ prompt_toolkit==3.0.52
146
+ plotly==6.5.2
147
+ pathlib2==2.3.7.post1
148
+ orderedmultidict==1.0.2
149
+ optree==0.17.0
150
+ omegaconf==2.3.0
151
+ nvidia-cusparse-cu12==12.3.1.170
152
+ nvidia-cudnn-cu12==9.1.0.70
153
+ mypy==1.19.1
154
+ multiprocess==0.70.16
155
+ matplotlib-inline==0.1.7
156
+ markdown-it-py==4.0.0
157
+ jupyter_core==5.8.1
158
+ Jinja2==3.1.6
159
+ jedi==0.19.2
160
+ ipython_pygments_lexers==1.1.1
161
+ httpcore==1.0.9
162
+ gitdb==4.0.12
163
+ ftfy==6.3.1
164
+ contourpy==1.3.3
165
+ cffi==2.0.0
166
+ beautifulsoup4==4.14.2
167
+ anyio==4.11.0
168
+ aiosignal==1.4.0
169
+ starlette==0.50.0
170
+ rich==14.2.0
171
+ pydantic==2.12.5
172
+ pandas==2.3.3
173
+ nvidia-cusolver-cu12==11.6.1.9
174
+ matplotlib==3.10.7
175
+ jupyter_server_terminals==0.5.3
176
+ jupyter_client==8.6.3
177
+ jsonschema-specifications==2025.9.1
178
+ ipython==9.6.0
179
+ hydra-core==1.3.2
180
+ huggingface-hub==0.35.3
181
+ httpx==0.28.1
182
+ GitPython==3.1.46
183
+ furl==2.1.4
184
+ cryptography==46.0.4
185
+ arrow==1.3.0
186
+ argon2-cffi-bindings==25.1.0
187
+ aiohttp==3.13.1
188
+ wandb==0.24.0
189
+ typer==0.21.1
190
+ torch==2.6.0
191
+ tokenizers==0.22.1
192
+ seaborn==0.13.2
193
+ safehttpx==0.1.7
194
+ jsonschema==4.25.1
195
+ joypy==0.2.6
196
+ isoduration==20.11.0
197
+ ipywidgets==8.1.7
198
+ ipykernel==6.30.1
199
+ gradio_client==2.0.3
200
+ fastapi==0.128.0
201
+ Authlib==1.6.6
202
+ argon2-cffi==25.1.0
203
+ transformers==4.57.6
204
+ nbformat==5.10.4
205
+ mlstm_kernels==2.0.2
206
+ jupyter-console==6.6.3
207
+ gradio==6.5.1
208
+ datasets==4.3.0
209
+ clearml==1.16.4
210
+ accelerate==1.10.1
211
+ xlstm==2.0.4
212
+ nbclient==0.10.2
213
+ jupyter-events==0.12.0
214
+ trackio==0.15.0
215
+ nbconvert==7.16.6
216
+ jupyter_server==2.17.0
217
+ notebook_shim==0.2.4
218
+ jupyterlab_server==2.27.3
219
+ jupyter-lsp==2.3.0
220
+ nbclassic==1.3.3
221
+ jupyterlab==4.4.9
222
+ notebook==7.4.7
223
+ jupyter_contrib_core==0.4.2
224
+ jupyter==1.1.1
225
+ jupyter_nbextensions_configurator==0.6.4
226
+ causal-conv1d==1.5.0.post8
227
+ flash_attn==2.7.4.post1
228
+ mamba-ssm==2.2.4
229
+ hnet==0.0.1
230
+ autocommand==2.2.2
231
+ backports.tarfile==1.2.0
232
+ importlib_metadata==8.0.0
233
+ inflect==7.3.1
234
+ jaraco.collections==5.1.0
235
+ jaraco.context==5.3.0
236
+ jaraco.functools==4.0.1
237
+ jaraco.text==3.12.1
238
+ more-itertools==10.3.0
239
+ packaging==24.2
240
+ platformdirs==4.2.2
241
+ tomli==2.0.1
242
+ typeguard==4.3.0
243
+ typing_extensions==4.12.2
244
+ wheel==0.45.1
245
+ zipp==3.19.2
hnet_train_rerun/21-40-57/wandb/run-20260524_214058-3nxcy117/files/wandb-metadata.json ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Linux-6.8.0-117-generic-x86_64-with-glibc2.39",
3
+ "python": "CPython 3.12.0",
4
+ "startedAt": "2026-05-24T21:40:58.723386Z",
5
+ "program": "/workspace/byte-llms-code/code_completion_exp/train_hnet/train.py",
6
+ "codePath": "code_completion_exp/train_hnet/train.py",
7
+ "codePathLocal": "train.py",
8
+ "git": {
9
+ "remote": "https://github.com/naryst/byte-llms-code.git",
10
+ "commit": "b57d04cd9f885094fe832cedecaaa9b128098ca3"
11
+ },
12
+ "email": "nikita@local.ru",
13
+ "root": "outputs/2026-05-24/21-40-57",
14
+ "host": "1a023b226280",
15
+ "executable": "/venv/bytellm/bin/python",
16
+ "cpu_count": 36,
17
+ "cpu_count_logical": 72,
18
+ "gpu": "NVIDIA A100 80GB PCIe",
19
+ "gpu_count": 1,
20
+ "disk": {
21
+ "/": {
22
+ "total": "234075717632",
23
+ "used": "19460378624"
24
+ }
25
+ },
26
+ "memory": {
27
+ "total": "404274495488"
28
+ },
29
+ "gpu_nvidia": [
30
+ {
31
+ "name": "NVIDIA A100 80GB PCIe",
32
+ "memoryTotal": "85899345920",
33
+ "cudaCores": 6912,
34
+ "architecture": "Ampere",
35
+ "uuid": "GPU-ec9764c4-224a-9508-002f-de7762d78498"
36
+ }
37
+ ],
38
+ "cudaVersion": "13.2",
39
+ "writerId": "8tbm13mz63ggpgppnq4dj4hkvc961ndv"
40
+ }
hnet_train_rerun/21-40-57/wandb/run-20260524_214058-3nxcy117/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"_wandb":{"runtime":41443},"train/hard_boundary_ratio_stage1":0.41428831398941984,"train/step_time":1.216536521911621,"_runtime":41443,"best/val_loss":0.2925644644940842,"train/chunk_len_stage0":2.823436495191354,"train/hard_boundary_ratio_stage0":0.35573476731596815,"train/loss_avg":0.15543903209881482,"train/lb_loss":1.1170897483825684,"val/perplexity":1.3941280151478466,"train/lm_loss":0.10374006628990173,"epoch/soft_boundary_ratio_stage0":0.3510054961482771,"train/soft_boundary_ratio_stage1":0.38348458492953624,"best/step":8000,"train/loss":0.2474285177886486,"_timestamp":1.7797002983859267e+09,"val/lm_loss":0.3314347568349802,"epoch/chunk_len_stage1":2.4207175499344795,"val/time":391.4486954212189,"epoch/hard_boundary_ratio_stage0":0.35573612602838667,"epoch/chunk_len_stage0":2.8234248996329034,"epoch/loss":0.15544442606447362,"epoch/lb_loss":1.0893232608403356,"epoch/time":13811.206544399261,"train/lr":1.000002524293968e-05,"epoch/hard_boundary_ratio_stage1":0.41428892502981024,"train/soft_boundary_ratio_stage0":0.35100414221000376,"epoch/soft_boundary_ratio_stage1":0.38348499507634093,"_step":29660,"train/epoch":3,"epoch/lm_loss":0.14452074673337414,"val/loss":0.34228874700274164,"val/lb_loss":1.0853990667925593,"train/chunk_len_stage1":2.4207217612011815,"best/val_perplexity":1.3263933377204589}
hnet_train_rerun/21-40-57/wandb/run-20260524_214058-3nxcy117/logs/debug-core.log ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2026-05-24T21:40:58.831451528Z","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmpphm5sb3s/port-83034.txt","pid":83034,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false}
2
+ {"time":"2026-05-24T21:40:58.832303978Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":83034}
3
+ {"time":"2026-05-24T21:40:58.832293139Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/tmp/wandb-83034-83044-3980445079/socket","Net":"unix"}}
4
+ {"time":"2026-05-24T21:40:59.018472912Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"}
5
+ {"time":"2026-05-24T21:40:59.042712675Z","level":"INFO","msg":"handleInformInit: received","streamId":"3nxcy117","id":"1(@)"}
6
+ {"time":"2026-05-24T21:40:59.473935735Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"3nxcy117","id":"1(@)"}
7
+ {"time":"2026-05-25T09:11:44.387806371Z","level":"INFO","msg":"handleInformFinish: finish message received","streamId":"3nxcy117","id":"1(@)"}
8
+ {"time":"2026-05-25T09:11:44.402875303Z","level":"INFO","msg":"handleInformFinish: stream closed","streamId":"3nxcy117","id":"1(@)"}
9
+ {"time":"2026-05-25T09:11:44.423772595Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"}
10
+ {"time":"2026-05-25T09:11:44.423812263Z","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"}
11
+ {"time":"2026-05-25T09:11:44.423823672Z","level":"INFO","msg":"server is shutting down"}
12
+ {"time":"2026-05-25T09:11:44.423847378Z","level":"INFO","msg":"connection: closing","id":"1(@)"}
13
+ {"time":"2026-05-25T09:11:44.423931379Z","level":"INFO","msg":"server: listener closed","addr":{"Name":"/tmp/wandb-83034-83044-3980445079/socket","Net":"unix"}}
14
+ {"time":"2026-05-25T09:11:44.423990926Z","level":"INFO","msg":"connection: closed successfully","id":"1(@)"}
15
+ {"time":"2026-05-25T09:11:44.424021181Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"}
16
+ {"time":"2026-05-25T09:11:44.424042817Z","level":"INFO","msg":"server is closed"}
hnet_train_rerun/21-40-57/wandb/run-20260524_214058-3nxcy117/logs/debug-internal.log ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2026-05-24T21:40:59.042920084Z","level":"INFO","msg":"stream: starting","core version":"0.24.0"}
2
+ {"time":"2026-05-24T21:40:59.47364371Z","level":"INFO","msg":"stream: created new stream","id":"3nxcy117"}
3
+ {"time":"2026-05-24T21:40:59.473752145Z","level":"INFO","msg":"handler: started","stream_id":"3nxcy117"}
4
+ {"time":"2026-05-24T21:40:59.473916706Z","level":"INFO","msg":"stream: started","id":"3nxcy117"}
5
+ {"time":"2026-05-24T21:40:59.473946539Z","level":"INFO","msg":"sender: started","stream_id":"3nxcy117"}
6
+ {"time":"2026-05-24T21:40:59.473947844Z","level":"INFO","msg":"writer: started","stream_id":"3nxcy117"}
7
+ {"time":"2026-05-24T21:40:59.608257445Z","level":"ERROR","msg":"git repo not found","error":"repository does not exist"}
8
+ {"time":"2026-05-25T09:11:44.25084313Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
9
+ {"time":"2026-05-25T09:11:44.382625783Z","level":"INFO","msg":"handler: operation stats","stats":{}}
10
+ {"time":"2026-05-25T09:11:44.387865173Z","level":"INFO","msg":"stream: closing","id":"3nxcy117"}
11
+ {"time":"2026-05-25T09:11:44.387891658Z","level":"INFO","msg":"handler: closed","stream_id":"3nxcy117"}
12
+ {"time":"2026-05-25T09:11:44.388086651Z","level":"INFO","msg":"sender: closed","stream_id":"3nxcy117"}
13
+ {"time":"2026-05-25T09:11:44.388103059Z","level":"INFO","msg":"stream: closed","id":"3nxcy117"}
hnet_train_rerun/21-40-57/wandb/run-20260524_214058-3nxcy117/logs/debug.log ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2026-05-24 21:40:58,725 INFO MainThread:83034 [wandb_setup.py:_flush():81] Current SDK version is 0.24.0
2
+ 2026-05-24 21:40:58,725 INFO MainThread:83034 [wandb_setup.py:_flush():81] Configure stats pid to 83034
3
+ 2026-05-24 21:40:58,725 INFO MainThread:83034 [wandb_setup.py:_flush():81] Loading settings from environment variables
4
+ 2026-05-24 21:40:58,725 INFO MainThread:83034 [wandb_init.py:setup_run_log_directory():717] Logging user logs to outputs/2026-05-24/21-40-57/wandb/run-20260524_214058-3nxcy117/logs/debug.log
5
+ 2026-05-24 21:40:58,725 INFO MainThread:83034 [wandb_init.py:setup_run_log_directory():718] Logging internal logs to outputs/2026-05-24/21-40-57/wandb/run-20260524_214058-3nxcy117/logs/debug-internal.log
6
+ 2026-05-24 21:40:58,725 INFO MainThread:83034 [wandb_init.py:init():844] calling init triggers
7
+ 2026-05-24 21:40:58,725 INFO MainThread:83034 [wandb_init.py:init():849] wandb.init called with sweep_config: {}
8
+ config: {'model': {'config_path': '/workspace/byte-llms-code/hnet_project/configs/hnet_2stage_XL_code.json', 'checkpoint_path': '/workspace/byte-llms-code/hnet_project/checkpoints/hnet_2stage_XL_code.pt'}, 'training': {'epochs': 3, 'batch_size': 8, 'eval_batch_size': 24, 'gradient_accumulation_steps': 4, 'lr': 0.0001, 'weight_decay': 0.1, 'betas': [0.9, 0.95], 'eps': 1e-08, 'lr_scheduler': 'wsd', 'warmup_ratio': 0.1, 'decay_ratio': 0.2, 'warmup_steps': 100, 'min_lr_ratio': 0.1, 'lr_multiplier': [2.0, 1.5, 1.0], 'load_balancing_weight': 0.01, 'load_balancing_N': 4.0, 'max_grad_norm': 1.0, 'use_amp': True, 'resume': False, 'resume_checkpoint': None, 'warmup_model': True}, 'data': {'path': '/workspace/byte-llms-code/code_completion_exp/datasets/data_V4_full', 'max_context_len': 4096, 'max_target_len': 256, 'num_workers': 0, 'pin_memory': True, 'max_train_samples': None, 'max_val_samples': None}, 'logging': {'log_interval': 10, 'save_interval': 3000, 'eval_interval': 2000, 'save_every_epoch': False}, 'tracking': {'enabled': True, 'backend': 'wandb', 'project': 'hnet_rerun', 'run_name': 'hnet_train', 'entity': None, 'base_url': 'https://wandb.platun0v.ru', 'local_dir': 'outputs/2026-05-24/21-40-57'}, 'paths': {'output_dir': 'outputs/2026-05-24/21-40-57'}, 'seed': 42, 'device': 'cuda', '_wandb': {'code_path': 'code/code_completion_exp/train_hnet/train.py'}}
9
+ 2026-05-24 21:40:58,725 INFO MainThread:83034 [wandb_init.py:init():892] starting backend
10
+ 2026-05-24 21:40:59,018 INFO MainThread:83034 [wandb_init.py:init():895] sending inform_init request
11
+ 2026-05-24 21:40:59,039 INFO MainThread:83034 [wandb_init.py:init():903] backend started and connected
12
+ 2026-05-24 21:40:59,047 INFO MainThread:83034 [wandb_init.py:init():973] updated telemetry
13
+ 2026-05-24 21:40:59,077 INFO MainThread:83034 [wandb_init.py:init():997] communicating run to backend with 90.0 second timeout
14
+ 2026-05-24 21:40:59,606 INFO MainThread:83034 [wandb_init.py:init():1044] starting run threads in backend
15
+ 2026-05-24 21:40:59,860 INFO MainThread:83034 [wandb_run.py:_console_start():2529] atexit reg
16
+ 2026-05-24 21:40:59,860 INFO MainThread:83034 [wandb_run.py:_redirect():2377] redirect: wrap_raw
17
+ 2026-05-24 21:40:59,860 INFO MainThread:83034 [wandb_run.py:_redirect():2446] Wrapping output streams.
18
+ 2026-05-24 21:40:59,861 INFO MainThread:83034 [wandb_run.py:_redirect():2469] Redirects installed.
19
+ 2026-05-24 21:40:59,865 INFO MainThread:83034 [wandb_init.py:init():1084] run started, returning control to user process
20
+ 2026-05-25 09:11:42,798 INFO MainThread:83034 [wandb_run.py:_finish():2295] finishing run nikita/hnet_rerun/3nxcy117
21
+ 2026-05-25 09:11:42,798 INFO MainThread:83034 [wandb_run.py:_atexit_cleanup():2494] got exitcode: 0
22
+ 2026-05-25 09:11:42,799 INFO MainThread:83034 [wandb_run.py:_restore():2476] restore
23
+ 2026-05-25 09:11:42,799 INFO MainThread:83034 [wandb_run.py:_restore():2482] restore done
24
+ 2026-05-25 09:11:44,386 INFO MainThread:83034 [wandb_run.py:_footer_sync_info():3870] logging synced files