narySt committed
Commit 75b6166 · verified · 1 Parent(s): c2b723f

Add files using upload-large-folder tool

This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50)
  1. .gitattributes +5 -0
  2. lr_sweep/hnet_xl_code_lr_1e-4/model_final.pt +3 -0
  3. lr_sweep/hnet_xl_code_lr_1e-4/wandb/run-20260425_200722-d5usyud5/files/code/code_completion_exp/train_hnet/train.py +284 -0
  4. lr_sweep/hnet_xl_code_lr_1e-4/wandb/run-20260425_200722-d5usyud5/files/config.yaml +167 -0
  5. lr_sweep/hnet_xl_code_lr_1e-4/wandb/run-20260425_200722-d5usyud5/files/output.log +0 -0
  6. lr_sweep/hnet_xl_code_lr_1e-4/wandb/run-20260425_200722-d5usyud5/files/wandb-summary.json +1 -0
  7. lr_sweep/hnet_xl_code_lr_1e-4/wandb/run-20260425_200722-d5usyud5/run-d5usyud5.wandb +3 -0
  8. lr_sweep/hnet_xl_code_lr_2e-4/.hydra/config.yaml +55 -0
  9. lr_sweep/hnet_xl_code_lr_2e-4/.hydra/hydra.yaml +166 -0
  10. lr_sweep/hnet_xl_code_lr_2e-4/.hydra/overrides.yaml +6 -0
  11. lr_sweep/hnet_xl_code_lr_2e-4/wandb/debug-internal.log +15 -0
  12. lr_sweep/hnet_xl_code_lr_2e-4/wandb/debug.log +24 -0
  13. lr_sweep/hnet_xl_code_lr_2e-4/wandb/run-20260425_220653-ln6tfunh/files/code/code_completion_exp/train_hnet/train.py +284 -0
  14. lr_sweep/hnet_xl_code_lr_2e-4/wandb/run-20260425_220653-ln6tfunh/files/output.log +76 -0
  15. lr_sweep/hnet_xl_code_lr_2e-4/wandb/run-20260425_220653-ln6tfunh/files/requirements.txt +245 -0
  16. lr_sweep/hnet_xl_code_lr_2e-4/wandb/run-20260425_220653-ln6tfunh/files/wandb-metadata.json +69 -0
  17. lr_sweep/hnet_xl_code_lr_2e-4/wandb/run-20260425_220653-ln6tfunh/logs/debug-core.log +7 -0
  18. lr_sweep/hnet_xl_code_lr_2e-4/wandb/run-20260425_220653-ln6tfunh/logs/debug-internal.log +7 -0
  19. lr_sweep/hnet_xl_code_lr_2e-4/wandb/run-20260425_220653-ln6tfunh/logs/debug.log +19 -0
  20. lr_sweep/hnet_xl_code_lr_2e-4/wandb/run-20260425_220653-ln6tfunh/run-ln6tfunh.wandb +3 -0
  21. lr_sweep/hnet_xl_code_lr_2e-4/wandb/run-20260425_222011-khn25dwv/files/code/code_completion_exp/train_hnet/train.py +284 -0
  22. lr_sweep/hnet_xl_code_lr_2e-4/wandb/run-20260425_222011-khn25dwv/files/config.yaml +167 -0
  23. lr_sweep/hnet_xl_code_lr_2e-4/wandb/run-20260425_222011-khn25dwv/files/output.log +0 -0
  24. lr_sweep/hnet_xl_code_lr_2e-4/wandb/run-20260425_222011-khn25dwv/files/requirements.txt +245 -0
  25. lr_sweep/hnet_xl_code_lr_2e-4/wandb/run-20260425_222011-khn25dwv/files/wandb-metadata.json +1 -0
  26. lr_sweep/hnet_xl_code_lr_2e-4/wandb/run-20260425_222011-khn25dwv/files/wandb-summary.json +1 -0
  27. lr_sweep/hnet_xl_code_lr_2e-4/wandb/run-20260425_222011-khn25dwv/logs/debug-core.log +16 -0
  28. lr_sweep/hnet_xl_code_lr_2e-4/wandb/run-20260425_222011-khn25dwv/logs/debug-internal.log +15 -0
  29. lr_sweep/hnet_xl_code_lr_2e-4/wandb/run-20260425_222011-khn25dwv/logs/debug.log +24 -0
  30. lr_sweep/hnet_xl_code_lr_2e-4/wandb/run-20260425_222011-khn25dwv/run-khn25dwv.wandb +3 -0
  31. lr_sweep/hnet_xl_code_lr_2e-4/wandb/run-20260425_222011-khn25dwv/run-khn25dwv.wandb.synced +0 -0
  32. lr_sweep/hnet_xl_code_lr_5e-4/.hydra/config.yaml +55 -0
  33. lr_sweep/hnet_xl_code_lr_5e-4/.hydra/hydra.yaml +166 -0
  34. lr_sweep/hnet_xl_code_lr_5e-4/.hydra/overrides.yaml +6 -0
  35. lr_sweep/hnet_xl_code_lr_5e-4/model_final.pt +3 -0
  36. lr_sweep/hnet_xl_code_lr_5e-4/wandb/debug-internal.log +13 -0
  37. lr_sweep/hnet_xl_code_lr_5e-4/wandb/debug.log +24 -0
  38. lr_sweep/hnet_xl_code_lr_5e-4/wandb/run-20260426_001941-i6lt8av0/files/code/code_completion_exp/train_hnet/train.py +284 -0
  39. lr_sweep/hnet_xl_code_lr_5e-4/wandb/run-20260426_001941-i6lt8av0/files/config.yaml +151 -0
  40. lr_sweep/hnet_xl_code_lr_5e-4/wandb/run-20260426_001941-i6lt8av0/files/output.log +0 -0
  41. lr_sweep/hnet_xl_code_lr_5e-4/wandb/run-20260426_001941-i6lt8av0/files/requirements.txt +245 -0
  42. lr_sweep/hnet_xl_code_lr_5e-4/wandb/run-20260426_001941-i6lt8av0/files/wandb-metadata.json +69 -0
  43. lr_sweep/hnet_xl_code_lr_5e-4/wandb/run-20260426_001941-i6lt8av0/files/wandb-summary.json +1 -0
  44. lr_sweep/hnet_xl_code_lr_5e-4/wandb/run-20260426_001941-i6lt8av0/logs/debug-core.log +16 -0
  45. lr_sweep/hnet_xl_code_lr_5e-4/wandb/run-20260426_001941-i6lt8av0/logs/debug-internal.log +13 -0
  46. lr_sweep/hnet_xl_code_lr_5e-4/wandb/run-20260426_001941-i6lt8av0/logs/debug.log +24 -0
  47. lr_sweep/hnet_xl_code_lr_5e-4/wandb/run-20260426_001941-i6lt8av0/run-i6lt8av0.wandb +3 -0
  48. lr_sweep/hnet_xl_code_lr_5e-5/model_best.pt +3 -0
  49. lr_sweep/pythia_1b_lr_1e-4/.hydra/config.yaml +49 -0
  50. lr_sweep/pythia_1b_lr_1e-4/.hydra/hydra.yaml +167 -0
.gitattributes CHANGED
@@ -41,3 +41,8 @@ lr_sweep/hnet_xl_code_lr_5e-5/wandb/run-20260425_180603-5xd22ofy/run-5xd22ofy.wa
  lr_sweep/pythia_1b_lr_1e-5/wandb/run-20260425_180609-3z5g26qd/run-3z5g26qd.wandb filter=lfs diff=lfs merge=lfs -text
  lr_sweep/pythia_1b_lr_2e-5/wandb/run-20260425_184822-bhvwo83l/run-bhvwo83l.wandb filter=lfs diff=lfs merge=lfs -text
  lr_sweep/pythia_1b_lr_5e-5/wandb/run-20260425_193045-vg3if73m/run-vg3if73m.wandb filter=lfs diff=lfs merge=lfs -text
+ lr_sweep/hnet_xl_code_lr_5e-4/wandb/run-20260426_001941-i6lt8av0/run-i6lt8av0.wandb filter=lfs diff=lfs merge=lfs -text
+ lr_sweep/hnet_xl_code_lr_1e-4/wandb/run-20260425_200722-d5usyud5/run-d5usyud5.wandb filter=lfs diff=lfs merge=lfs -text
+ lr_sweep/pythia_1b_lr_1e-4/wandb/run-20260425_201333-p8ozhgpm/run-p8ozhgpm.wandb filter=lfs diff=lfs merge=lfs -text
+ lr_sweep/hnet_xl_code_lr_2e-4/wandb/run-20260425_220653-ln6tfunh/run-ln6tfunh.wandb filter=lfs diff=lfs merge=lfs -text
+ lr_sweep/hnet_xl_code_lr_2e-4/wandb/run-20260425_222011-khn25dwv/run-khn25dwv.wandb filter=lfs diff=lfs merge=lfs -text
lr_sweep/hnet_xl_code_lr_1e-4/model_final.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5507a5a2456bffdc961775096b62d8f9bebb9dd301fc350b5878213252b348e7
+ size 3315165484
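The checkpoint itself lives in Git LFS, so the diff only records the pointer (spec version, sha256 oid, byte size). A minimal sketch, assuming the LFS object has actually been pulled to the path below, of verifying a downloaded file against such a pointer:

    import hashlib
    from pathlib import Path

    def verify_lfs_pointer(local_file: str, expected_oid: str, expected_size: int) -> bool:
        """Check a downloaded LFS object against the oid/size from its pointer file."""
        p = Path(local_file)
        if p.stat().st_size != expected_size:
            return False
        h = hashlib.sha256()
        with p.open("rb") as f:
            for chunk in iter(lambda: f.read(1 << 20), b""):
                h.update(chunk)
        return h.hexdigest() == expected_oid

    # Values copied from the pointer above; the local path assumes the file was LFS-pulled.
    print(verify_lfs_pointer(
        "lr_sweep/hnet_xl_code_lr_1e-4/model_final.pt",
        "5507a5a2456bffdc961775096b62d8f9bebb9dd301fc350b5878213252b348e7",
        3315165484,
    ))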
lr_sweep/hnet_xl_code_lr_1e-4/wandb/run-20260425_200722-d5usyud5/files/code/code_completion_exp/train_hnet/train.py ADDED
@@ -0,0 +1,284 @@
+ """
+ Training pipeline for the HNet model on the Code Completion task.
+
+ Configuration via Hydra + OmegaConf, logging to Trackio.
+ DDP support via Accelerate for multi-GPU training.
+
+ Usage:
+     # Basic run (single GPU)
+     python train.py
+
+     # Multi-GPU with Accelerate
+     accelerate launch train.py
+
+     # Multi-GPU with an explicit number of GPUs
+     accelerate launch --num_processes=4 train.py
+
+     # Override parameters from the CLI
+     python train.py training.lr=1e-4 training.epochs=5
+
+     # Select a different model config
+     python train.py model=hnet_small
+
+     # Multirun (sweep)
+     python train.py --multirun training.lr=1e-4,3e-4,1e-3
+
+     # Disable logging
+     python train.py tracking.enabled=false
+ """
+
+ import os
+ import math
+ from pathlib import Path
+
+ import torch
+ import hydra
+ from hydra.core.hydra_config import HydraConfig
+ from omegaconf import DictConfig, OmegaConf
+ from accelerate import Accelerator
+ from accelerate.utils import set_seed as accelerate_set_seed
+
+ # HNet imports
+ from hnet.load_utils import load_from_pretrained, load_from_config
+ from hnet.utils.tokenizers import ByteTokenizer
+ from hnet.utils.train import group_params
+
+ # Ensure repo root is on sys.path (needed when running from subdirectory)
+ import sys
+ sys.path.insert(0, str(Path(__file__).resolve().parents[2]))
+
+ # Shared training library
+ from training_lib.utils import log_message
+ from training_lib.checkpointing import save_checkpoint, load_checkpoint
+ from training_lib.schedulers import get_lr_scheduler
+ from training_lib.tracking import init_tracking, finish_tracking
+ from training_lib.hnet.train_loop import train_epoch
+ from training_lib.hnet.data import create_dataloaders
+
+
+ @hydra.main(version_base=None, config_path="configs", config_name="config")
+ def main(cfg: DictConfig):
+     """Main training entry point with DDP support via Accelerate."""
+
+     # === Accelerator Setup ===
+     mixed_precision = "bf16" if cfg.training.use_amp else "no"
+
+     accelerator = Accelerator(
+         mixed_precision=mixed_precision,
+         gradient_accumulation_steps=cfg.training.gradient_accumulation_steps,
+     )
+
+     # === Setup ===
+     accelerate_set_seed(cfg.seed)
+
+     if cfg.paths.output_dir is None:
+         cfg.paths.output_dir = HydraConfig.get().runtime.output_dir
+
+     OmegaConf.resolve(cfg)
+
+     log_message(
+         f"CUDA_VISIBLE_DEVICES: {os.environ.get('CUDA_VISIBLE_DEVICES', 'not set')}",
+         cfg,
+         accelerator,
+     )
+     log_message(f"Number of processes: {accelerator.num_processes}", cfg, accelerator)
+     log_message(f"Process index: {accelerator.process_index}", cfg, accelerator)
+     log_message(f"Mixed precision: {mixed_precision}", cfg, accelerator)
+
+     log_message("=" * 60, cfg, accelerator)
+     log_message(
+         "HNet Training Pipeline (Hydra + Trackio + Accelerate)", cfg, accelerator
+     )
+     log_message("=" * 60, cfg, accelerator)
+     log_message(f"Config:\n{OmegaConf.to_yaml(cfg)}", cfg, accelerator)
+
+     # === Trackio Init ===
+     init_tracking(cfg, accelerator)
+
+     # === Tokenizer ===
+     log_message("Initializing tokenizer...", cfg, accelerator)
+     tokenizer = ByteTokenizer()
+
+     # === Model ===
+     log_message("Loading model...", cfg, accelerator)
+     if cfg.model.checkpoint_path:
+         model = load_from_pretrained(
+             model_path=cfg.model.checkpoint_path,
+             model_config_path=cfg.model.config_path,
+         )
+         log_message(f"Loaded pretrained: {cfg.model.checkpoint_path}", cfg, accelerator)
+     else:
+         model = load_from_config(
+             model_config_path=cfg.model.config_path,
+             device="cpu",
+         )
+         model.init_weights()
+         log_message("Initialized from scratch", cfg, accelerator)
+
+     model.train()
+
+     # LR multipliers for the different stages (must be applied before prepare!)
+     lr_multiplier = list(cfg.training.lr_multiplier)
+     model.apply_lr_multiplier(lr_multiplier)
+     log_message(f"Applied LR multipliers: {lr_multiplier}", cfg, accelerator)
+
+     # Warmup for the Triton kernels
+     if cfg.training.warmup_model:
+         log_message("Warming up model...", cfg, accelerator)
+         model = model.to(accelerator.device)
+         model.warmup(verbose=accelerator.is_main_process)
+
+     # Log model info
+     total_params = sum(p.numel() for p in model.parameters())
+     trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
+     log_message(f"Total params: {total_params:,}", cfg, accelerator)
+     log_message(f"Trainable params: {trainable_params:,}", cfg, accelerator)
+
+     # === Data ===
+     log_message("Creating dataloaders...", cfg, accelerator)
+     dataloaders = create_dataloaders(cfg, tokenizer)
+
+     train_dataloader = dataloaders["train"]
+     val_dataloader = dataloaders.get("validation", None)
+
+     log_message(
+         f"Train dataset size: {len(train_dataloader.dataset)}", cfg, accelerator
+     )
+     log_message(
+         f"Train batches per epoch (before DDP split): {len(train_dataloader)}",
+         cfg,
+         accelerator,
+     )
+
+     if val_dataloader:
+         log_message(
+             f"Validation dataset size: {len(val_dataloader.dataset)}", cfg, accelerator
+         )
+         log_message(f"Validation batches: {len(val_dataloader)}", cfg, accelerator)
+     else:
+         log_message("No validation dataset found", cfg, accelerator)
+
+     # === Optimizer ===
+     log_message("Creating optimizer...", cfg, accelerator)
+     param_groups = group_params(model)
+
+     for group in param_groups:
+         if "lr" not in group:
+             group["lr"] = cfg.training.lr
+         else:
+             group["lr"] = cfg.training.lr * group.get("lr_multiplier", 1.0)
+         if "weight_decay" not in group:
+             group["weight_decay"] = cfg.training.weight_decay
+
+     optimizer = torch.optim.AdamW(
+         param_groups,
+         lr=cfg.training.lr,
+         betas=tuple(cfg.training.betas),
+         eps=cfg.training.eps,
+     )
+
+     # === Scheduler ===
+     steps_per_epoch = math.ceil(len(train_dataloader) / accelerator.num_processes)
+     total_steps = (
+         cfg.training.epochs
+         * steps_per_epoch
+         // cfg.training.gradient_accumulation_steps
+     )
+     scheduler = get_lr_scheduler(optimizer, cfg, total_steps)
+
+     log_message(
+         f"Total steps: {total_steps}, Steps per epoch: {steps_per_epoch}",
+         cfg,
+         accelerator,
+     )
+
+     # === Accelerate Prepare ===
+     log_message(
+         "Preparing model, optimizer, and dataloaders with Accelerate...",
+         cfg,
+         accelerator,
+     )
+
+     if val_dataloader is not None:
+         model, optimizer, train_dataloader, val_dataloader, scheduler = (
+             accelerator.prepare(
+                 model, optimizer, train_dataloader, val_dataloader, scheduler
+             )
+         )
+     else:
+         model, optimizer, train_dataloader, scheduler = accelerator.prepare(
+             model, optimizer, train_dataloader, scheduler
+         )
+
+     log_message(
+         f"Train batches per epoch (after DDP split): {len(train_dataloader)}",
+         cfg,
+         accelerator,
+     )
+
+     # === Resume ===
+     global_step = 0
+     start_epoch = 1
+
+     if cfg.training.resume and cfg.training.resume_checkpoint:
+         global_step, start_epoch = load_checkpoint(
+             model,
+             optimizer,
+             scheduler,
+             cfg.training.resume_checkpoint,
+             cfg,
+             accelerator,
+         )
+         start_epoch += 1
+
+     # === Training Loop ===
+     log_message("Starting training...", cfg, accelerator)
+
+     best_val_loss = float("inf")
+
+     try:
+         for epoch in range(start_epoch, cfg.training.epochs + 1):
+             log_message(f"\n{'=' * 60}", cfg, accelerator)
+             log_message(f"EPOCH {epoch}/{cfg.training.epochs}", cfg, accelerator)
+             log_message(f"{'=' * 60}", cfg, accelerator)
+
+             global_step, best_val_loss = train_epoch(
+                 model=model,
+                 dataloader=train_dataloader,
+                 optimizer=optimizer,
+                 scheduler=scheduler,
+                 cfg=cfg,
+                 epoch=epoch,
+                 global_step=global_step,
+                 accelerator=accelerator,
+                 val_dataloader=val_dataloader,
+                 best_val_loss=best_val_loss,
+             )
+
+             if cfg.logging.save_every_epoch:
+                 save_checkpoint(
+                     model, optimizer, scheduler, global_step, epoch, cfg, accelerator
+                 )
+
+     except KeyboardInterrupt:
+         log_message("Training interrupted by user", cfg, accelerator)
+         save_checkpoint(
+             model, optimizer, scheduler, global_step, epoch, cfg, accelerator
+         )
+
+     # === Final Save ===
+     log_message("\nTraining completed!", cfg, accelerator)
+
+     if accelerator.is_main_process:
+         final_model_path = Path(cfg.paths.output_dir) / "model_final.pt"
+         unwrapped_model = accelerator.unwrap_model(model)
+         torch.save(unwrapped_model.state_dict(), final_model_path)
+         log_message(f"Final model: {final_model_path}", cfg, accelerator)
+
+     accelerator.wait_for_everyone()
+     accelerator.end_training()
+     finish_tracking()
+
+
+ if __name__ == "__main__":
+     main()
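The script hands the schedule off to training_lib.schedulers.get_lr_scheduler, which is not part of this upload, so the exact curve is not shown here. A minimal sketch, under the assumption that lr_scheduler: wsd means a warmup-stable-decay shape driven by the config's warmup_ratio, decay_ratio and min_lr_ratio, built on torch.optim.lr_scheduler.LambdaLR:

    import torch
    from torch.optim.lr_scheduler import LambdaLR

    def wsd_lambda(total_steps: int, warmup_ratio: float = 0.1,
                   decay_ratio: float = 0.2, min_lr_ratio: float = 0.1):
        """Warmup-stable-decay LR multiplier; a sketch, not the training_lib code."""
        warmup_steps = int(total_steps * warmup_ratio)
        decay_steps = int(total_steps * decay_ratio)
        stable_end = total_steps - decay_steps

        def fn(step: int) -> float:
            if step < warmup_steps:                              # linear warmup
                return step / max(1, warmup_steps)
            if step < stable_end:                                # constant plateau
                return 1.0
            frac = (step - stable_end) / max(1, decay_steps)     # linear decay to min_lr_ratio
            return 1.0 - (1.0 - min_lr_ratio) * min(frac, 1.0)

        return fn

    # Example: base LR from this sweep run, scheduled over the step count logged for the 2e-4 run.
    opt = torch.optim.AdamW([torch.nn.Parameter(torch.zeros(1))], lr=1e-4)
    sched = LambdaLR(opt, lr_lambda=wsd_lambda(total_steps=29662))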
lr_sweep/hnet_xl_code_lr_1e-4/wandb/run-20260425_200722-d5usyud5/files/config.yaml ADDED
@@ -0,0 +1,167 @@
+ wandb_version: 1
+
+ _wandb:
+   desc: null
+   value:
+     code_path: code/code_completion_exp/train_hnet/train.py
+     python_version: 3.12.0
+     cli_version: 0.24.0
+     framework: huggingface
+     huggingface_version: 4.57.6
+     is_jupyter_run: false
+     is_kaggle_kernel: false
+     start_time: 1777147642
+     t:
+       1:
+       - 1
+       - 11
+       - 49
+       - 50
+       - 51
+       - 71
+       - 105
+       2:
+       - 1
+       - 11
+       - 49
+       - 50
+       - 51
+       - 71
+       - 105
+       3:
+       - 2
+       - 13
+       - 16
+       - 37
+       - 42
+       - 61
+       4: 3.12.0
+       5: 0.24.0
+       6: 4.57.6
+       13: linux-x86_64
+     e:
+       yd4im4gytbm7o9yud168kac4xfyaj2kg:
+         os: Linux-5.4.0-176-generic-x86_64-with-glibc2.35
+         python: CPython 3.12.0
+         started_at: '2026-04-25T20:07:22.253382Z'
+         args:
+         - tracking=wandb
+         - tracking.project=code-completion_lr-sweep
+         - tracking.run_name=hnet_xl_code_lr_1e-4
+         - training.lr=1e-4
+         - paths.output_dir=/workspace/byte-llms-code/outputs/lr_sweep/hnet_xl_code_lr_1e-4
+         - data.path=/workspace/byte-llms-code/code_completion_exp/datasets/data_V4_full
+         program: /workspace/byte-llms-code/code_completion_exp/train_hnet/train.py
+         code_path: code_completion_exp/train_hnet/train.py
+         code_path_local: train.py
+         git:
+           remote_url: https://github.com/naryst/byte-llms-code.git
+           commit: f111e13281aa0dc58e24302edab5b0d5c2024586
+         email: nikita@local.ru
+         root: /workspace/byte-llms-code/outputs/lr_sweep/hnet_xl_code_lr_1e-4
+         host: 7504e518d24a
+         executable: /venv/bytellm/bin/python
+         cpu_count: 64
+         cpu_count_logical: 128
+         gpu_type: NVIDIA H100 80GB HBM3
+         gpu_count: 4
+         disk:
+           /:
+             total: '265214230528'
+             used: '104071081984'
+         memory:
+           total: '1081679683584'
+         gpu_nvidia:
+         - name: NVIDIA H100 80GB HBM3
+           memory_total: '85520809984'
+           cuda_cores: 16896
+           architecture: Hopper
+           uuid: GPU-b60cdcab-2033-2009-41de-be646c953a20
+         - name: NVIDIA H100 80GB HBM3
+           memory_total: '85520809984'
+           cuda_cores: 16896
+           architecture: Hopper
+           uuid: GPU-9982b420-4520-4238-c378-ec5a46015474
+         - name: NVIDIA H100 80GB HBM3
+           memory_total: '85520809984'
+           cuda_cores: 16896
+           architecture: Hopper
+           uuid: GPU-e26ebaac-aaa6-3eed-17ab-a3dce303a76f
+         - name: NVIDIA H100 80GB HBM3
+           memory_total: '85520809984'
+           cuda_cores: 16896
+           architecture: Hopper
+           uuid: GPU-9dfc6dba-0be6-4a10-1027-336cc0e65134
+         cuda_version: '12.2'
+     writer_id: yd4im4gytbm7o9yud168kac4xfyaj2kg
+ model:
+   desc: null
+   value:
+     config_path: /workspace/byte-llms-code/hnet_project/configs/hnet_2stage_XL_code.json
+     checkpoint_path: /workspace/byte-llms-code/hnet_project/checkpoints/hnet_2stage_XL_code.pt
+ training:
+   desc: null
+   value:
+     epochs: 1
+     batch_size: 4
+     eval_batch_size: 24
+     gradient_accumulation_steps: 4
+     lr: 0.0001
+     weight_decay: 0.1
+     betas:
+     - 0.9
+     - 0.95
+     eps: 1.0e-08
+     lr_scheduler: wsd
+     warmup_ratio: 0.1
+     decay_ratio: 0.2
+     warmup_steps: 100
+     min_lr_ratio: 0.1
+     lr_multiplier:
+     - 2.0
+     - 1.5
+     - 1.0
+     load_balancing_weight: 0.01
+     load_balancing_N: 4.0
+     max_grad_norm: 1.0
+     use_amp: true
+     resume: false
+     resume_checkpoint: null
+     warmup_model: true
+ data:
+   desc: null
+   value:
+     path: /workspace/byte-llms-code/code_completion_exp/datasets/data_V4_full
+     max_context_len: 4096
+     max_target_len: 256
+     num_workers: 0
+     pin_memory: true
+     max_train_samples: null
+     max_val_samples: 2000
+ logging:
+   desc: null
+   value:
+     log_interval: 10
+     save_interval: 0
+     eval_interval: 2000
+     save_every_epoch: false
+ tracking:
+   desc: null
+   value:
+     enabled: true
+     backend: wandb
+     project: code-completion_lr-sweep
+     run_name: hnet_xl_code_lr_1e-4
+     entity: null
+     base_url: https://wandb.platun0v.ru
+     local_dir: /workspace/byte-llms-code/outputs/lr_sweep/hnet_xl_code_lr_1e-4
+ paths:
+   desc: null
+   value:
+     output_dir: /workspace/byte-llms-code/outputs/lr_sweep/hnet_xl_code_lr_1e-4
+ seed:
+   desc: null
+   value: 42
+ device:
+   desc: null
+   value: cuda
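W&B exports each top-level config section wrapped in desc/value pairs, so this file cannot be fed straight back to the trainer. A small sketch that strips the wrapper and recovers the plain Hydra-style config (the path is this run's file as stored in the repo):

    import yaml

    def unwrap_wandb_config(path: str) -> dict:
        """Drop W&B's desc/value wrappers and its _wandb bookkeeping section."""
        with open(path) as f:
            raw = yaml.safe_load(f)
        out = {}
        for key, entry in raw.items():
            if key in ("wandb_version", "_wandb"):
                continue
            out[key] = entry["value"] if isinstance(entry, dict) and "value" in entry else entry
        return out

    cfg = unwrap_wandb_config(
        "lr_sweep/hnet_xl_code_lr_1e-4/wandb/run-20260425_200722-d5usyud5/files/config.yaml"
    )
    print(cfg["training"]["lr"])  # 0.0001 for this run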
lr_sweep/hnet_xl_code_lr_1e-4/wandb/run-20260425_200722-d5usyud5/files/output.log ADDED
The diff for this file is too large to render. See raw diff
 
lr_sweep/hnet_xl_code_lr_1e-4/wandb/run-20260425_200722-d5usyud5/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
+ {"_runtime": 7158, "train/epoch": 1, "train/step_time": 0.7092468976974488, "train/chunk_len_stage0": 2.8179939558054445, "train/chunk_len_stage1": 2.354251368177087, "_step": 9880, "_timestamp": 1777154798.6939964, "train/loss": 0.19837140291929245, "train/lm_loss": 0.22973915934562683, "train/lb_loss": 1.1289136409759521, "train/hard_boundary_ratio_stage0": 0.35757633658743543, "train/hard_boundary_ratio_stage1": 0.42741741229545455, "train/soft_boundary_ratio_stage1": 0.40072747577085055, "train/loss_avg": 0.29260910248965016, "train/lr": 1e-05, "train/soft_boundary_ratio_stage0": 0.3544859938887565, "val/perplexity": 1.321679014796039, "val/lm_loss": 0.2812648575220789, "val/lb_loss": 1.0910536590076627, "best/step": 8000, "val/time": 4.739992141723633, "best/val_perplexity": 1.321679014796039, "val/loss": 0.29217539443856194, "best/val_loss": 0.29217539443856194, "epoch/chunk_len_stage1": 2.3543435574383285, "epoch/soft_boundary_ratio_stage0": 0.35449292086488965, "epoch/soft_boundary_ratio_stage1": 0.4007127851565031, "epoch/lb_loss": 1.1061990606274803, "epoch/lm_loss": 0.28240673256245047, "epoch/chunk_len_stage0": 2.8179205942465053, "epoch/hard_boundary_ratio_stage0": 0.35758569070574475, "epoch/loss": 0.29261703164552394, "epoch/time": 7105.889644384384, "epoch/hard_boundary_ratio_stage1": 0.42740058972624}
lr_sweep/hnet_xl_code_lr_1e-4/wandb/run-20260425_200722-d5usyud5/run-d5usyud5.wandb ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0f8b8e6b5350654f7b0216247b78604c928fa1b5d7dab31aa40755bb6c3f5e02
+ size 3080041
lr_sweep/hnet_xl_code_lr_2e-4/.hydra/config.yaml ADDED
@@ -0,0 +1,55 @@
+ model:
+   config_path: ${oc.env:PROJECT_ROOT}/hnet_project/configs/hnet_2stage_XL_code.json
+   checkpoint_path: ${oc.env:PROJECT_ROOT}/hnet_project/checkpoints/hnet_2stage_XL_code.pt
+ training:
+   epochs: 1
+   batch_size: 4
+   eval_batch_size: 24
+   gradient_accumulation_steps: 4
+   lr: 0.0002
+   weight_decay: 0.1
+   betas:
+   - 0.9
+   - 0.95
+   eps: 1.0e-08
+   lr_scheduler: wsd
+   warmup_ratio: 0.1
+   decay_ratio: 0.2
+   warmup_steps: 100
+   min_lr_ratio: 0.1
+   lr_multiplier:
+   - 2.0
+   - 1.5
+   - 1.0
+   load_balancing_weight: 0.01
+   load_balancing_N: 4.0
+   max_grad_norm: 1.0
+   use_amp: true
+   resume: false
+   resume_checkpoint: null
+   warmup_model: true
+ data:
+   path: /workspace/byte-llms-code/code_completion_exp/datasets/data_V4_full
+   max_context_len: 4096
+   max_target_len: 256
+   num_workers: 0
+   pin_memory: true
+   max_train_samples: null
+   max_val_samples: 2000
+ logging:
+   log_interval: 10
+   save_interval: 0
+   eval_interval: 2000
+   save_every_epoch: false
+ tracking:
+   enabled: true
+   backend: wandb
+   project: code-completion_lr-sweep
+   run_name: hnet_xl_code_lr_2e-4
+   entity: null
+   base_url: https://wandb.platun0v.ru
+   local_dir: ${paths.output_dir}
+ paths:
+   output_dir: /workspace/byte-llms-code/outputs/lr_sweep/hnet_xl_code_lr_2e-4
+ seed: 42
+ device: cuda
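Unlike the W&B export above, this .hydra/config.yaml still contains unresolved interpolations (${oc.env:PROJECT_ROOT}, ${paths.output_dir}). A minimal sketch of resolving it outside of Hydra with OmegaConf; the PROJECT_ROOT value is taken from the resolved paths seen elsewhere in this commit and is otherwise an assumption:

    import os
    from omegaconf import OmegaConf

    os.environ.setdefault("PROJECT_ROOT", "/workspace/byte-llms-code")  # assumed value
    cfg = OmegaConf.load("lr_sweep/hnet_xl_code_lr_2e-4/.hydra/config.yaml")
    OmegaConf.resolve(cfg)  # expands oc.env and ${paths.output_dir} references in place

    print(cfg.model.config_path)   # .../hnet_project/configs/hnet_2stage_XL_code.json
    print(cfg.tracking.local_dir)  # same as paths.output_dir after resolution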
lr_sweep/hnet_xl_code_lr_2e-4/.hydra/hydra.yaml ADDED
@@ -0,0 +1,166 @@
1
+ hydra:
2
+ run:
3
+ dir: ${paths.output_dir}
4
+ sweep:
5
+ dir: outputs/multirun/${now:%Y-%m-%d}/${now:%H-%M-%S}
6
+ subdir: ${hydra.job.num}
7
+ launcher:
8
+ _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
9
+ sweeper:
10
+ _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
11
+ max_batch_size: null
12
+ params: null
13
+ help:
14
+ app_name: ${hydra.job.name}
15
+ header: '${hydra.help.app_name} is powered by Hydra.
16
+
17
+ '
18
+ footer: 'Powered by Hydra (https://hydra.cc)
19
+
20
+ Use --hydra-help to view Hydra specific help
21
+
22
+ '
23
+ template: '${hydra.help.header}
24
+
25
+ == Configuration groups ==
26
+
27
+ Compose your configuration from those groups (group=option)
28
+
29
+
30
+ $APP_CONFIG_GROUPS
31
+
32
+
33
+ == Config ==
34
+
35
+ Override anything in the config (foo.bar=value)
36
+
37
+
38
+ $CONFIG
39
+
40
+
41
+ ${hydra.help.footer}
42
+
43
+ '
44
+ hydra_help:
45
+ template: 'Hydra (${hydra.runtime.version})
46
+
47
+ See https://hydra.cc for more info.
48
+
49
+
50
+ == Flags ==
51
+
52
+ $FLAGS_HELP
53
+
54
+
55
+ == Configuration groups ==
56
+
57
+ Compose your configuration from those groups (For example, append hydra/job_logging=disabled
58
+ to command line)
59
+
60
+
61
+ $HYDRA_CONFIG_GROUPS
62
+
63
+
64
+ Use ''--cfg hydra'' to Show the Hydra config.
65
+
66
+ '
67
+ hydra_help: ???
68
+ hydra_logging:
69
+ version: 1
70
+ formatters:
71
+ simple:
72
+ format: '[%(asctime)s][HYDRA] %(message)s'
73
+ handlers:
74
+ console:
75
+ class: logging.StreamHandler
76
+ formatter: simple
77
+ stream: ext://sys.stdout
78
+ root:
79
+ level: INFO
80
+ handlers:
81
+ - console
82
+ loggers:
83
+ logging_example:
84
+ level: DEBUG
85
+ disable_existing_loggers: false
86
+ job_logging:
87
+ version: 1
88
+ formatters:
89
+ simple:
90
+ format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
91
+ handlers:
92
+ console:
93
+ class: logging.StreamHandler
94
+ formatter: simple
95
+ stream: ext://sys.stdout
96
+ file:
97
+ class: logging.FileHandler
98
+ formatter: simple
99
+ filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log
100
+ root:
101
+ level: INFO
102
+ handlers:
103
+ - console
104
+ - file
105
+ disable_existing_loggers: false
106
+ env: {}
107
+ mode: RUN
108
+ searchpath: []
109
+ callbacks: {}
110
+ output_subdir: .hydra
111
+ overrides:
112
+ hydra:
113
+ - hydra.mode=RUN
114
+ task:
115
+ - tracking=wandb
116
+ - tracking.project=code-completion_lr-sweep
117
+ - tracking.run_name=hnet_xl_code_lr_2e-4
118
+ - training.lr=2e-4
119
+ - paths.output_dir=/workspace/byte-llms-code/outputs/lr_sweep/hnet_xl_code_lr_2e-4
120
+ - data.path=/workspace/byte-llms-code/code_completion_exp/datasets/data_V4_full
121
+ job:
122
+ name: train
123
+ chdir: false
124
+ override_dirname: data.path=/workspace/byte-llms-code/code_completion_exp/datasets/data_V4_full,paths.output_dir=/workspace/byte-llms-code/outputs/lr_sweep/hnet_xl_code_lr_2e-4,tracking.project=code-completion_lr-sweep,tracking.run_name=hnet_xl_code_lr_2e-4,tracking=wandb,training.lr=2e-4
125
+ id: ???
126
+ num: ???
127
+ config_name: config
128
+ env_set: {}
129
+ env_copy: []
130
+ config:
131
+ override_dirname:
132
+ kv_sep: '='
133
+ item_sep: ','
134
+ exclude_keys: []
135
+ runtime:
136
+ version: 1.3.2
137
+ version_base: '1.3'
138
+ cwd: /workspace/byte-llms-code/code_completion_exp/train_hnet
139
+ config_sources:
140
+ - path: hydra.conf
141
+ schema: pkg
142
+ provider: hydra
143
+ - path: /workspace/byte-llms-code/code_completion_exp/train_hnet/configs
144
+ schema: file
145
+ provider: main
146
+ - path: ''
147
+ schema: structured
148
+ provider: schema
149
+ output_dir: /workspace/byte-llms-code/outputs/lr_sweep/hnet_xl_code_lr_2e-4
150
+ choices:
151
+ paths: default
152
+ tracking: wandb
153
+ logging: default
154
+ data: default
155
+ training: default
156
+ model: hnet_xl_code
157
+ hydra/env: default
158
+ hydra/callbacks: null
159
+ hydra/job_logging: default
160
+ hydra/hydra_logging: default
161
+ hydra/hydra_help: default
162
+ hydra/help: default
163
+ hydra/sweeper: basic
164
+ hydra/launcher: basic
165
+ hydra/output: default
166
+ verbose: false
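The hydra.job.override_dirname recorded above is simply the task overrides joined with the configured kv_sep/item_sep after sorting, which is Hydra's default behavior as far as this file shows. A tiny sketch reproducing the recorded value for this run:

    overrides = [
        "tracking=wandb",
        "tracking.project=code-completion_lr-sweep",
        "tracking.run_name=hnet_xl_code_lr_2e-4",
        "training.lr=2e-4",
        "paths.output_dir=/workspace/byte-llms-code/outputs/lr_sweep/hnet_xl_code_lr_2e-4",
        "data.path=/workspace/byte-llms-code/code_completion_exp/datasets/data_V4_full",
    ]
    override_dirname = ",".join(sorted(overrides))  # matches hydra.job.override_dirname above
    print(override_dirname)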
lr_sweep/hnet_xl_code_lr_2e-4/.hydra/overrides.yaml ADDED
@@ -0,0 +1,6 @@
+ - tracking=wandb
+ - tracking.project=code-completion_lr-sweep
+ - tracking.run_name=hnet_xl_code_lr_2e-4
+ - training.lr=2e-4
+ - paths.output_dir=/workspace/byte-llms-code/outputs/lr_sweep/hnet_xl_code_lr_2e-4
+ - data.path=/workspace/byte-llms-code/code_completion_exp/datasets/data_V4_full
lr_sweep/hnet_xl_code_lr_2e-4/wandb/debug-internal.log ADDED
@@ -0,0 +1,15 @@
1
+ {"time":"2026-04-25T22:20:12.017648642Z","level":"INFO","msg":"stream: starting","core version":"0.24.0"}
2
+ {"time":"2026-04-25T22:20:12.401332467Z","level":"INFO","msg":"stream: created new stream","id":"khn25dwv"}
3
+ {"time":"2026-04-25T22:20:12.401402455Z","level":"INFO","msg":"handler: started","stream_id":"khn25dwv"}
4
+ {"time":"2026-04-25T22:20:12.401496869Z","level":"INFO","msg":"stream: started","id":"khn25dwv"}
5
+ {"time":"2026-04-25T22:20:12.401507601Z","level":"INFO","msg":"writer: started","stream_id":"khn25dwv"}
6
+ {"time":"2026-04-25T22:20:12.40151694Z","level":"INFO","msg":"sender: started","stream_id":"khn25dwv"}
7
+ {"time":"2026-04-25T22:20:12.5289538Z","level":"ERROR","msg":"git repo not found","error":"repository does not exist"}
8
+ {"time":"2026-04-25T23:20:22.74400376Z","level":"ERROR","msg":"api: HTTP error","status":403,"method":"POST","url":"https://wandb.platun0v.ru/files/nikita/code-completion_lr-sweep/khn25dwv/file_stream"}
9
+ {"time":"2026-04-25T23:20:22.744078123Z","level":"ERROR+4","msg":"filestream: fatal error: filestream: failed to upload: 403 Forbidden url=https://wandb.platun0v.ru/files/nikita/code-completion_lr-sweep/khn25dwv/file_stream: "}
10
+ {"time":"2026-04-26T00:19:30.370077692Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
11
+ {"time":"2026-04-26T00:19:30.370863672Z","level":"INFO","msg":"handler: operation stats","stats":{}}
12
+ {"time":"2026-04-26T00:19:30.373711643Z","level":"INFO","msg":"stream: closing","id":"khn25dwv"}
13
+ {"time":"2026-04-26T00:19:30.373733071Z","level":"INFO","msg":"handler: closed","stream_id":"khn25dwv"}
14
+ {"time":"2026-04-26T00:19:30.37382892Z","level":"INFO","msg":"sender: closed","stream_id":"khn25dwv"}
15
+ {"time":"2026-04-26T00:19:30.373836548Z","level":"INFO","msg":"stream: closed","id":"khn25dwv"}
lr_sweep/hnet_xl_code_lr_2e-4/wandb/debug.log ADDED
@@ -0,0 +1,24 @@
1
+ 2026-04-25 22:20:11,719 INFO MainThread:198705 [wandb_setup.py:_flush():81] Current SDK version is 0.24.0
2
+ 2026-04-25 22:20:11,719 INFO MainThread:198705 [wandb_setup.py:_flush():81] Configure stats pid to 198705
3
+ 2026-04-25 22:20:11,719 INFO MainThread:198705 [wandb_setup.py:_flush():81] Loading settings from environment variables
4
+ 2026-04-25 22:20:11,719 INFO MainThread:198705 [wandb_init.py:setup_run_log_directory():717] Logging user logs to /workspace/byte-llms-code/outputs/lr_sweep/hnet_xl_code_lr_2e-4/wandb/run-20260425_222011-khn25dwv/logs/debug.log
5
+ 2026-04-25 22:20:11,719 INFO MainThread:198705 [wandb_init.py:setup_run_log_directory():718] Logging internal logs to /workspace/byte-llms-code/outputs/lr_sweep/hnet_xl_code_lr_2e-4/wandb/run-20260425_222011-khn25dwv/logs/debug-internal.log
6
+ 2026-04-25 22:20:11,719 INFO MainThread:198705 [wandb_init.py:init():844] calling init triggers
7
+ 2026-04-25 22:20:11,719 INFO MainThread:198705 [wandb_init.py:init():849] wandb.init called with sweep_config: {}
8
+ config: {'model': {'config_path': '/workspace/byte-llms-code/hnet_project/configs/hnet_2stage_XL_code.json', 'checkpoint_path': '/workspace/byte-llms-code/hnet_project/checkpoints/hnet_2stage_XL_code.pt'}, 'training': {'epochs': 1, 'batch_size': 4, 'eval_batch_size': 24, 'gradient_accumulation_steps': 4, 'lr': 0.0002, 'weight_decay': 0.1, 'betas': [0.9, 0.95], 'eps': 1e-08, 'lr_scheduler': 'wsd', 'warmup_ratio': 0.1, 'decay_ratio': 0.2, 'warmup_steps': 100, 'min_lr_ratio': 0.1, 'lr_multiplier': [2.0, 1.5, 1.0], 'load_balancing_weight': 0.01, 'load_balancing_N': 4.0, 'max_grad_norm': 1.0, 'use_amp': True, 'resume': False, 'resume_checkpoint': None, 'warmup_model': True}, 'data': {'path': '/workspace/byte-llms-code/code_completion_exp/datasets/data_V4_full', 'max_context_len': 4096, 'max_target_len': 256, 'num_workers': 0, 'pin_memory': True, 'max_train_samples': None, 'max_val_samples': 2000}, 'logging': {'log_interval': 10, 'save_interval': 0, 'eval_interval': 2000, 'save_every_epoch': False}, 'tracking': {'enabled': True, 'backend': 'wandb', 'project': 'code-completion_lr-sweep', 'run_name': 'hnet_xl_code_lr_2e-4', 'entity': None, 'base_url': 'https://wandb.platun0v.ru', 'local_dir': '/workspace/byte-llms-code/outputs/lr_sweep/hnet_xl_code_lr_2e-4'}, 'paths': {'output_dir': '/workspace/byte-llms-code/outputs/lr_sweep/hnet_xl_code_lr_2e-4'}, 'seed': 42, 'device': 'cuda', '_wandb': {'code_path': 'code/code_completion_exp/train_hnet/train.py'}}
9
+ 2026-04-25 22:20:11,719 INFO MainThread:198705 [wandb_init.py:init():892] starting backend
10
+ 2026-04-25 22:20:11,992 INFO MainThread:198705 [wandb_init.py:init():895] sending inform_init request
11
+ 2026-04-25 22:20:12,016 INFO MainThread:198705 [wandb_init.py:init():903] backend started and connected
12
+ 2026-04-25 22:20:12,019 INFO MainThread:198705 [wandb_init.py:init():973] updated telemetry
13
+ 2026-04-25 22:20:12,040 INFO MainThread:198705 [wandb_init.py:init():997] communicating run to backend with 90.0 second timeout
14
+ 2026-04-25 22:20:12,528 INFO MainThread:198705 [wandb_init.py:init():1044] starting run threads in backend
15
+ 2026-04-25 22:20:12,685 INFO MainThread:198705 [wandb_run.py:_console_start():2529] atexit reg
16
+ 2026-04-25 22:20:12,685 INFO MainThread:198705 [wandb_run.py:_redirect():2377] redirect: wrap_raw
17
+ 2026-04-25 22:20:12,685 INFO MainThread:198705 [wandb_run.py:_redirect():2446] Wrapping output streams.
18
+ 2026-04-25 22:20:12,686 INFO MainThread:198705 [wandb_run.py:_redirect():2469] Redirects installed.
19
+ 2026-04-25 22:20:12,688 INFO MainThread:198705 [wandb_init.py:init():1084] run started, returning control to user process
20
+ 2026-04-26 00:19:29,581 INFO MainThread:198705 [wandb_run.py:_finish():2295] finishing run nikita/code-completion_lr-sweep/khn25dwv
21
+ 2026-04-26 00:19:29,582 INFO MainThread:198705 [wandb_run.py:_atexit_cleanup():2494] got exitcode: 0
22
+ 2026-04-26 00:19:29,582 INFO MainThread:198705 [wandb_run.py:_restore():2476] restore
23
+ 2026-04-26 00:19:29,582 INFO MainThread:198705 [wandb_run.py:_restore():2482] restore done
24
+ 2026-04-26 00:19:30,373 INFO MainThread:198705 [wandb_run.py:_footer_sync_info():3870] logging synced files
lr_sweep/hnet_xl_code_lr_2e-4/wandb/run-20260425_220653-ln6tfunh/files/code/code_completion_exp/train_hnet/train.py ADDED
@@ -0,0 +1,284 @@
+ """
+ Training pipeline for the HNet model on the Code Completion task.
+
+ Configuration via Hydra + OmegaConf, logging to Trackio.
+ DDP support via Accelerate for multi-GPU training.
+
+ Usage:
+     # Basic run (single GPU)
+     python train.py
+
+     # Multi-GPU with Accelerate
+     accelerate launch train.py
+
+     # Multi-GPU with an explicit number of GPUs
+     accelerate launch --num_processes=4 train.py
+
+     # Override parameters from the CLI
+     python train.py training.lr=1e-4 training.epochs=5
+
+     # Select a different model config
+     python train.py model=hnet_small
+
+     # Multirun (sweep)
+     python train.py --multirun training.lr=1e-4,3e-4,1e-3
+
+     # Disable logging
+     python train.py tracking.enabled=false
+ """
+
+ import os
+ import math
+ from pathlib import Path
+
+ import torch
+ import hydra
+ from hydra.core.hydra_config import HydraConfig
+ from omegaconf import DictConfig, OmegaConf
+ from accelerate import Accelerator
+ from accelerate.utils import set_seed as accelerate_set_seed
+
+ # HNet imports
+ from hnet.load_utils import load_from_pretrained, load_from_config
+ from hnet.utils.tokenizers import ByteTokenizer
+ from hnet.utils.train import group_params
+
+ # Ensure repo root is on sys.path (needed when running from subdirectory)
+ import sys
+ sys.path.insert(0, str(Path(__file__).resolve().parents[2]))
+
+ # Shared training library
+ from training_lib.utils import log_message
+ from training_lib.checkpointing import save_checkpoint, load_checkpoint
+ from training_lib.schedulers import get_lr_scheduler
+ from training_lib.tracking import init_tracking, finish_tracking
+ from training_lib.hnet.train_loop import train_epoch
+ from training_lib.hnet.data import create_dataloaders
+
+
+ @hydra.main(version_base=None, config_path="configs", config_name="config")
+ def main(cfg: DictConfig):
+     """Main training entry point with DDP support via Accelerate."""
+
+     # === Accelerator Setup ===
+     mixed_precision = "bf16" if cfg.training.use_amp else "no"
+
+     accelerator = Accelerator(
+         mixed_precision=mixed_precision,
+         gradient_accumulation_steps=cfg.training.gradient_accumulation_steps,
+     )
+
+     # === Setup ===
+     accelerate_set_seed(cfg.seed)
+
+     if cfg.paths.output_dir is None:
+         cfg.paths.output_dir = HydraConfig.get().runtime.output_dir
+
+     OmegaConf.resolve(cfg)
+
+     log_message(
+         f"CUDA_VISIBLE_DEVICES: {os.environ.get('CUDA_VISIBLE_DEVICES', 'not set')}",
+         cfg,
+         accelerator,
+     )
+     log_message(f"Number of processes: {accelerator.num_processes}", cfg, accelerator)
+     log_message(f"Process index: {accelerator.process_index}", cfg, accelerator)
+     log_message(f"Mixed precision: {mixed_precision}", cfg, accelerator)
+
+     log_message("=" * 60, cfg, accelerator)
+     log_message(
+         "HNet Training Pipeline (Hydra + Trackio + Accelerate)", cfg, accelerator
+     )
+     log_message("=" * 60, cfg, accelerator)
+     log_message(f"Config:\n{OmegaConf.to_yaml(cfg)}", cfg, accelerator)
+
+     # === Trackio Init ===
+     init_tracking(cfg, accelerator)
+
+     # === Tokenizer ===
+     log_message("Initializing tokenizer...", cfg, accelerator)
+     tokenizer = ByteTokenizer()
+
+     # === Model ===
+     log_message("Loading model...", cfg, accelerator)
+     if cfg.model.checkpoint_path:
+         model = load_from_pretrained(
+             model_path=cfg.model.checkpoint_path,
+             model_config_path=cfg.model.config_path,
+         )
+         log_message(f"Loaded pretrained: {cfg.model.checkpoint_path}", cfg, accelerator)
+     else:
+         model = load_from_config(
+             model_config_path=cfg.model.config_path,
+             device="cpu",
+         )
+         model.init_weights()
+         log_message("Initialized from scratch", cfg, accelerator)
+
+     model.train()
+
+     # LR multipliers for the different stages (must be applied before prepare!)
+     lr_multiplier = list(cfg.training.lr_multiplier)
+     model.apply_lr_multiplier(lr_multiplier)
+     log_message(f"Applied LR multipliers: {lr_multiplier}", cfg, accelerator)
+
+     # Warmup for the Triton kernels
+     if cfg.training.warmup_model:
+         log_message("Warming up model...", cfg, accelerator)
+         model = model.to(accelerator.device)
+         model.warmup(verbose=accelerator.is_main_process)
+
+     # Log model info
+     total_params = sum(p.numel() for p in model.parameters())
+     trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
+     log_message(f"Total params: {total_params:,}", cfg, accelerator)
+     log_message(f"Trainable params: {trainable_params:,}", cfg, accelerator)
+
+     # === Data ===
+     log_message("Creating dataloaders...", cfg, accelerator)
+     dataloaders = create_dataloaders(cfg, tokenizer)
+
+     train_dataloader = dataloaders["train"]
+     val_dataloader = dataloaders.get("validation", None)
+
+     log_message(
+         f"Train dataset size: {len(train_dataloader.dataset)}", cfg, accelerator
+     )
+     log_message(
+         f"Train batches per epoch (before DDP split): {len(train_dataloader)}",
+         cfg,
+         accelerator,
+     )
+
+     if val_dataloader:
+         log_message(
+             f"Validation dataset size: {len(val_dataloader.dataset)}", cfg, accelerator
+         )
+         log_message(f"Validation batches: {len(val_dataloader)}", cfg, accelerator)
+     else:
+         log_message("No validation dataset found", cfg, accelerator)
+
+     # === Optimizer ===
+     log_message("Creating optimizer...", cfg, accelerator)
+     param_groups = group_params(model)
+
+     for group in param_groups:
+         if "lr" not in group:
+             group["lr"] = cfg.training.lr
+         else:
+             group["lr"] = cfg.training.lr * group.get("lr_multiplier", 1.0)
+         if "weight_decay" not in group:
+             group["weight_decay"] = cfg.training.weight_decay
+
+     optimizer = torch.optim.AdamW(
+         param_groups,
+         lr=cfg.training.lr,
+         betas=tuple(cfg.training.betas),
+         eps=cfg.training.eps,
+     )
+
+     # === Scheduler ===
+     steps_per_epoch = math.ceil(len(train_dataloader) / accelerator.num_processes)
+     total_steps = (
+         cfg.training.epochs
+         * steps_per_epoch
+         // cfg.training.gradient_accumulation_steps
+     )
+     scheduler = get_lr_scheduler(optimizer, cfg, total_steps)
+
+     log_message(
+         f"Total steps: {total_steps}, Steps per epoch: {steps_per_epoch}",
+         cfg,
+         accelerator,
+     )
+
+     # === Accelerate Prepare ===
+     log_message(
+         "Preparing model, optimizer, and dataloaders with Accelerate...",
+         cfg,
+         accelerator,
+     )
+
+     if val_dataloader is not None:
+         model, optimizer, train_dataloader, val_dataloader, scheduler = (
+             accelerator.prepare(
+                 model, optimizer, train_dataloader, val_dataloader, scheduler
+             )
+         )
+     else:
+         model, optimizer, train_dataloader, scheduler = accelerator.prepare(
+             model, optimizer, train_dataloader, scheduler
+         )
+
+     log_message(
+         f"Train batches per epoch (after DDP split): {len(train_dataloader)}",
+         cfg,
+         accelerator,
+     )
+
+     # === Resume ===
+     global_step = 0
+     start_epoch = 1
+
+     if cfg.training.resume and cfg.training.resume_checkpoint:
+         global_step, start_epoch = load_checkpoint(
+             model,
+             optimizer,
+             scheduler,
+             cfg.training.resume_checkpoint,
+             cfg,
+             accelerator,
+         )
+         start_epoch += 1
+
+     # === Training Loop ===
+     log_message("Starting training...", cfg, accelerator)
+
+     best_val_loss = float("inf")
+
+     try:
+         for epoch in range(start_epoch, cfg.training.epochs + 1):
+             log_message(f"\n{'=' * 60}", cfg, accelerator)
+             log_message(f"EPOCH {epoch}/{cfg.training.epochs}", cfg, accelerator)
+             log_message(f"{'=' * 60}", cfg, accelerator)
+
+             global_step, best_val_loss = train_epoch(
+                 model=model,
+                 dataloader=train_dataloader,
+                 optimizer=optimizer,
+                 scheduler=scheduler,
+                 cfg=cfg,
+                 epoch=epoch,
+                 global_step=global_step,
+                 accelerator=accelerator,
+                 val_dataloader=val_dataloader,
+                 best_val_loss=best_val_loss,
+             )
+
+             if cfg.logging.save_every_epoch:
+                 save_checkpoint(
+                     model, optimizer, scheduler, global_step, epoch, cfg, accelerator
+                 )
+
+     except KeyboardInterrupt:
+         log_message("Training interrupted by user", cfg, accelerator)
+         save_checkpoint(
+             model, optimizer, scheduler, global_step, epoch, cfg, accelerator
+         )
+
+     # === Final Save ===
+     log_message("\nTraining completed!", cfg, accelerator)
+
+     if accelerator.is_main_process:
+         final_model_path = Path(cfg.paths.output_dir) / "model_final.pt"
+         unwrapped_model = accelerator.unwrap_model(model)
+         torch.save(unwrapped_model.state_dict(), final_model_path)
+         log_message(f"Final model: {final_model_path}", cfg, accelerator)
+
+     accelerator.wait_for_everyone()
+     accelerator.end_training()
+     finish_tracking()
+
+
+ if __name__ == "__main__":
+     main()
lr_sweep/hnet_xl_code_lr_2e-4/wandb/run-20260425_220653-ln6tfunh/files/output.log ADDED
@@ -0,0 +1,76 @@
1
+ [2026-04-25 22:06:54] Initializing tokenizer...
2
+ [2026-04-25 22:06:54] Loading model...
3
+ [2026-04-25 22:06:58] Loaded pretrained: /workspace/byte-llms-code/hnet_project/checkpoints/hnet_2stage_XL_code.pt
4
+ [2026-04-25 22:06:58] Applied LR multipliers: [2.0, 1.5, 1.0]
5
+ [2026-04-25 22:06:58] Warming up model...
6
+ [WARMUP] Starting warmup (compiling Triton kernels)...
7
+ [WARMUP] Forward: 17.363s, Backward: 26.562s
8
+ [WARMUP] Warmup complete. Subsequent passes will be fast.
9
+ [2026-04-25 22:07:42] Total params: 1,654,090,112
10
+ [2026-04-25 22:07:42] Trainable params: 1,654,090,112
11
+ [2026-04-25 22:07:42] Creating dataloaders...
12
+ [2026-04-25 22:07:42] Train dataset size: 316397
13
+ [2026-04-25 22:07:42] Train batches per epoch (before DDP split): 79100
14
+ [2026-04-25 22:07:42] Validation dataset size: 35098
15
+ [2026-04-25 22:07:42] Validation batches: 1463
16
+ [2026-04-25 22:07:42] Creating optimizer...
17
+ [2026-04-25 22:07:42] Total steps: 29662, Steps per epoch: 39550
18
+ [2026-04-25 22:07:42] Preparing model, optimizer, and dataloaders with Accelerate...
19
+ [2026-04-25 22:07:43] Train batches per epoch (after DDP split): 39550
20
+ [2026-04-25 22:07:43] Starting training...
21
+ [2026-04-25 22:07:43]
22
+ ============================================================
23
+ [2026-04-25 22:07:43] EPOCH 1/3
24
+ [2026-04-25 22:07:43] ============================================================
25
+ [2026-04-25 22:08:13] Epoch 1 | Step 10 | Loss: 0.6143 | LM: 0.5857 | LB: 1.1576 | CL0: 2.9 | CL1: 2.1 | HR0: 0.352/SR0: 0.351 | HR1: 0.476/SR1: 0.455 | LR: 2.12e-05
26
+ [2026-04-25 22:08:20] Epoch 1 | Step 20 | Loss: 0.5841 | LM: 0.5756 | LB: 1.1555 | CL0: 2.9 | CL1: 2.1 | HR0: 0.352/SR0: 0.351 | HR1: 0.475/SR1: 0.455 | LR: 2.24e-05
27
+ [2026-04-25 22:08:27] Epoch 1 | Step 30 | Loss: 0.5401 | LM: 0.5225 | LB: 1.1531 | CL0: 2.9 | CL1: 2.1 | HR0: 0.352/SR0: 0.351 | HR1: 0.474/SR1: 0.453 | LR: 2.36e-05
28
+ [2026-04-25 22:08:34] Epoch 1 | Step 40 | Loss: 0.5129 | LM: 0.4917 | LB: 1.1617 | CL0: 2.8 | CL1: 2.1 | HR0: 0.357/SR0: 0.355 | HR1: 0.479/SR1: 0.457 | LR: 2.49e-05
29
+ [2026-04-25 22:08:41] Epoch 1 | Step 50 | Loss: 0.4778 | LM: 0.4483 | LB: 1.1610 | CL0: 2.8 | CL1: 2.1 | HR0: 0.358/SR0: 0.355 | HR1: 0.477/SR1: 0.457 | LR: 2.61e-05
30
+ [2026-04-25 22:08:48] Epoch 1 | Step 60 | Loss: 0.4532 | LM: 0.4247 | LB: 1.1636 | CL0: 2.8 | CL1: 2.1 | HR0: 0.357/SR0: 0.355 | HR1: 0.480/SR1: 0.460 | LR: 2.73e-05
31
+ [2026-04-25 22:08:56] Epoch 1 | Step 70 | Loss: 0.4372 | LM: 0.4149 | LB: 1.1636 | CL0: 2.8 | CL1: 2.1 | HR0: 0.357/SR0: 0.355 | HR1: 0.480/SR1: 0.459 | LR: 2.85e-05
32
+ [2026-04-25 22:09:03] Epoch 1 | Step 80 | Loss: 0.4289 | LM: 0.4151 | LB: 1.1651 | CL0: 2.8 | CL1: 2.1 | HR0: 0.358/SR0: 0.356 | HR1: 0.481/SR1: 0.460 | LR: 2.97e-05
33
+ [2026-04-25 22:09:10] Epoch 1 | Step 90 | Loss: 0.4225 | LM: 0.4074 | LB: 1.1657 | CL0: 2.8 | CL1: 2.1 | HR0: 0.358/SR0: 0.356 | HR1: 0.481/SR1: 0.460 | LR: 3.09e-05
34
+ [2026-04-25 22:09:17] Epoch 1 | Step 100 | Loss: 0.4147 | LM: 0.4106 | LB: 1.1654 | CL0: 2.8 | CL1: 2.1 | HR0: 0.360/SR0: 0.357 | HR1: 0.480/SR1: 0.459 | LR: 3.21e-05
35
+ [2026-04-25 22:09:24] Epoch 1 | Step 110 | Loss: 0.4128 | LM: 0.4158 | LB: 1.1662 | CL0: 2.8 | CL1: 2.1 | HR0: 0.359/SR0: 0.356 | HR1: 0.482/SR1: 0.460 | LR: 3.34e-05
36
+ [2026-04-25 22:09:31] Epoch 1 | Step 120 | Loss: 0.4090 | LM: 0.4045 | LB: 1.1655 | CL0: 2.8 | CL1: 2.1 | HR0: 0.359/SR0: 0.357 | HR1: 0.481/SR1: 0.460 | LR: 3.46e-05
37
+ [2026-04-25 22:09:38] Epoch 1 | Step 130 | Loss: 0.4051 | LM: 0.4000 | LB: 1.1654 | CL0: 2.8 | CL1: 2.1 | HR0: 0.358/SR0: 0.356 | HR1: 0.482/SR1: 0.460 | LR: 3.58e-05
38
+ [2026-04-25 22:09:46] Epoch 1 | Step 140 | Loss: 0.4012 | LM: 0.4007 | LB: 1.1660 | CL0: 2.8 | CL1: 2.1 | HR0: 0.358/SR0: 0.356 | HR1: 0.482/SR1: 0.460 | LR: 3.70e-05
39
+ [2026-04-25 22:09:53] Epoch 1 | Step 150 | Loss: 0.3946 | LM: 0.3875 | LB: 1.1656 | CL0: 2.8 | CL1: 2.1 | HR0: 0.359/SR0: 0.357 | HR1: 0.481/SR1: 0.460 | LR: 3.82e-05
40
+ [2026-04-25 22:10:00] Epoch 1 | Step 160 | Loss: 0.3897 | LM: 0.3822 | LB: 1.1664 | CL0: 2.8 | CL1: 2.1 | HR0: 0.359/SR0: 0.357 | HR1: 0.482/SR1: 0.460 | LR: 3.94e-05
41
+ [2026-04-25 22:10:07] Epoch 1 | Step 170 | Loss: 0.3894 | LM: 0.3884 | LB: 1.1641 | CL0: 2.8 | CL1: 2.1 | HR0: 0.358/SR0: 0.356 | HR1: 0.480/SR1: 0.459 | LR: 4.06e-05
42
+ [2026-04-25 22:10:14] Epoch 1 | Step 180 | Loss: 0.3840 | LM: 0.3834 | LB: 1.1633 | CL0: 2.8 | CL1: 2.1 | HR0: 0.358/SR0: 0.356 | HR1: 0.479/SR1: 0.458 | LR: 4.18e-05
43
+ [2026-04-25 22:10:22] Epoch 1 | Step 190 | Loss: 0.3823 | LM: 0.3808 | LB: 1.1622 | CL0: 2.8 | CL1: 2.1 | HR0: 0.358/SR0: 0.356 | HR1: 0.479/SR1: 0.457 | LR: 4.31e-05
44
+ [2026-04-25 22:10:29] Epoch 1 | Step 200 | Loss: 0.3800 | LM: 0.3834 | LB: 1.1615 | CL0: 2.8 | CL1: 2.1 | HR0: 0.357/SR0: 0.356 | HR1: 0.478/SR1: 0.457 | LR: 4.43e-05
45
+ [2026-04-25 22:10:36] Epoch 1 | Step 210 | Loss: 0.3794 | LM: 0.3856 | LB: 1.1616 | CL0: 2.8 | CL1: 2.1 | HR0: 0.356/SR0: 0.355 | HR1: 0.479/SR1: 0.457 | LR: 4.55e-05
46
+ [2026-04-25 22:10:43] Epoch 1 | Step 220 | Loss: 0.3768 | LM: 0.3858 | LB: 1.1622 | CL0: 2.8 | CL1: 2.1 | HR0: 0.357/SR0: 0.355 | HR1: 0.479/SR1: 0.458 | LR: 4.67e-05
47
+ [2026-04-25 22:10:50] Epoch 1 | Step 230 | Loss: 0.3727 | LM: 0.3789 | LB: 1.1626 | CL0: 2.8 | CL1: 2.1 | HR0: 0.357/SR0: 0.355 | HR1: 0.480/SR1: 0.458 | LR: 4.79e-05
48
+ [2026-04-25 22:10:57] Epoch 1 | Step 240 | Loss: 0.3705 | LM: 0.3740 | LB: 1.1623 | CL0: 2.8 | CL1: 2.1 | HR0: 0.356/SR0: 0.355 | HR1: 0.480/SR1: 0.458 | LR: 4.91e-05
49
+ [2026-04-25 22:11:05] Epoch 1 | Step 250 | Loss: 0.3683 | LM: 0.3685 | LB: 1.1624 | CL0: 2.8 | CL1: 2.1 | HR0: 0.356/SR0: 0.355 | HR1: 0.480/SR1: 0.458 | LR: 5.03e-05
50
+ [2026-04-25 22:11:12] Epoch 1 | Step 260 | Loss: 0.3676 | LM: 0.3706 | LB: 1.1626 | CL0: 2.8 | CL1: 2.1 | HR0: 0.356/SR0: 0.354 | HR1: 0.481/SR1: 0.459 | LR: 5.16e-05
51
+ [2026-04-25 22:11:19] Epoch 1 | Step 270 | Loss: 0.3649 | LM: 0.3704 | LB: 1.1618 | CL0: 2.8 | CL1: 2.1 | HR0: 0.356/SR0: 0.354 | HR1: 0.480/SR1: 0.458 | LR: 5.28e-05
52
+ [2026-04-25 22:11:26] Epoch 1 | Step 280 | Loss: 0.3621 | LM: 0.3716 | LB: 1.1620 | CL0: 2.8 | CL1: 2.1 | HR0: 0.356/SR0: 0.354 | HR1: 0.480/SR1: 0.458 | LR: 5.40e-05
53
+ [2026-04-25 22:11:33] Epoch 1 | Step 290 | Loss: 0.3600 | LM: 0.3707 | LB: 1.1622 | CL0: 2.8 | CL1: 2.1 | HR0: 0.357/SR0: 0.355 | HR1: 0.480/SR1: 0.458 | LR: 5.52e-05
54
+ [2026-04-25 22:11:40] Epoch 1 | Step 300 | Loss: 0.3591 | LM: 0.3701 | LB: 1.1627 | CL0: 2.8 | CL1: 2.1 | HR0: 0.357/SR0: 0.356 | HR1: 0.480/SR1: 0.458 | LR: 5.64e-05
55
+ [2026-04-25 22:11:47] Epoch 1 | Step 310 | Loss: 0.3576 | LM: 0.3678 | LB: 1.1622 | CL0: 2.8 | CL1: 2.1 | HR0: 0.357/SR0: 0.355 | HR1: 0.480/SR1: 0.457 | LR: 5.76e-05
56
+ [2026-04-25 22:11:55] Epoch 1 | Step 320 | Loss: 0.3549 | LM: 0.3649 | LB: 1.1624 | CL0: 2.8 | CL1: 2.1 | HR0: 0.358/SR0: 0.356 | HR1: 0.480/SR1: 0.457 | LR: 5.88e-05
57
+ [2026-04-25 22:12:02] Epoch 1 | Step 330 | Loss: 0.3530 | LM: 0.3634 | LB: 1.1624 | CL0: 2.8 | CL1: 2.1 | HR0: 0.358/SR0: 0.356 | HR1: 0.480/SR1: 0.457 | LR: 6.01e-05
58
+ [2026-04-25 22:12:09] Epoch 1 | Step 340 | Loss: 0.3509 | LM: 0.3612 | LB: 1.1625 | CL0: 2.8 | CL1: 2.1 | HR0: 0.358/SR0: 0.356 | HR1: 0.480/SR1: 0.457 | LR: 6.13e-05
59
+ [2026-04-25 22:12:16] Epoch 1 | Step 350 | Loss: 0.3499 | LM: 0.3601 | LB: 1.1627 | CL0: 2.8 | CL1: 2.1 | HR0: 0.358/SR0: 0.356 | HR1: 0.480/SR1: 0.458 | LR: 6.25e-05
60
+ [2026-04-25 22:12:23] Epoch 1 | Step 360 | Loss: 0.3473 | LM: 0.3569 | LB: 1.1623 | CL0: 2.8 | CL1: 2.1 | HR0: 0.358/SR0: 0.356 | HR1: 0.480/SR1: 0.457 | LR: 6.37e-05
61
+ [2026-04-25 22:12:30] Epoch 1 | Step 370 | Loss: 0.3450 | LM: 0.3553 | LB: 1.1618 | CL0: 2.8 | CL1: 2.1 | HR0: 0.358/SR0: 0.356 | HR1: 0.479/SR1: 0.457 | LR: 6.49e-05
62
+ [2026-04-25 22:12:37] Epoch 1 | Step 380 | Loss: 0.3431 | LM: 0.3544 | LB: 1.1617 | CL0: 2.8 | CL1: 2.1 | HR0: 0.358/SR0: 0.356 | HR1: 0.479/SR1: 0.457 | LR: 6.61e-05
63
+ [2026-04-25 22:12:45] Epoch 1 | Step 390 | Loss: 0.3420 | LM: 0.3526 | LB: 1.1621 | CL0: 2.8 | CL1: 2.1 | HR0: 0.358/SR0: 0.356 | HR1: 0.479/SR1: 0.457 | LR: 6.73e-05
64
+ [2026-04-25 22:12:52] Epoch 1 | Step 400 | Loss: 0.3414 | LM: 0.3520 | LB: 1.1614 | CL0: 2.8 | CL1: 2.1 | HR0: 0.358/SR0: 0.356 | HR1: 0.479/SR1: 0.456 | LR: 6.86e-05
65
+ [2026-04-25 22:12:59] Epoch 1 | Step 410 | Loss: 0.3401 | LM: 0.3517 | LB: 1.1612 | CL0: 2.8 | CL1: 2.1 | HR0: 0.357/SR0: 0.356 | HR1: 0.479/SR1: 0.456 | LR: 6.98e-05
66
+ [2026-04-25 22:13:06] Epoch 1 | Step 420 | Loss: 0.3388 | LM: 0.3508 | LB: 1.1614 | CL0: 2.8 | CL1: 2.1 | HR0: 0.357/SR0: 0.356 | HR1: 0.479/SR1: 0.456 | LR: 7.10e-05
67
+ [2026-04-25 22:13:13] Epoch 1 | Step 430 | Loss: 0.3385 | LM: 0.3490 | LB: 1.1611 | CL0: 2.8 | CL1: 2.1 | HR0: 0.358/SR0: 0.356 | HR1: 0.479/SR1: 0.456 | LR: 7.22e-05
68
+ [2026-04-25 22:13:20] Epoch 1 | Step 440 | Loss: 0.3366 | LM: 0.3471 | LB: 1.1608 | CL0: 2.8 | CL1: 2.1 | HR0: 0.357/SR0: 0.355 | HR1: 0.479/SR1: 0.456 | LR: 7.34e-05
69
+ [2026-04-25 22:13:27] Epoch 1 | Step 450 | Loss: 0.3359 | LM: 0.3465 | LB: 1.1605 | CL0: 2.8 | CL1: 2.1 | HR0: 0.357/SR0: 0.356 | HR1: 0.478/SR1: 0.456 | LR: 7.46e-05
70
+ [2026-04-25 22:13:34] Epoch 1 | Step 460 | Loss: 0.3351 | LM: 0.3451 | LB: 1.1606 | CL0: 2.8 | CL1: 2.1 | HR0: 0.357/SR0: 0.356 | HR1: 0.478/SR1: 0.456 | LR: 7.58e-05
71
+ [2026-04-25 22:13:42] Epoch 1 | Step 470 | Loss: 0.3337 | LM: 0.3433 | LB: 1.1603 | CL0: 2.8 | CL1: 2.1 | HR0: 0.357/SR0: 0.356 | HR1: 0.478/SR1: 0.456 | LR: 7.70e-05
72
+ [2026-04-25 22:13:49] Epoch 1 | Step 480 | Loss: 0.3327 | LM: 0.3415 | LB: 1.1603 | CL0: 2.8 | CL1: 2.1 | HR0: 0.357/SR0: 0.356 | HR1: 0.478/SR1: 0.456 | LR: 7.83e-05
73
+ [2026-04-25 22:13:56] Epoch 1 | Step 490 | Loss: 0.3316 | LM: 0.3398 | LB: 1.1603 | CL0: 2.8 | CL1: 2.1 | HR0: 0.357/SR0: 0.356 | HR1: 0.478/SR1: 0.456 | LR: 7.95e-05
74
+ [2026-04-25 22:14:03] Epoch 1 | Step 500 | Loss: 0.3307 | LM: 0.3373 | LB: 1.1600 | CL0: 2.8 | CL1: 2.1 | HR0: 0.357/SR0: 0.356 | HR1: 0.478/SR1: 0.455 | LR: 8.07e-05
75
+ [2026-04-25 22:14:10] Epoch 1 | Step 510 | Loss: 0.3293 | LM: 0.3345 | LB: 1.1598 | CL0: 2.8 | CL1: 2.1 | HR0: 0.357/SR0: 0.356 | HR1: 0.478/SR1: 0.455 | LR: 8.19e-05
76
+ [2026-04-25 22:14:17] Epoch 1 | Step 520 | Loss: 0.3292 | LM: 0.3348 | LB: 1.1594 | CL0: 2.8 | CL1: 2.1 | HR0: 0.357/SR0: 0.356 | HR1: 0.477/SR1: 0.455 | LR: 8.31e-05
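The counters in this log follow directly from the formula in train.py. With the 316397 training samples and batch_size=4 reported above, the logged 79100 / 39550 / 29662 reproduce as below; the 2-process and 3-epoch values are inferred from the log itself (39550 = 79100/2, "EPOCH 1/3") rather than stated in the config, so treat them as assumptions:

    import math

    dataset_size = 316397
    batch_size = 4                 # training.batch_size
    grad_accum = 4                 # training.gradient_accumulation_steps
    num_processes = 2              # inferred: 79100 -> 39550 after the DDP split
    epochs = 3                     # inferred from "EPOCH 1/3" in the log

    batches = math.ceil(dataset_size / batch_size)            # 79100 ("before DDP split")
    steps_per_epoch = math.ceil(batches / num_processes)      # 39550 ("after DDP split")
    total_steps = epochs * steps_per_epoch // grad_accum      # 29662 ("Total steps")
    print(batches, steps_per_epoch, total_steps)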
lr_sweep/hnet_xl_code_lr_2e-4/wandb/run-20260425_220653-ln6tfunh/files/requirements.txt ADDED
@@ -0,0 +1,245 @@
1
+ setuptools==78.1.1
2
+ wheel==0.45.1
3
+ pip==25.2
4
+ webencodings==0.5.1
5
+ triton==3.2.0
6
+ pytz==2025.2
7
+ pydub==0.25.1
8
+ pure_eval==0.2.3
9
+ ptyprocess==0.7.0
10
+ nvidia-ml-py==13.590.48
11
+ nvidia-cusparselt-cu12==0.6.2
12
+ mpmath==1.3.0
13
+ ipython-genutils==0.2.0
14
+ fastjsonschema==2.21.2
15
+ brotli==1.2.0
16
+ antlr4-python3-runtime==4.9.3
17
+ xxhash==3.6.0
18
+ widgetsnbextension==4.0.14
19
+ websocket-client==1.9.0
20
+ webcolors==24.11.1
21
+ wcwidth==0.2.14
22
+ urllib3==2.5.0
23
+ uri-template==1.3.0
24
+ tzdata==2025.2
25
+ typing_extensions==4.15.0
26
+ types-python-dateutil==2.9.0.20251008
27
+ traitlets==5.14.3
28
+ tqdm==4.67.1
29
+ tornado==6.5.2
30
+ tomlkit==0.13.3
31
+ tinycss2==1.4.0
32
+ tabulate==0.9.0
33
+ sympy==1.13.1
34
+ soupsieve==2.8
35
+ sniffio==1.3.1
36
+ smmap==5.0.2
37
+ six==1.17.0
38
+ shellingham==1.5.4
39
+ Send2Trash==1.8.3
40
+ semantic-version==2.10.0
41
+ safetensors==0.6.2
42
+ rpds-py==0.27.1
43
+ rfc3986-validator==0.1.1
44
+ regex==2025.9.18
45
+ pyzmq==27.1.0
46
+ PyYAML==6.0.3
47
+ python-multipart==0.0.22
48
+ python-json-logger==4.0.0
49
+ python-dotenv==1.2.1
50
+ pyparsing==3.2.5
51
+ PyJWT==2.8.0
52
+ Pygments==2.19.2
53
+ pycparser==2.23
54
+ pyarrow==22.0.0
55
+ psutil==7.1.0
56
+ protobuf==6.33.4
57
+ propcache==0.4.1
58
+ prometheus_client==0.23.1
59
+ portalocker==3.2.0
60
+ platformdirs==4.5.0
61
+ pillow==11.3.0
62
+ pexpect==4.9.0
63
+ pathspec==1.0.4
64
+ parso==0.8.5
65
+ pandocfilters==1.5.1
66
+ packaging==25.0
67
+ orjson==3.11.6
68
+ opt_einsum==3.4.0
69
+ nvidia-nvtx-cu12==12.4.127
70
+ nvidia-nvjitlink-cu12==12.4.127
71
+ nvidia-nccl-cu12==2.21.5
72
+ nvidia-curand-cu12==10.3.5.147
73
+ nvidia-cufile-cu12==1.13.1.3
74
+ nvidia-cufft-cu12==11.2.1.3
75
+ nvidia-cuda-runtime-cu12==12.4.127
76
+ nvidia-cuda-nvrtc-cu12==12.4.127
77
+ nvidia-cuda-cupti-cu12==12.4.127
78
+ nvidia-cublas-cu12==12.4.5.8
79
+ numpy==2.3.3
80
+ ninja==1.13.0
81
+ networkx==3.5
82
+ nest-asyncio==1.6.0
83
+ narwhals==2.15.0
84
+ mypy_extensions==1.1.0
85
+ multidict==6.7.0
86
+ mistune==3.1.4
87
+ mdurl==0.1.2
88
+ MarkupSafe==3.0.3
89
+ lxml==6.0.2
90
+ librt==0.8.0
91
+ lark==1.3.0
92
+ kiwisolver==1.4.9
93
+ jupyterlab_widgets==3.0.15
94
+ jupyterlab_pygments==0.3.0
95
+ jsonpointer==3.0.0
96
+ json5==0.12.1
97
+ itsdangerous==2.2.0
98
+ idna==3.10
99
+ hf-xet==1.1.10
100
+ h11==0.16.0
101
+ groovy==0.1.2
102
+ fsspec==2025.9.0
103
+ frozenlist==1.8.0
104
+ fqdn==1.5.1
105
+ fonttools==4.60.1
106
+ filelock==3.19.1
107
+ ffmpy==1.0.0
108
+ executing==2.2.1
109
+ einops==0.8.1
110
+ dill==0.4.0
111
+ defusedxml==0.7.1
112
+ decorator==5.2.1
113
+ debugpy==1.8.17
114
+ dacite==1.9.2
115
+ cycler==0.12.1
116
+ comm==0.2.3
117
+ colorama==0.4.6
118
+ click==8.3.1
119
+ charset-normalizer==3.4.3
120
+ certifi==2025.10.5
121
+ bleach==6.2.0
122
+ babel==2.17.0
123
+ attrs==25.4.0
124
+ async-lru==2.0.5
125
+ asttokens==3.0.0
126
+ annotated-types==0.7.0
127
+ annotated-doc==0.0.4
128
+ aiohappyeyeballs==2.6.1
129
+ aiofiles==24.1.0
130
+ yarl==1.22.0
131
+ uvicorn==0.40.0
132
+ typing-inspection==0.4.2
133
+ terminado==0.18.1
134
+ stack-data==0.6.3
135
+ sentry-sdk==2.50.0
136
+ scipy==1.17.0
137
+ sacrebleu==2.6.0
138
+ rfc3987-syntax==1.1.0
139
+ rfc3339-validator==0.1.4
140
+ requests==2.32.5
141
+ reportlab==4.4.9
142
+ referencing==0.36.2
143
+ python-dateutil==2.9.0.post0
144
+ pydantic_core==2.41.5
145
+ prompt_toolkit==3.0.52
146
+ plotly==6.5.2
147
+ pathlib2==2.3.7.post1
148
+ orderedmultidict==1.0.2
149
+ optree==0.17.0
150
+ omegaconf==2.3.0
151
+ nvidia-cusparse-cu12==12.3.1.170
152
+ nvidia-cudnn-cu12==9.1.0.70
153
+ mypy==1.19.1
154
+ multiprocess==0.70.16
155
+ matplotlib-inline==0.1.7
156
+ markdown-it-py==4.0.0
157
+ jupyter_core==5.8.1
158
+ Jinja2==3.1.6
159
+ jedi==0.19.2
160
+ ipython_pygments_lexers==1.1.1
161
+ httpcore==1.0.9
162
+ gitdb==4.0.12
163
+ ftfy==6.3.1
164
+ contourpy==1.3.3
165
+ cffi==2.0.0
166
+ beautifulsoup4==4.14.2
167
+ anyio==4.11.0
168
+ aiosignal==1.4.0
169
+ starlette==0.50.0
170
+ rich==14.2.0
171
+ pydantic==2.12.5
172
+ pandas==2.3.3
173
+ nvidia-cusolver-cu12==11.6.1.9
174
+ matplotlib==3.10.7
175
+ jupyter_server_terminals==0.5.3
176
+ jupyter_client==8.6.3
177
+ jsonschema-specifications==2025.9.1
178
+ ipython==9.6.0
179
+ hydra-core==1.3.2
180
+ huggingface-hub==0.35.3
181
+ httpx==0.28.1
182
+ GitPython==3.1.46
183
+ furl==2.1.4
184
+ cryptography==46.0.4
185
+ arrow==1.3.0
186
+ argon2-cffi-bindings==25.1.0
187
+ aiohttp==3.13.1
188
+ wandb==0.24.0
189
+ typer==0.21.1
190
+ torch==2.6.0
191
+ tokenizers==0.22.1
192
+ seaborn==0.13.2
193
+ safehttpx==0.1.7
194
+ jsonschema==4.25.1
195
+ joypy==0.2.6
196
+ isoduration==20.11.0
197
+ ipywidgets==8.1.7
198
+ ipykernel==6.30.1
199
+ gradio_client==2.0.3
200
+ fastapi==0.128.0
201
+ Authlib==1.6.6
202
+ argon2-cffi==25.1.0
203
+ transformers==4.57.6
204
+ nbformat==5.10.4
205
+ mlstm_kernels==2.0.2
206
+ jupyter-console==6.6.3
207
+ gradio==6.5.1
208
+ datasets==4.3.0
209
+ clearml==1.16.4
210
+ accelerate==1.10.1
211
+ xlstm==2.0.4
212
+ nbclient==0.10.2
213
+ jupyter-events==0.12.0
214
+ trackio==0.15.0
215
+ nbconvert==7.16.6
216
+ jupyter_server==2.17.0
217
+ notebook_shim==0.2.4
218
+ jupyterlab_server==2.27.3
219
+ jupyter-lsp==2.3.0
220
+ nbclassic==1.3.3
221
+ jupyterlab==4.4.9
222
+ notebook==7.4.7
223
+ jupyter_contrib_core==0.4.2
224
+ jupyter==1.1.1
225
+ jupyter_nbextensions_configurator==0.6.4
226
+ causal-conv1d==1.5.0.post8
227
+ flash_attn==2.7.4.post1
228
+ mamba-ssm==2.2.4
229
+ hnet==0.0.1
230
+ autocommand==2.2.2
231
+ backports.tarfile==1.2.0
232
+ importlib_metadata==8.0.0
233
+ inflect==7.3.1
234
+ jaraco.collections==5.1.0
235
+ jaraco.context==5.3.0
236
+ jaraco.functools==4.0.1
237
+ jaraco.text==3.12.1
238
+ more-itertools==10.3.0
239
+ packaging==24.2
240
+ platformdirs==4.2.2
241
+ tomli==2.0.1
242
+ typeguard==4.3.0
243
+ typing_extensions==4.12.2
244
+ wheel==0.45.1
245
+ zipp==3.19.2
lr_sweep/hnet_xl_code_lr_2e-4/wandb/run-20260425_220653-ln6tfunh/files/wandb-metadata.json ADDED
@@ -0,0 +1,69 @@
1
+ {
2
+ "os": "Linux-5.4.0-176-generic-x86_64-with-glibc2.35",
3
+ "python": "CPython 3.12.0",
4
+ "startedAt": "2026-04-25T22:06:53.291858Z",
5
+ "args": [
6
+ "tracking=wandb",
7
+ "tracking.project=code-completion_lr-sweep",
8
+ "tracking.run_name=hnet_xl_code_lr_2e-4",
9
+ "training.lr=2e-4",
10
+ "paths.output_dir=/workspace/byte-llms-code/outputs/lr_sweep/hnet_xl_code_lr_2e-4",
11
+ "data.path=/workspace/byte-llms-code/code_completion_exp/datasets/data_V4_full"
12
+ ],
13
+ "program": "/workspace/byte-llms-code/code_completion_exp/train_hnet/train.py",
14
+ "codePath": "code_completion_exp/train_hnet/train.py",
15
+ "codePathLocal": "train.py",
16
+ "git": {
17
+ "remote": "https://github.com/naryst/byte-llms-code.git",
18
+ "commit": "0a7180b6ab9f63d2794494f09ec4918576d10fa2"
19
+ },
20
+ "email": "nikita@local.ru",
21
+ "root": "/workspace/byte-llms-code/outputs/lr_sweep/hnet_xl_code_lr_2e-4",
22
+ "host": "7504e518d24a",
23
+ "executable": "/venv/bytellm/bin/python",
24
+ "cpu_count": 64,
25
+ "cpu_count_logical": 128,
26
+ "gpu": "NVIDIA H100 80GB HBM3",
27
+ "gpu_count": 4,
28
+ "disk": {
29
+ "/": {
30
+ "total": "265214230528",
31
+ "used": "121383002112"
32
+ }
33
+ },
34
+ "memory": {
35
+ "total": "1081679683584"
36
+ },
37
+ "gpu_nvidia": [
38
+ {
39
+ "name": "NVIDIA H100 80GB HBM3",
40
+ "memoryTotal": "85520809984",
41
+ "cudaCores": 16896,
42
+ "architecture": "Hopper",
43
+ "uuid": "GPU-b60cdcab-2033-2009-41de-be646c953a20"
44
+ },
45
+ {
46
+ "name": "NVIDIA H100 80GB HBM3",
47
+ "memoryTotal": "85520809984",
48
+ "cudaCores": 16896,
49
+ "architecture": "Hopper",
50
+ "uuid": "GPU-9982b420-4520-4238-c378-ec5a46015474"
51
+ },
52
+ {
53
+ "name": "NVIDIA H100 80GB HBM3",
54
+ "memoryTotal": "85520809984",
55
+ "cudaCores": 16896,
56
+ "architecture": "Hopper",
57
+ "uuid": "GPU-e26ebaac-aaa6-3eed-17ab-a3dce303a76f"
58
+ },
59
+ {
60
+ "name": "NVIDIA H100 80GB HBM3",
61
+ "memoryTotal": "85520809984",
62
+ "cudaCores": 16896,
63
+ "architecture": "Hopper",
64
+ "uuid": "GPU-9dfc6dba-0be6-4a10-1027-336cc0e65134"
65
+ }
66
+ ],
67
+ "cudaVersion": "12.2",
68
+ "writerId": "pfqfn7olxjo5871ytqpsm7un6nj33lqi"
69
+ }
lr_sweep/hnet_xl_code_lr_2e-4/wandb/run-20260425_220653-ln6tfunh/logs/debug-core.log ADDED
@@ -0,0 +1,7 @@
1
+ {"time":"2026-04-25T22:06:53.377874817Z","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmp1gnxdtqe/port-191277.txt","pid":191277,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false}
2
+ {"time":"2026-04-25T22:06:53.378924529Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":191277}
3
+ {"time":"2026-04-25T22:06:53.378905634Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/tmp/wandb-191277-191337-3381656369/socket","Net":"unix"}}
4
+ {"time":"2026-04-25T22:06:53.566394193Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"}
5
+ {"time":"2026-04-25T22:06:53.589483419Z","level":"INFO","msg":"handleInformInit: received","streamId":"ln6tfunh","id":"1(@)"}
6
+ {"time":"2026-04-25T22:06:53.95731224Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"ln6tfunh","id":"1(@)"}
7
+ {"time":"2026-04-25T22:14:21.261832239Z","level":"INFO","msg":"server: parent process exited, terminating service process"}
lr_sweep/hnet_xl_code_lr_2e-4/wandb/run-20260425_220653-ln6tfunh/logs/debug-internal.log ADDED
@@ -0,0 +1,7 @@
1
+ {"time":"2026-04-25T22:06:53.589583491Z","level":"INFO","msg":"stream: starting","core version":"0.24.0"}
2
+ {"time":"2026-04-25T22:06:53.957135992Z","level":"INFO","msg":"stream: created new stream","id":"ln6tfunh"}
3
+ {"time":"2026-04-25T22:06:53.957196965Z","level":"INFO","msg":"handler: started","stream_id":"ln6tfunh"}
4
+ {"time":"2026-04-25T22:06:53.957305912Z","level":"INFO","msg":"stream: started","id":"ln6tfunh"}
5
+ {"time":"2026-04-25T22:06:53.95731694Z","level":"INFO","msg":"writer: started","stream_id":"ln6tfunh"}
6
+ {"time":"2026-04-25T22:06:53.957316732Z","level":"INFO","msg":"sender: started","stream_id":"ln6tfunh"}
7
+ {"time":"2026-04-25T22:06:54.080245448Z","level":"ERROR","msg":"git repo not found","error":"repository does not exist"}
lr_sweep/hnet_xl_code_lr_2e-4/wandb/run-20260425_220653-ln6tfunh/logs/debug.log ADDED
@@ -0,0 +1,19 @@
1
+ 2026-04-25 22:06:53,293 INFO MainThread:191277 [wandb_setup.py:_flush():81] Current SDK version is 0.24.0
2
+ 2026-04-25 22:06:53,293 INFO MainThread:191277 [wandb_setup.py:_flush():81] Configure stats pid to 191277
3
+ 2026-04-25 22:06:53,293 INFO MainThread:191277 [wandb_setup.py:_flush():81] Loading settings from environment variables
4
+ 2026-04-25 22:06:53,293 INFO MainThread:191277 [wandb_init.py:setup_run_log_directory():717] Logging user logs to /workspace/byte-llms-code/outputs/lr_sweep/hnet_xl_code_lr_2e-4/wandb/run-20260425_220653-ln6tfunh/logs/debug.log
5
+ 2026-04-25 22:06:53,293 INFO MainThread:191277 [wandb_init.py:setup_run_log_directory():718] Logging internal logs to /workspace/byte-llms-code/outputs/lr_sweep/hnet_xl_code_lr_2e-4/wandb/run-20260425_220653-ln6tfunh/logs/debug-internal.log
6
+ 2026-04-25 22:06:53,293 INFO MainThread:191277 [wandb_init.py:init():844] calling init triggers
7
+ 2026-04-25 22:06:53,293 INFO MainThread:191277 [wandb_init.py:init():849] wandb.init called with sweep_config: {}
8
+ config: {'model': {'config_path': '/workspace/byte-llms-code/hnet_project/configs/hnet_2stage_XL_code.json', 'checkpoint_path': '/workspace/byte-llms-code/hnet_project/checkpoints/hnet_2stage_XL_code.pt'}, 'training': {'epochs': 3, 'batch_size': 4, 'eval_batch_size': 24, 'gradient_accumulation_steps': 4, 'lr': 0.0002, 'weight_decay': 0.1, 'betas': [0.9, 0.95], 'eps': 1e-08, 'lr_scheduler': 'wsd', 'warmup_ratio': 0.1, 'decay_ratio': 0.2, 'warmup_steps': 100, 'min_lr_ratio': 0.1, 'lr_multiplier': [2.0, 1.5, 1.0], 'load_balancing_weight': 0.01, 'load_balancing_N': 4.0, 'max_grad_norm': 1.0, 'use_amp': True, 'resume': False, 'resume_checkpoint': None, 'warmup_model': True}, 'data': {'path': '/workspace/byte-llms-code/code_completion_exp/datasets/data_V4_full', 'max_context_len': 4096, 'max_target_len': 256, 'num_workers': 0, 'pin_memory': True, 'max_train_samples': None, 'max_val_samples': None}, 'logging': {'log_interval': 10, 'save_interval': 3000, 'eval_interval': 1000, 'save_every_epoch': True}, 'tracking': {'enabled': True, 'backend': 'wandb', 'project': 'code-completion_lr-sweep', 'run_name': 'hnet_xl_code_lr_2e-4', 'entity': None, 'base_url': 'https://wandb.platun0v.ru', 'local_dir': '/workspace/byte-llms-code/outputs/lr_sweep/hnet_xl_code_lr_2e-4'}, 'paths': {'output_dir': '/workspace/byte-llms-code/outputs/lr_sweep/hnet_xl_code_lr_2e-4'}, 'seed': 42, 'device': 'cuda', '_wandb': {'code_path': 'code/code_completion_exp/train_hnet/train.py'}}
9
+ 2026-04-25 22:06:53,293 INFO MainThread:191277 [wandb_init.py:init():892] starting backend
10
+ 2026-04-25 22:06:53,566 INFO MainThread:191277 [wandb_init.py:init():895] sending inform_init request
11
+ 2026-04-25 22:06:53,588 INFO MainThread:191277 [wandb_init.py:init():903] backend started and connected
12
+ 2026-04-25 22:06:53,591 INFO MainThread:191277 [wandb_init.py:init():973] updated telemetry
13
+ 2026-04-25 22:06:53,608 INFO MainThread:191277 [wandb_init.py:init():997] communicating run to backend with 90.0 second timeout
14
+ 2026-04-25 22:06:54,079 INFO MainThread:191277 [wandb_init.py:init():1044] starting run threads in backend
15
+ 2026-04-25 22:06:54,239 INFO MainThread:191277 [wandb_run.py:_console_start():2529] atexit reg
16
+ 2026-04-25 22:06:54,239 INFO MainThread:191277 [wandb_run.py:_redirect():2377] redirect: wrap_raw
17
+ 2026-04-25 22:06:54,239 INFO MainThread:191277 [wandb_run.py:_redirect():2446] Wrapping output streams.
18
+ 2026-04-25 22:06:54,239 INFO MainThread:191277 [wandb_run.py:_redirect():2469] Redirects installed.
19
+ 2026-04-25 22:06:54,242 INFO MainThread:191277 [wandb_init.py:init():1084] run started, returning control to user process
lr_sweep/hnet_xl_code_lr_2e-4/wandb/run-20260425_220653-ln6tfunh/run-ln6tfunh.wandb ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e72032881e7c3bb7ff50124d376cbd2659ebaa1b64f4823985307f64980f864a
3
+ size 163840
lr_sweep/hnet_xl_code_lr_2e-4/wandb/run-20260425_222011-khn25dwv/files/code/code_completion_exp/train_hnet/train.py ADDED
@@ -0,0 +1,284 @@
1
+ """
2
+ Training pipeline for the HNet model on the Code Completion task.
3
+
4
+ Configuration via Hydra + OmegaConf, logging with Trackio.
5
+ DDP support via Accelerate for multi-GPU training.
6
+
7
+ Usage:
8
+ # Basic run (single GPU)
9
+ python train.py
10
+
11
+ # Multi-GPU with Accelerate
12
+ accelerate launch train.py
13
+
14
+ # Multi-GPU with an explicit number of GPUs
15
+ accelerate launch --num_processes=4 train.py
16
+
17
+ # Override parameters via the CLI
18
+ python train.py training.lr=1e-4 training.epochs=5
19
+
20
+ # Select a different model config
21
+ python train.py model=hnet_small
22
+
23
+ # Multirun (sweep)
24
+ python train.py --multirun training.lr=1e-4,3e-4,1e-3
25
+
26
+ # Disable logging
27
+ python train.py tracking.enabled=false
28
+ """
29
+
30
+ import os
31
+ import math
32
+ from pathlib import Path
33
+
34
+ import torch
35
+ import hydra
36
+ from hydra.core.hydra_config import HydraConfig
37
+ from omegaconf import DictConfig, OmegaConf
38
+ from accelerate import Accelerator
39
+ from accelerate.utils import set_seed as accelerate_set_seed
40
+
41
+ # HNet imports
42
+ from hnet.load_utils import load_from_pretrained, load_from_config
43
+ from hnet.utils.tokenizers import ByteTokenizer
44
+ from hnet.utils.train import group_params
45
+
46
+ # Ensure repo root is on sys.path (needed when running from subdirectory)
47
+ import sys
48
+ sys.path.insert(0, str(Path(__file__).resolve().parents[2]))
49
+
50
+ # Shared training library
51
+ from training_lib.utils import log_message
52
+ from training_lib.checkpointing import save_checkpoint, load_checkpoint
53
+ from training_lib.schedulers import get_lr_scheduler
54
+ from training_lib.tracking import init_tracking, finish_tracking
55
+ from training_lib.hnet.train_loop import train_epoch
56
+ from training_lib.hnet.data import create_dataloaders
57
+
58
+
59
+ @hydra.main(version_base=None, config_path="configs", config_name="config")
60
+ def main(cfg: DictConfig):
61
+ """Main training function with DDP support via Accelerate."""
62
+
63
+ # === Accelerator Setup ===
64
+ mixed_precision = "bf16" if cfg.training.use_amp else "no"
65
+
66
+ accelerator = Accelerator(
67
+ mixed_precision=mixed_precision,
68
+ gradient_accumulation_steps=cfg.training.gradient_accumulation_steps,
69
+ )
70
+
71
+ # === Setup ===
72
+ accelerate_set_seed(cfg.seed)
73
+
74
+ if cfg.paths.output_dir is None:
75
+ cfg.paths.output_dir = HydraConfig.get().runtime.output_dir
76
+
77
+ OmegaConf.resolve(cfg)
78
+
79
+ log_message(
80
+ f"CUDA_VISIBLE_DEVICES: {os.environ.get('CUDA_VISIBLE_DEVICES', 'not set')}",
81
+ cfg,
82
+ accelerator,
83
+ )
84
+ log_message(f"Number of processes: {accelerator.num_processes}", cfg, accelerator)
85
+ log_message(f"Process index: {accelerator.process_index}", cfg, accelerator)
86
+ log_message(f"Mixed precision: {mixed_precision}", cfg, accelerator)
87
+
88
+ log_message("=" * 60, cfg, accelerator)
89
+ log_message(
90
+ "HNet Training Pipeline (Hydra + Trackio + Accelerate)", cfg, accelerator
91
+ )
92
+ log_message("=" * 60, cfg, accelerator)
93
+ log_message(f"Config:\n{OmegaConf.to_yaml(cfg)}", cfg, accelerator)
94
+
95
+ # === Trackio Init ===
96
+ init_tracking(cfg, accelerator)
97
+
98
+ # === Tokenizer ===
99
+ log_message("Initializing tokenizer...", cfg, accelerator)
100
+ tokenizer = ByteTokenizer()
101
+
102
+ # === Model ===
103
+ log_message("Loading model...", cfg, accelerator)
104
+ if cfg.model.checkpoint_path:
105
+ model = load_from_pretrained(
106
+ model_path=cfg.model.checkpoint_path,
107
+ model_config_path=cfg.model.config_path,
108
+ )
109
+ log_message(f"Loaded pretrained: {cfg.model.checkpoint_path}", cfg, accelerator)
110
+ else:
111
+ model = load_from_config(
112
+ model_config_path=cfg.model.config_path,
113
+ device="cpu",
114
+ )
115
+ model.init_weights()
116
+ log_message("Initialized from scratch", cfg, accelerator)
117
+
118
+ model.train()
119
+
120
+ # LR multipliers for the different stages (apply before prepare!)
121
+ lr_multiplier = list(cfg.training.lr_multiplier)
122
+ model.apply_lr_multiplier(lr_multiplier)
123
+ log_message(f"Applied LR multipliers: {lr_multiplier}", cfg, accelerator)
124
+
125
+ # Warm up Triton kernels
126
+ if cfg.training.warmup_model:
127
+ log_message("Warming up model...", cfg, accelerator)
128
+ model = model.to(accelerator.device)
129
+ model.warmup(verbose=accelerator.is_main_process)
130
+
131
+ # Log model info
132
+ total_params = sum(p.numel() for p in model.parameters())
133
+ trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
134
+ log_message(f"Total params: {total_params:,}", cfg, accelerator)
135
+ log_message(f"Trainable params: {trainable_params:,}", cfg, accelerator)
136
+
137
+ # === Data ===
138
+ log_message("Creating dataloaders...", cfg, accelerator)
139
+ dataloaders = create_dataloaders(cfg, tokenizer)
140
+
141
+ train_dataloader = dataloaders["train"]
142
+ val_dataloader = dataloaders.get("validation", None)
143
+
144
+ log_message(
145
+ f"Train dataset size: {len(train_dataloader.dataset)}", cfg, accelerator
146
+ )
147
+ log_message(
148
+ f"Train batches per epoch (before DDP split): {len(train_dataloader)}",
149
+ cfg,
150
+ accelerator,
151
+ )
152
+
153
+ if val_dataloader:
154
+ log_message(
155
+ f"Validation dataset size: {len(val_dataloader.dataset)}", cfg, accelerator
156
+ )
157
+ log_message(f"Validation batches: {len(val_dataloader)}", cfg, accelerator)
158
+ else:
159
+ log_message("No validation dataset found", cfg, accelerator)
160
+
161
+ # === Optimizer ===
162
+ log_message("Creating optimizer...", cfg, accelerator)
163
+ param_groups = group_params(model)
164
+
165
+ for group in param_groups:
166
+ if "lr" not in group:
167
+ group["lr"] = cfg.training.lr
168
+ else:
169
+ group["lr"] = cfg.training.lr * group.get("lr_multiplier", 1.0)
170
+ if "weight_decay" not in group:
171
+ group["weight_decay"] = cfg.training.weight_decay
172
+
173
+ optimizer = torch.optim.AdamW(
174
+ param_groups,
175
+ lr=cfg.training.lr,
176
+ betas=tuple(cfg.training.betas),
177
+ eps=cfg.training.eps,
178
+ )
179
+
180
+ # === Scheduler ===
181
+ steps_per_epoch = math.ceil(len(train_dataloader) / accelerator.num_processes)
182
+ total_steps = (
183
+ cfg.training.epochs
184
+ * steps_per_epoch
185
+ // cfg.training.gradient_accumulation_steps
186
+ )
187
+ scheduler = get_lr_scheduler(optimizer, cfg, total_steps)
188
+
189
+ log_message(
190
+ f"Total steps: {total_steps}, Steps per epoch: {steps_per_epoch}",
191
+ cfg,
192
+ accelerator,
193
+ )
194
+
195
+ # === Accelerate Prepare ===
196
+ log_message(
197
+ "Preparing model, optimizer, and dataloaders with Accelerate...",
198
+ cfg,
199
+ accelerator,
200
+ )
201
+
202
+ if val_dataloader is not None:
203
+ model, optimizer, train_dataloader, val_dataloader, scheduler = (
204
+ accelerator.prepare(
205
+ model, optimizer, train_dataloader, val_dataloader, scheduler
206
+ )
207
+ )
208
+ else:
209
+ model, optimizer, train_dataloader, scheduler = accelerator.prepare(
210
+ model, optimizer, train_dataloader, scheduler
211
+ )
212
+
213
+ log_message(
214
+ f"Train batches per epoch (after DDP split): {len(train_dataloader)}",
215
+ cfg,
216
+ accelerator,
217
+ )
218
+
219
+ # === Resume ===
220
+ global_step = 0
221
+ start_epoch = 1
222
+
223
+ if cfg.training.resume and cfg.training.resume_checkpoint:
224
+ global_step, start_epoch = load_checkpoint(
225
+ model,
226
+ optimizer,
227
+ scheduler,
228
+ cfg.training.resume_checkpoint,
229
+ cfg,
230
+ accelerator,
231
+ )
232
+ start_epoch += 1
233
+
234
+ # === Training Loop ===
235
+ log_message("Starting training...", cfg, accelerator)
236
+
237
+ best_val_loss = float("inf")
238
+
239
+ try:
240
+ for epoch in range(start_epoch, cfg.training.epochs + 1):
241
+ log_message(f"\n{'=' * 60}", cfg, accelerator)
242
+ log_message(f"EPOCH {epoch}/{cfg.training.epochs}", cfg, accelerator)
243
+ log_message(f"{'=' * 60}", cfg, accelerator)
244
+
245
+ global_step, best_val_loss = train_epoch(
246
+ model=model,
247
+ dataloader=train_dataloader,
248
+ optimizer=optimizer,
249
+ scheduler=scheduler,
250
+ cfg=cfg,
251
+ epoch=epoch,
252
+ global_step=global_step,
253
+ accelerator=accelerator,
254
+ val_dataloader=val_dataloader,
255
+ best_val_loss=best_val_loss,
256
+ )
257
+
258
+ if cfg.logging.save_every_epoch:
259
+ save_checkpoint(
260
+ model, optimizer, scheduler, global_step, epoch, cfg, accelerator
261
+ )
262
+
263
+ except KeyboardInterrupt:
264
+ log_message("Training interrupted by user", cfg, accelerator)
265
+ save_checkpoint(
266
+ model, optimizer, scheduler, global_step, epoch, cfg, accelerator
267
+ )
268
+
269
+ # === Final Save ===
270
+ log_message("\nTraining completed!", cfg, accelerator)
271
+
272
+ if accelerator.is_main_process:
273
+ final_model_path = Path(cfg.paths.output_dir) / "model_final.pt"
274
+ unwrapped_model = accelerator.unwrap_model(model)
275
+ torch.save(unwrapped_model.state_dict(), final_model_path)
276
+ log_message(f"Final model: {final_model_path}", cfg, accelerator)
277
+
278
+ accelerator.wait_for_everyone()
279
+ accelerator.end_training()
280
+ finish_tracking()
281
+
282
+
283
+ if __name__ == "__main__":
284
+ main()
lr_sweep/hnet_xl_code_lr_2e-4/wandb/run-20260425_222011-khn25dwv/files/config.yaml ADDED
@@ -0,0 +1,167 @@
1
+ wandb_version: 1
2
+
3
+ _wandb:
4
+ desc: null
5
+ value:
6
+ code_path: code/code_completion_exp/train_hnet/train.py
7
+ python_version: 3.12.0
8
+ cli_version: 0.24.0
9
+ framework: huggingface
10
+ huggingface_version: 4.57.6
11
+ is_jupyter_run: false
12
+ is_kaggle_kernel: false
13
+ start_time: 1777155612
14
+ t:
15
+ 1:
16
+ - 1
17
+ - 11
18
+ - 49
19
+ - 50
20
+ - 51
21
+ - 71
22
+ - 105
23
+ 2:
24
+ - 1
25
+ - 11
26
+ - 49
27
+ - 50
28
+ - 51
29
+ - 71
30
+ - 105
31
+ 3:
32
+ - 2
33
+ - 13
34
+ - 16
35
+ - 37
36
+ - 42
37
+ - 61
38
+ 4: 3.12.0
39
+ 5: 0.24.0
40
+ 6: 4.57.6
41
+ 13: linux-x86_64
42
+ e:
43
+ i2jx9zm2jjq81elpzo2fmxkizkbg0bw5:
44
+ os: Linux-5.4.0-176-generic-x86_64-with-glibc2.35
45
+ python: CPython 3.12.0
46
+ started_at: '2026-04-25T22:20:11.717689Z'
47
+ args:
48
+ - tracking=wandb
49
+ - tracking.project=code-completion_lr-sweep
50
+ - tracking.run_name=hnet_xl_code_lr_2e-4
51
+ - training.lr=2e-4
52
+ - paths.output_dir=/workspace/byte-llms-code/outputs/lr_sweep/hnet_xl_code_lr_2e-4
53
+ - data.path=/workspace/byte-llms-code/code_completion_exp/datasets/data_V4_full
54
+ program: /workspace/byte-llms-code/code_completion_exp/train_hnet/train.py
55
+ code_path: code_completion_exp/train_hnet/train.py
56
+ code_path_local: train.py
57
+ git:
58
+ remote_url: https://github.com/naryst/byte-llms-code.git
59
+ commit: 0a7180b6ab9f63d2794494f09ec4918576d10fa2
60
+ email: nikita@local.ru
61
+ root: /workspace/byte-llms-code/outputs/lr_sweep/hnet_xl_code_lr_2e-4
62
+ host: 7504e518d24a
63
+ executable: /venv/bytellm/bin/python
64
+ cpu_count: 64
65
+ cpu_count_logical: 128
66
+ gpu_type: NVIDIA H100 80GB HBM3
67
+ gpu_count: 4
68
+ disk:
69
+ /:
70
+ total: '265214230528'
71
+ used: '121389543424'
72
+ memory:
73
+ total: '1081679683584'
74
+ gpu_nvidia:
75
+ - name: NVIDIA H100 80GB HBM3
76
+ memory_total: '85520809984'
77
+ cuda_cores: 16896
78
+ architecture: Hopper
79
+ uuid: GPU-b60cdcab-2033-2009-41de-be646c953a20
80
+ - name: NVIDIA H100 80GB HBM3
81
+ memory_total: '85520809984'
82
+ cuda_cores: 16896
83
+ architecture: Hopper
84
+ uuid: GPU-9982b420-4520-4238-c378-ec5a46015474
85
+ - name: NVIDIA H100 80GB HBM3
86
+ memory_total: '85520809984'
87
+ cuda_cores: 16896
88
+ architecture: Hopper
89
+ uuid: GPU-e26ebaac-aaa6-3eed-17ab-a3dce303a76f
90
+ - name: NVIDIA H100 80GB HBM3
91
+ memory_total: '85520809984'
92
+ cuda_cores: 16896
93
+ architecture: Hopper
94
+ uuid: GPU-9dfc6dba-0be6-4a10-1027-336cc0e65134
95
+ cuda_version: '12.2'
96
+ writer_id: i2jx9zm2jjq81elpzo2fmxkizkbg0bw5
97
+ model:
98
+ desc: null
99
+ value:
100
+ config_path: /workspace/byte-llms-code/hnet_project/configs/hnet_2stage_XL_code.json
101
+ checkpoint_path: /workspace/byte-llms-code/hnet_project/checkpoints/hnet_2stage_XL_code.pt
102
+ training:
103
+ desc: null
104
+ value:
105
+ epochs: 1
106
+ batch_size: 4
107
+ eval_batch_size: 24
108
+ gradient_accumulation_steps: 4
109
+ lr: 0.0002
110
+ weight_decay: 0.1
111
+ betas:
112
+ - 0.9
113
+ - 0.95
114
+ eps: 1.0e-08
115
+ lr_scheduler: wsd
116
+ warmup_ratio: 0.1
117
+ decay_ratio: 0.2
118
+ warmup_steps: 100
119
+ min_lr_ratio: 0.1
120
+ lr_multiplier:
121
+ - 2.0
122
+ - 1.5
123
+ - 1.0
124
+ load_balancing_weight: 0.01
125
+ load_balancing_N: 4.0
126
+ max_grad_norm: 1.0
127
+ use_amp: true
128
+ resume: false
129
+ resume_checkpoint: null
130
+ warmup_model: true
131
+ data:
132
+ desc: null
133
+ value:
134
+ path: /workspace/byte-llms-code/code_completion_exp/datasets/data_V4_full
135
+ max_context_len: 4096
136
+ max_target_len: 256
137
+ num_workers: 0
138
+ pin_memory: true
139
+ max_train_samples: null
140
+ max_val_samples: 2000
141
+ logging:
142
+ desc: null
143
+ value:
144
+ log_interval: 10
145
+ save_interval: 0
146
+ eval_interval: 2000
147
+ save_every_epoch: false
148
+ tracking:
149
+ desc: null
150
+ value:
151
+ enabled: true
152
+ backend: wandb
153
+ project: code-completion_lr-sweep
154
+ run_name: hnet_xl_code_lr_2e-4
155
+ entity: null
156
+ base_url: https://wandb.platun0v.ru
157
+ local_dir: /workspace/byte-llms-code/outputs/lr_sweep/hnet_xl_code_lr_2e-4
158
+ paths:
159
+ desc: null
160
+ value:
161
+ output_dir: /workspace/byte-llms-code/outputs/lr_sweep/hnet_xl_code_lr_2e-4
162
+ seed:
163
+ desc: null
164
+ value: 42
165
+ device:
166
+ desc: null
167
+ value: cuda
lr_sweep/hnet_xl_code_lr_2e-4/wandb/run-20260425_222011-khn25dwv/files/output.log ADDED
The diff for this file is too large to render. See raw diff
lr_sweep/hnet_xl_code_lr_2e-4/wandb/run-20260425_222011-khn25dwv/files/requirements.txt ADDED
@@ -0,0 +1,245 @@
1
+ setuptools==78.1.1
2
+ wheel==0.45.1
3
+ pip==25.2
4
+ webencodings==0.5.1
5
+ triton==3.2.0
6
+ pytz==2025.2
7
+ pydub==0.25.1
8
+ pure_eval==0.2.3
9
+ ptyprocess==0.7.0
10
+ nvidia-ml-py==13.590.48
11
+ nvidia-cusparselt-cu12==0.6.2
12
+ mpmath==1.3.0
13
+ ipython-genutils==0.2.0
14
+ fastjsonschema==2.21.2
15
+ brotli==1.2.0
16
+ antlr4-python3-runtime==4.9.3
17
+ xxhash==3.6.0
18
+ widgetsnbextension==4.0.14
19
+ websocket-client==1.9.0
20
+ webcolors==24.11.1
21
+ wcwidth==0.2.14
22
+ urllib3==2.5.0
23
+ uri-template==1.3.0
24
+ tzdata==2025.2
25
+ typing_extensions==4.15.0
26
+ types-python-dateutil==2.9.0.20251008
27
+ traitlets==5.14.3
28
+ tqdm==4.67.1
29
+ tornado==6.5.2
30
+ tomlkit==0.13.3
31
+ tinycss2==1.4.0
32
+ tabulate==0.9.0
33
+ sympy==1.13.1
34
+ soupsieve==2.8
35
+ sniffio==1.3.1
36
+ smmap==5.0.2
37
+ six==1.17.0
38
+ shellingham==1.5.4
39
+ Send2Trash==1.8.3
40
+ semantic-version==2.10.0
41
+ safetensors==0.6.2
42
+ rpds-py==0.27.1
43
+ rfc3986-validator==0.1.1
44
+ regex==2025.9.18
45
+ pyzmq==27.1.0
46
+ PyYAML==6.0.3
47
+ python-multipart==0.0.22
48
+ python-json-logger==4.0.0
49
+ python-dotenv==1.2.1
50
+ pyparsing==3.2.5
51
+ PyJWT==2.8.0
52
+ Pygments==2.19.2
53
+ pycparser==2.23
54
+ pyarrow==22.0.0
55
+ psutil==7.1.0
56
+ protobuf==6.33.4
57
+ propcache==0.4.1
58
+ prometheus_client==0.23.1
59
+ portalocker==3.2.0
60
+ platformdirs==4.5.0
61
+ pillow==11.3.0
62
+ pexpect==4.9.0
63
+ pathspec==1.0.4
64
+ parso==0.8.5
65
+ pandocfilters==1.5.1
66
+ packaging==25.0
67
+ orjson==3.11.6
68
+ opt_einsum==3.4.0
69
+ nvidia-nvtx-cu12==12.4.127
70
+ nvidia-nvjitlink-cu12==12.4.127
71
+ nvidia-nccl-cu12==2.21.5
72
+ nvidia-curand-cu12==10.3.5.147
73
+ nvidia-cufile-cu12==1.13.1.3
74
+ nvidia-cufft-cu12==11.2.1.3
75
+ nvidia-cuda-runtime-cu12==12.4.127
76
+ nvidia-cuda-nvrtc-cu12==12.4.127
77
+ nvidia-cuda-cupti-cu12==12.4.127
78
+ nvidia-cublas-cu12==12.4.5.8
79
+ numpy==2.3.3
80
+ ninja==1.13.0
81
+ networkx==3.5
82
+ nest-asyncio==1.6.0
83
+ narwhals==2.15.0
84
+ mypy_extensions==1.1.0
85
+ multidict==6.7.0
86
+ mistune==3.1.4
87
+ mdurl==0.1.2
88
+ MarkupSafe==3.0.3
89
+ lxml==6.0.2
90
+ librt==0.8.0
91
+ lark==1.3.0
92
+ kiwisolver==1.4.9
93
+ jupyterlab_widgets==3.0.15
94
+ jupyterlab_pygments==0.3.0
95
+ jsonpointer==3.0.0
96
+ json5==0.12.1
97
+ itsdangerous==2.2.0
98
+ idna==3.10
99
+ hf-xet==1.1.10
100
+ h11==0.16.0
101
+ groovy==0.1.2
102
+ fsspec==2025.9.0
103
+ frozenlist==1.8.0
104
+ fqdn==1.5.1
105
+ fonttools==4.60.1
106
+ filelock==3.19.1
107
+ ffmpy==1.0.0
108
+ executing==2.2.1
109
+ einops==0.8.1
110
+ dill==0.4.0
111
+ defusedxml==0.7.1
112
+ decorator==5.2.1
113
+ debugpy==1.8.17
114
+ dacite==1.9.2
115
+ cycler==0.12.1
116
+ comm==0.2.3
117
+ colorama==0.4.6
118
+ click==8.3.1
119
+ charset-normalizer==3.4.3
120
+ certifi==2025.10.5
121
+ bleach==6.2.0
122
+ babel==2.17.0
123
+ attrs==25.4.0
124
+ async-lru==2.0.5
125
+ asttokens==3.0.0
126
+ annotated-types==0.7.0
127
+ annotated-doc==0.0.4
128
+ aiohappyeyeballs==2.6.1
129
+ aiofiles==24.1.0
130
+ yarl==1.22.0
131
+ uvicorn==0.40.0
132
+ typing-inspection==0.4.2
133
+ terminado==0.18.1
134
+ stack-data==0.6.3
135
+ sentry-sdk==2.50.0
136
+ scipy==1.17.0
137
+ sacrebleu==2.6.0
138
+ rfc3987-syntax==1.1.0
139
+ rfc3339-validator==0.1.4
140
+ requests==2.32.5
141
+ reportlab==4.4.9
142
+ referencing==0.36.2
143
+ python-dateutil==2.9.0.post0
144
+ pydantic_core==2.41.5
145
+ prompt_toolkit==3.0.52
146
+ plotly==6.5.2
147
+ pathlib2==2.3.7.post1
148
+ orderedmultidict==1.0.2
149
+ optree==0.17.0
150
+ omegaconf==2.3.0
151
+ nvidia-cusparse-cu12==12.3.1.170
152
+ nvidia-cudnn-cu12==9.1.0.70
153
+ mypy==1.19.1
154
+ multiprocess==0.70.16
155
+ matplotlib-inline==0.1.7
156
+ markdown-it-py==4.0.0
157
+ jupyter_core==5.8.1
158
+ Jinja2==3.1.6
159
+ jedi==0.19.2
160
+ ipython_pygments_lexers==1.1.1
161
+ httpcore==1.0.9
162
+ gitdb==4.0.12
163
+ ftfy==6.3.1
164
+ contourpy==1.3.3
165
+ cffi==2.0.0
166
+ beautifulsoup4==4.14.2
167
+ anyio==4.11.0
168
+ aiosignal==1.4.0
169
+ starlette==0.50.0
170
+ rich==14.2.0
171
+ pydantic==2.12.5
172
+ pandas==2.3.3
173
+ nvidia-cusolver-cu12==11.6.1.9
174
+ matplotlib==3.10.7
175
+ jupyter_server_terminals==0.5.3
176
+ jupyter_client==8.6.3
177
+ jsonschema-specifications==2025.9.1
178
+ ipython==9.6.0
179
+ hydra-core==1.3.2
180
+ huggingface-hub==0.35.3
181
+ httpx==0.28.1
182
+ GitPython==3.1.46
183
+ furl==2.1.4
184
+ cryptography==46.0.4
185
+ arrow==1.3.0
186
+ argon2-cffi-bindings==25.1.0
187
+ aiohttp==3.13.1
188
+ wandb==0.24.0
189
+ typer==0.21.1
190
+ torch==2.6.0
191
+ tokenizers==0.22.1
192
+ seaborn==0.13.2
193
+ safehttpx==0.1.7
194
+ jsonschema==4.25.1
195
+ joypy==0.2.6
196
+ isoduration==20.11.0
197
+ ipywidgets==8.1.7
198
+ ipykernel==6.30.1
199
+ gradio_client==2.0.3
200
+ fastapi==0.128.0
201
+ Authlib==1.6.6
202
+ argon2-cffi==25.1.0
203
+ transformers==4.57.6
204
+ nbformat==5.10.4
205
+ mlstm_kernels==2.0.2
206
+ jupyter-console==6.6.3
207
+ gradio==6.5.1
208
+ datasets==4.3.0
209
+ clearml==1.16.4
210
+ accelerate==1.10.1
211
+ xlstm==2.0.4
212
+ nbclient==0.10.2
213
+ jupyter-events==0.12.0
214
+ trackio==0.15.0
215
+ nbconvert==7.16.6
216
+ jupyter_server==2.17.0
217
+ notebook_shim==0.2.4
218
+ jupyterlab_server==2.27.3
219
+ jupyter-lsp==2.3.0
220
+ nbclassic==1.3.3
221
+ jupyterlab==4.4.9
222
+ notebook==7.4.7
223
+ jupyter_contrib_core==0.4.2
224
+ jupyter==1.1.1
225
+ jupyter_nbextensions_configurator==0.6.4
226
+ causal-conv1d==1.5.0.post8
227
+ flash_attn==2.7.4.post1
228
+ mamba-ssm==2.2.4
229
+ hnet==0.0.1
230
+ autocommand==2.2.2
231
+ backports.tarfile==1.2.0
232
+ importlib_metadata==8.0.0
233
+ inflect==7.3.1
234
+ jaraco.collections==5.1.0
235
+ jaraco.context==5.3.0
236
+ jaraco.functools==4.0.1
237
+ jaraco.text==3.12.1
238
+ more-itertools==10.3.0
239
+ packaging==24.2
240
+ platformdirs==4.2.2
241
+ tomli==2.0.1
242
+ typeguard==4.3.0
243
+ typing_extensions==4.12.2
244
+ wheel==0.45.1
245
+ zipp==3.19.2
lr_sweep/hnet_xl_code_lr_2e-4/wandb/run-20260425_222011-khn25dwv/files/wandb-metadata.json ADDED
@@ -0,0 +1 @@
1
+ {"os": "Linux-5.4.0-176-generic-x86_64-with-glibc2.35", "python": "CPython 3.12.0", "started_at": "2026-04-25T22:20:11.717689Z", "args": ["tracking=wandb", "tracking.project=code-completion_lr-sweep", "tracking.run_name=hnet_xl_code_lr_2e-4", "training.lr=2e-4", "paths.output_dir=/workspace/byte-llms-code/outputs/lr_sweep/hnet_xl_code_lr_2e-4", "data.path=/workspace/byte-llms-code/code_completion_exp/datasets/data_V4_full"], "program": "/workspace/byte-llms-code/code_completion_exp/train_hnet/train.py", "code_path": "code_completion_exp/train_hnet/train.py", "code_path_local": "train.py", "git": {"remote_url": "https://github.com/naryst/byte-llms-code.git", "commit": "0a7180b6ab9f63d2794494f09ec4918576d10fa2"}, "email": "nikita@local.ru", "root": "/workspace/byte-llms-code/outputs/lr_sweep/hnet_xl_code_lr_2e-4", "host": "7504e518d24a", "executable": "/venv/bytellm/bin/python", "cpu_count": 64, "cpu_count_logical": 128, "gpu_type": "NVIDIA H100 80GB HBM3", "gpu_count": 4, "disk": {"/": {"total": "265214230528", "used": "121389543424"}}, "memory": {"total": "1081679683584"}, "gpu_nvidia": [{"name": "NVIDIA H100 80GB HBM3", "memory_total": "85520809984", "cuda_cores": 16896, "architecture": "Hopper", "uuid": "GPU-b60cdcab-2033-2009-41de-be646c953a20"}, {"name": "NVIDIA H100 80GB HBM3", "memory_total": "85520809984", "cuda_cores": 16896, "architecture": "Hopper", "uuid": "GPU-9982b420-4520-4238-c378-ec5a46015474"}, {"name": "NVIDIA H100 80GB HBM3", "memory_total": "85520809984", "cuda_cores": 16896, "architecture": "Hopper", "uuid": "GPU-e26ebaac-aaa6-3eed-17ab-a3dce303a76f"}, {"name": "NVIDIA H100 80GB HBM3", "memory_total": "85520809984", "cuda_cores": 16896, "architecture": "Hopper", "uuid": "GPU-9dfc6dba-0be6-4a10-1027-336cc0e65134"}], "cuda_version": "12.2", "writer_id": "i2jx9zm2jjq81elpzo2fmxkizkbg0bw5"}
lr_sweep/hnet_xl_code_lr_2e-4/wandb/run-20260425_222011-khn25dwv/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
1
+ {"_runtime": 7157, "train/loss_avg": 0.30111300496024884, "train/lm_loss": 0.22366735339164734, "train/step_time": 0.7080746650695801, "train/hard_boundary_ratio_stage0": 0.35731984922330645, "train/soft_boundary_ratio_stage1": 0.3884844363677051, "_step": 9880, "train/lb_loss": 1.1152780055999756, "train/lr": 2e-05, "train/chunk_len_stage1": 2.422819767367862, "train/loss": 0.19969405978918076, "train/epoch": 1, "train/chunk_len_stage0": 2.819989316907469, "train/hard_boundary_ratio_stage1": 0.41531218191156627, "train/soft_boundary_ratio_stage0": 0.35144078876136436, "_timestamp": 1777162766.8844316, "val/perplexity": 1.3305252412493118, "val/lb_loss": 1.0830148203032357, "val/lm_loss": 0.2882710979098365, "val/time": 4.704254865646362, "best/step": 8000, "val/loss": 0.29910124660957427, "best/val_loss": 0.29910124660957427, "best/val_perplexity": 1.3305252412493118, "epoch/time": 7104.988562345505, "epoch/soft_boundary_ratio_stage0": 0.3514468830948854, "epoch/loss": 0.30111785081837467, "epoch/hard_boundary_ratio_stage0": 0.3573285990023209, "epoch/lm_loss": 0.2911626837279241, "epoch/lb_loss": 1.0948496247683472, "epoch/chunk_len_stage0": 2.8199204649604765, "epoch/chunk_len_stage1": 2.4228836114764665, "epoch/hard_boundary_ratio_stage1": 0.41530060060195706, "epoch/soft_boundary_ratio_stage1": 0.38847202075385673}
lr_sweep/hnet_xl_code_lr_2e-4/wandb/run-20260425_222011-khn25dwv/logs/debug-core.log ADDED
@@ -0,0 +1,16 @@
1
+ {"time":"2026-04-25T22:20:11.804586482Z","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmpq3353tb5/port-198705.txt","pid":198705,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false}
2
+ {"time":"2026-04-25T22:20:11.804980932Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":198705}
3
+ {"time":"2026-04-25T22:20:11.804989464Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/tmp/wandb-198705-198765-1964719290/socket","Net":"unix"}}
4
+ {"time":"2026-04-25T22:20:11.99240949Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"}
5
+ {"time":"2026-04-25T22:20:12.017529004Z","level":"INFO","msg":"handleInformInit: received","streamId":"khn25dwv","id":"1(@)"}
6
+ {"time":"2026-04-25T22:20:12.401503529Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"khn25dwv","id":"1(@)"}
7
+ {"time":"2026-04-26T00:19:30.37366386Z","level":"INFO","msg":"handleInformFinish: finish message received","streamId":"khn25dwv","id":"1(@)"}
8
+ {"time":"2026-04-26T00:19:30.37416302Z","level":"INFO","msg":"handleInformFinish: stream closed","streamId":"khn25dwv","id":"1(@)"}
9
+ {"time":"2026-04-26T00:19:30.385004444Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"}
10
+ {"time":"2026-04-26T00:19:30.385029606Z","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"}
11
+ {"time":"2026-04-26T00:19:30.38503557Z","level":"INFO","msg":"server is shutting down"}
12
+ {"time":"2026-04-26T00:19:30.385040394Z","level":"INFO","msg":"connection: closing","id":"1(@)"}
13
+ {"time":"2026-04-26T00:19:30.385092793Z","level":"INFO","msg":"connection: closed successfully","id":"1(@)"}
14
+ {"time":"2026-04-26T00:19:30.385102377Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"}
15
+ {"time":"2026-04-26T00:19:30.385084287Z","level":"INFO","msg":"server: listener closed","addr":{"Name":"/tmp/wandb-198705-198765-1964719290/socket","Net":"unix"}}
16
+ {"time":"2026-04-26T00:19:30.385111472Z","level":"INFO","msg":"server is closed"}
lr_sweep/hnet_xl_code_lr_2e-4/wandb/run-20260425_222011-khn25dwv/logs/debug-internal.log ADDED
@@ -0,0 +1,15 @@
1
+ {"time":"2026-04-25T22:20:12.017648642Z","level":"INFO","msg":"stream: starting","core version":"0.24.0"}
2
+ {"time":"2026-04-25T22:20:12.401332467Z","level":"INFO","msg":"stream: created new stream","id":"khn25dwv"}
3
+ {"time":"2026-04-25T22:20:12.401402455Z","level":"INFO","msg":"handler: started","stream_id":"khn25dwv"}
4
+ {"time":"2026-04-25T22:20:12.401496869Z","level":"INFO","msg":"stream: started","id":"khn25dwv"}
5
+ {"time":"2026-04-25T22:20:12.401507601Z","level":"INFO","msg":"writer: started","stream_id":"khn25dwv"}
6
+ {"time":"2026-04-25T22:20:12.40151694Z","level":"INFO","msg":"sender: started","stream_id":"khn25dwv"}
7
+ {"time":"2026-04-25T22:20:12.5289538Z","level":"ERROR","msg":"git repo not found","error":"repository does not exist"}
8
+ {"time":"2026-04-25T23:20:22.74400376Z","level":"ERROR","msg":"api: HTTP error","status":403,"method":"POST","url":"https://wandb.platun0v.ru/files/nikita/code-completion_lr-sweep/khn25dwv/file_stream"}
9
+ {"time":"2026-04-25T23:20:22.744078123Z","level":"ERROR+4","msg":"filestream: fatal error: filestream: failed to upload: 403 Forbidden url=https://wandb.platun0v.ru/files/nikita/code-completion_lr-sweep/khn25dwv/file_stream: "}
10
+ {"time":"2026-04-26T00:19:30.370077692Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
11
+ {"time":"2026-04-26T00:19:30.370863672Z","level":"INFO","msg":"handler: operation stats","stats":{}}
12
+ {"time":"2026-04-26T00:19:30.373711643Z","level":"INFO","msg":"stream: closing","id":"khn25dwv"}
13
+ {"time":"2026-04-26T00:19:30.373733071Z","level":"INFO","msg":"handler: closed","stream_id":"khn25dwv"}
14
+ {"time":"2026-04-26T00:19:30.37382892Z","level":"INFO","msg":"sender: closed","stream_id":"khn25dwv"}
15
+ {"time":"2026-04-26T00:19:30.373836548Z","level":"INFO","msg":"stream: closed","id":"khn25dwv"}
lr_sweep/hnet_xl_code_lr_2e-4/wandb/run-20260425_222011-khn25dwv/logs/debug.log ADDED
@@ -0,0 +1,24 @@
1
+ 2026-04-25 22:20:11,719 INFO MainThread:198705 [wandb_setup.py:_flush():81] Current SDK version is 0.24.0
2
+ 2026-04-25 22:20:11,719 INFO MainThread:198705 [wandb_setup.py:_flush():81] Configure stats pid to 198705
3
+ 2026-04-25 22:20:11,719 INFO MainThread:198705 [wandb_setup.py:_flush():81] Loading settings from environment variables
4
+ 2026-04-25 22:20:11,719 INFO MainThread:198705 [wandb_init.py:setup_run_log_directory():717] Logging user logs to /workspace/byte-llms-code/outputs/lr_sweep/hnet_xl_code_lr_2e-4/wandb/run-20260425_222011-khn25dwv/logs/debug.log
5
+ 2026-04-25 22:20:11,719 INFO MainThread:198705 [wandb_init.py:setup_run_log_directory():718] Logging internal logs to /workspace/byte-llms-code/outputs/lr_sweep/hnet_xl_code_lr_2e-4/wandb/run-20260425_222011-khn25dwv/logs/debug-internal.log
6
+ 2026-04-25 22:20:11,719 INFO MainThread:198705 [wandb_init.py:init():844] calling init triggers
7
+ 2026-04-25 22:20:11,719 INFO MainThread:198705 [wandb_init.py:init():849] wandb.init called with sweep_config: {}
8
+ config: {'model': {'config_path': '/workspace/byte-llms-code/hnet_project/configs/hnet_2stage_XL_code.json', 'checkpoint_path': '/workspace/byte-llms-code/hnet_project/checkpoints/hnet_2stage_XL_code.pt'}, 'training': {'epochs': 1, 'batch_size': 4, 'eval_batch_size': 24, 'gradient_accumulation_steps': 4, 'lr': 0.0002, 'weight_decay': 0.1, 'betas': [0.9, 0.95], 'eps': 1e-08, 'lr_scheduler': 'wsd', 'warmup_ratio': 0.1, 'decay_ratio': 0.2, 'warmup_steps': 100, 'min_lr_ratio': 0.1, 'lr_multiplier': [2.0, 1.5, 1.0], 'load_balancing_weight': 0.01, 'load_balancing_N': 4.0, 'max_grad_norm': 1.0, 'use_amp': True, 'resume': False, 'resume_checkpoint': None, 'warmup_model': True}, 'data': {'path': '/workspace/byte-llms-code/code_completion_exp/datasets/data_V4_full', 'max_context_len': 4096, 'max_target_len': 256, 'num_workers': 0, 'pin_memory': True, 'max_train_samples': None, 'max_val_samples': 2000}, 'logging': {'log_interval': 10, 'save_interval': 0, 'eval_interval': 2000, 'save_every_epoch': False}, 'tracking': {'enabled': True, 'backend': 'wandb', 'project': 'code-completion_lr-sweep', 'run_name': 'hnet_xl_code_lr_2e-4', 'entity': None, 'base_url': 'https://wandb.platun0v.ru', 'local_dir': '/workspace/byte-llms-code/outputs/lr_sweep/hnet_xl_code_lr_2e-4'}, 'paths': {'output_dir': '/workspace/byte-llms-code/outputs/lr_sweep/hnet_xl_code_lr_2e-4'}, 'seed': 42, 'device': 'cuda', '_wandb': {'code_path': 'code/code_completion_exp/train_hnet/train.py'}}
9
+ 2026-04-25 22:20:11,719 INFO MainThread:198705 [wandb_init.py:init():892] starting backend
10
+ 2026-04-25 22:20:11,992 INFO MainThread:198705 [wandb_init.py:init():895] sending inform_init request
11
+ 2026-04-25 22:20:12,016 INFO MainThread:198705 [wandb_init.py:init():903] backend started and connected
12
+ 2026-04-25 22:20:12,019 INFO MainThread:198705 [wandb_init.py:init():973] updated telemetry
13
+ 2026-04-25 22:20:12,040 INFO MainThread:198705 [wandb_init.py:init():997] communicating run to backend with 90.0 second timeout
14
+ 2026-04-25 22:20:12,528 INFO MainThread:198705 [wandb_init.py:init():1044] starting run threads in backend
15
+ 2026-04-25 22:20:12,685 INFO MainThread:198705 [wandb_run.py:_console_start():2529] atexit reg
16
+ 2026-04-25 22:20:12,685 INFO MainThread:198705 [wandb_run.py:_redirect():2377] redirect: wrap_raw
17
+ 2026-04-25 22:20:12,685 INFO MainThread:198705 [wandb_run.py:_redirect():2446] Wrapping output streams.
18
+ 2026-04-25 22:20:12,686 INFO MainThread:198705 [wandb_run.py:_redirect():2469] Redirects installed.
19
+ 2026-04-25 22:20:12,688 INFO MainThread:198705 [wandb_init.py:init():1084] run started, returning control to user process
20
+ 2026-04-26 00:19:29,581 INFO MainThread:198705 [wandb_run.py:_finish():2295] finishing run nikita/code-completion_lr-sweep/khn25dwv
21
+ 2026-04-26 00:19:29,582 INFO MainThread:198705 [wandb_run.py:_atexit_cleanup():2494] got exitcode: 0
22
+ 2026-04-26 00:19:29,582 INFO MainThread:198705 [wandb_run.py:_restore():2476] restore
23
+ 2026-04-26 00:19:29,582 INFO MainThread:198705 [wandb_run.py:_restore():2482] restore done
24
+ 2026-04-26 00:19:30,373 INFO MainThread:198705 [wandb_run.py:_footer_sync_info():3870] logging synced files
lr_sweep/hnet_xl_code_lr_2e-4/wandb/run-20260425_222011-khn25dwv/run-khn25dwv.wandb ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b17f5b1390172e14c28b32a79085198d72420d310c8a5d213678c06b1e2749a4
3
+ size 3073785
lr_sweep/hnet_xl_code_lr_2e-4/wandb/run-20260425_222011-khn25dwv/run-khn25dwv.wandb.synced ADDED
File without changes
lr_sweep/hnet_xl_code_lr_5e-4/.hydra/config.yaml ADDED
@@ -0,0 +1,55 @@
1
+ model:
2
+ config_path: ${oc.env:PROJECT_ROOT}/hnet_project/configs/hnet_2stage_XL_code.json
3
+ checkpoint_path: ${oc.env:PROJECT_ROOT}/hnet_project/checkpoints/hnet_2stage_XL_code.pt
4
+ training:
5
+ epochs: 1
6
+ batch_size: 4
7
+ eval_batch_size: 24
8
+ gradient_accumulation_steps: 4
9
+ lr: 0.0005
10
+ weight_decay: 0.1
11
+ betas:
12
+ - 0.9
13
+ - 0.95
14
+ eps: 1.0e-08
15
+ lr_scheduler: wsd
16
+ warmup_ratio: 0.1
17
+ decay_ratio: 0.2
18
+ warmup_steps: 100
19
+ min_lr_ratio: 0.1
20
+ lr_multiplier:
21
+ - 2.0
22
+ - 1.5
23
+ - 1.0
24
+ load_balancing_weight: 0.01
25
+ load_balancing_N: 4.0
26
+ max_grad_norm: 1.0
27
+ use_amp: true
28
+ resume: false
29
+ resume_checkpoint: null
30
+ warmup_model: true
31
+ data:
32
+ path: /workspace/byte-llms-code/code_completion_exp/datasets/data_V4_full
33
+ max_context_len: 4096
34
+ max_target_len: 256
35
+ num_workers: 0
36
+ pin_memory: true
37
+ max_train_samples: null
38
+ max_val_samples: 2000
39
+ logging:
40
+ log_interval: 10
41
+ save_interval: 0
42
+ eval_interval: 2000
43
+ save_every_epoch: false
44
+ tracking:
45
+ enabled: true
46
+ backend: wandb
47
+ project: code-completion_lr-sweep
48
+ run_name: hnet_xl_code_lr_5e-4
49
+ entity: null
50
+ base_url: https://wandb.platun0v.ru
51
+ local_dir: ${paths.output_dir}
52
+ paths:
53
+ output_dir: /workspace/byte-llms-code/outputs/lr_sweep/hnet_xl_code_lr_5e-4
54
+ seed: 42
55
+ device: cuda
lr_sweep/hnet_xl_code_lr_5e-4/.hydra/hydra.yaml ADDED
@@ -0,0 +1,166 @@
1
+ hydra:
2
+ run:
3
+ dir: ${paths.output_dir}
4
+ sweep:
5
+ dir: outputs/multirun/${now:%Y-%m-%d}/${now:%H-%M-%S}
6
+ subdir: ${hydra.job.num}
7
+ launcher:
8
+ _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
9
+ sweeper:
10
+ _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
11
+ max_batch_size: null
12
+ params: null
13
+ help:
14
+ app_name: ${hydra.job.name}
15
+ header: '${hydra.help.app_name} is powered by Hydra.
16
+
17
+ '
18
+ footer: 'Powered by Hydra (https://hydra.cc)
19
+
20
+ Use --hydra-help to view Hydra specific help
21
+
22
+ '
23
+ template: '${hydra.help.header}
24
+
25
+ == Configuration groups ==
26
+
27
+ Compose your configuration from those groups (group=option)
28
+
29
+
30
+ $APP_CONFIG_GROUPS
31
+
32
+
33
+ == Config ==
34
+
35
+ Override anything in the config (foo.bar=value)
36
+
37
+
38
+ $CONFIG
39
+
40
+
41
+ ${hydra.help.footer}
42
+
43
+ '
44
+ hydra_help:
45
+ template: 'Hydra (${hydra.runtime.version})
46
+
47
+ See https://hydra.cc for more info.
48
+
49
+
50
+ == Flags ==
51
+
52
+ $FLAGS_HELP
53
+
54
+
55
+ == Configuration groups ==
56
+
57
+ Compose your configuration from those groups (For example, append hydra/job_logging=disabled
58
+ to command line)
59
+
60
+
61
+ $HYDRA_CONFIG_GROUPS
62
+
63
+
64
+ Use ''--cfg hydra'' to Show the Hydra config.
65
+
66
+ '
67
+ hydra_help: ???
68
+ hydra_logging:
69
+ version: 1
70
+ formatters:
71
+ simple:
72
+ format: '[%(asctime)s][HYDRA] %(message)s'
73
+ handlers:
74
+ console:
75
+ class: logging.StreamHandler
76
+ formatter: simple
77
+ stream: ext://sys.stdout
78
+ root:
79
+ level: INFO
80
+ handlers:
81
+ - console
82
+ loggers:
83
+ logging_example:
84
+ level: DEBUG
85
+ disable_existing_loggers: false
86
+ job_logging:
87
+ version: 1
88
+ formatters:
89
+ simple:
90
+ format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
91
+ handlers:
92
+ console:
93
+ class: logging.StreamHandler
94
+ formatter: simple
95
+ stream: ext://sys.stdout
96
+ file:
97
+ class: logging.FileHandler
98
+ formatter: simple
99
+ filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log
100
+ root:
101
+ level: INFO
102
+ handlers:
103
+ - console
104
+ - file
105
+ disable_existing_loggers: false
106
+ env: {}
107
+ mode: RUN
108
+ searchpath: []
109
+ callbacks: {}
110
+ output_subdir: .hydra
111
+ overrides:
112
+ hydra:
113
+ - hydra.mode=RUN
114
+ task:
115
+ - tracking=wandb
116
+ - tracking.project=code-completion_lr-sweep
117
+ - tracking.run_name=hnet_xl_code_lr_5e-4
118
+ - training.lr=5e-4
119
+ - paths.output_dir=/workspace/byte-llms-code/outputs/lr_sweep/hnet_xl_code_lr_5e-4
120
+ - data.path=/workspace/byte-llms-code/code_completion_exp/datasets/data_V4_full
121
+ job:
122
+ name: train
123
+ chdir: false
124
+ override_dirname: data.path=/workspace/byte-llms-code/code_completion_exp/datasets/data_V4_full,paths.output_dir=/workspace/byte-llms-code/outputs/lr_sweep/hnet_xl_code_lr_5e-4,tracking.project=code-completion_lr-sweep,tracking.run_name=hnet_xl_code_lr_5e-4,tracking=wandb,training.lr=5e-4
125
+ id: ???
126
+ num: ???
127
+ config_name: config
128
+ env_set: {}
129
+ env_copy: []
130
+ config:
131
+ override_dirname:
132
+ kv_sep: '='
133
+ item_sep: ','
134
+ exclude_keys: []
135
+ runtime:
136
+ version: 1.3.2
137
+ version_base: '1.3'
138
+ cwd: /workspace/byte-llms-code/code_completion_exp/train_hnet
139
+ config_sources:
140
+ - path: hydra.conf
141
+ schema: pkg
142
+ provider: hydra
143
+ - path: /workspace/byte-llms-code/code_completion_exp/train_hnet/configs
144
+ schema: file
145
+ provider: main
146
+ - path: ''
147
+ schema: structured
148
+ provider: schema
149
+ output_dir: /workspace/byte-llms-code/outputs/lr_sweep/hnet_xl_code_lr_5e-4
150
+ choices:
151
+ paths: default
152
+ tracking: wandb
153
+ logging: default
154
+ data: default
155
+ training: default
156
+ model: hnet_xl_code
157
+ hydra/env: default
158
+ hydra/callbacks: null
159
+ hydra/job_logging: default
160
+ hydra/hydra_logging: default
161
+ hydra/hydra_help: default
162
+ hydra/help: default
163
+ hydra/sweeper: basic
164
+ hydra/launcher: basic
165
+ hydra/output: default
166
+ verbose: false
lr_sweep/hnet_xl_code_lr_5e-4/.hydra/overrides.yaml ADDED
@@ -0,0 +1,6 @@
1
+ - tracking=wandb
2
+ - tracking.project=code-completion_lr-sweep
3
+ - tracking.run_name=hnet_xl_code_lr_5e-4
4
+ - training.lr=5e-4
5
+ - paths.output_dir=/workspace/byte-llms-code/outputs/lr_sweep/hnet_xl_code_lr_5e-4
6
+ - data.path=/workspace/byte-llms-code/code_completion_exp/datasets/data_V4_full
lr_sweep/hnet_xl_code_lr_5e-4/model_final.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9ca40ecd3344a7b1e8dd106b2e920a9fd77225fbebe1fc4969d809db4074cf77
3
+ size 3315165484
lr_sweep/hnet_xl_code_lr_5e-4/wandb/debug-internal.log ADDED
@@ -0,0 +1,13 @@
1
+ {"time":"2026-04-26T00:19:41.489729754Z","level":"INFO","msg":"stream: starting","core version":"0.24.0"}
2
+ {"time":"2026-04-26T00:19:41.862602753Z","level":"INFO","msg":"stream: created new stream","id":"i6lt8av0"}
3
+ {"time":"2026-04-26T00:19:41.862649982Z","level":"INFO","msg":"handler: started","stream_id":"i6lt8av0"}
4
+ {"time":"2026-04-26T00:19:41.862747075Z","level":"INFO","msg":"stream: started","id":"i6lt8av0"}
5
+ {"time":"2026-04-26T00:19:41.862757167Z","level":"INFO","msg":"writer: started","stream_id":"i6lt8av0"}
6
+ {"time":"2026-04-26T00:19:41.862759475Z","level":"INFO","msg":"sender: started","stream_id":"i6lt8av0"}
7
+ {"time":"2026-04-26T00:19:41.985346468Z","level":"ERROR","msg":"git repo not found","error":"repository does not exist"}
8
+ {"time":"2026-04-26T02:18:38.932345179Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
9
+ {"time":"2026-04-26T02:18:39.035750391Z","level":"INFO","msg":"handler: operation stats","stats":{}}
10
+ {"time":"2026-04-26T02:18:39.038119733Z","level":"INFO","msg":"stream: closing","id":"i6lt8av0"}
11
+ {"time":"2026-04-26T02:18:39.038132189Z","level":"INFO","msg":"handler: closed","stream_id":"i6lt8av0"}
12
+ {"time":"2026-04-26T02:18:39.038193257Z","level":"INFO","msg":"sender: closed","stream_id":"i6lt8av0"}
13
+ {"time":"2026-04-26T02:18:39.038198898Z","level":"INFO","msg":"stream: closed","id":"i6lt8av0"}
lr_sweep/hnet_xl_code_lr_5e-4/wandb/debug.log ADDED
@@ -0,0 +1,24 @@
1
+ 2026-04-26 00:19:41,198 INFO MainThread:257611 [wandb_setup.py:_flush():81] Current SDK version is 0.24.0
2
+ 2026-04-26 00:19:41,198 INFO MainThread:257611 [wandb_setup.py:_flush():81] Configure stats pid to 257611
3
+ 2026-04-26 00:19:41,198 INFO MainThread:257611 [wandb_setup.py:_flush():81] Loading settings from environment variables
4
+ 2026-04-26 00:19:41,198 INFO MainThread:257611 [wandb_init.py:setup_run_log_directory():717] Logging user logs to /workspace/byte-llms-code/outputs/lr_sweep/hnet_xl_code_lr_5e-4/wandb/run-20260426_001941-i6lt8av0/logs/debug.log
5
+ 2026-04-26 00:19:41,198 INFO MainThread:257611 [wandb_init.py:setup_run_log_directory():718] Logging internal logs to /workspace/byte-llms-code/outputs/lr_sweep/hnet_xl_code_lr_5e-4/wandb/run-20260426_001941-i6lt8av0/logs/debug-internal.log
6
+ 2026-04-26 00:19:41,198 INFO MainThread:257611 [wandb_init.py:init():844] calling init triggers
7
+ 2026-04-26 00:19:41,198 INFO MainThread:257611 [wandb_init.py:init():849] wandb.init called with sweep_config: {}
8
+ config: {'model': {'config_path': '/workspace/byte-llms-code/hnet_project/configs/hnet_2stage_XL_code.json', 'checkpoint_path': '/workspace/byte-llms-code/hnet_project/checkpoints/hnet_2stage_XL_code.pt'}, 'training': {'epochs': 1, 'batch_size': 4, 'eval_batch_size': 24, 'gradient_accumulation_steps': 4, 'lr': 0.0005, 'weight_decay': 0.1, 'betas': [0.9, 0.95], 'eps': 1e-08, 'lr_scheduler': 'wsd', 'warmup_ratio': 0.1, 'decay_ratio': 0.2, 'warmup_steps': 100, 'min_lr_ratio': 0.1, 'lr_multiplier': [2.0, 1.5, 1.0], 'load_balancing_weight': 0.01, 'load_balancing_N': 4.0, 'max_grad_norm': 1.0, 'use_amp': True, 'resume': False, 'resume_checkpoint': None, 'warmup_model': True}, 'data': {'path': '/workspace/byte-llms-code/code_completion_exp/datasets/data_V4_full', 'max_context_len': 4096, 'max_target_len': 256, 'num_workers': 0, 'pin_memory': True, 'max_train_samples': None, 'max_val_samples': 2000}, 'logging': {'log_interval': 10, 'save_interval': 0, 'eval_interval': 2000, 'save_every_epoch': False}, 'tracking': {'enabled': True, 'backend': 'wandb', 'project': 'code-completion_lr-sweep', 'run_name': 'hnet_xl_code_lr_5e-4', 'entity': None, 'base_url': 'https://wandb.platun0v.ru', 'local_dir': '/workspace/byte-llms-code/outputs/lr_sweep/hnet_xl_code_lr_5e-4'}, 'paths': {'output_dir': '/workspace/byte-llms-code/outputs/lr_sweep/hnet_xl_code_lr_5e-4'}, 'seed': 42, 'device': 'cuda', '_wandb': {'code_path': 'code/code_completion_exp/train_hnet/train.py'}}
9
+ 2026-04-26 00:19:41,199 INFO MainThread:257611 [wandb_init.py:init():892] starting backend
10
+ 2026-04-26 00:19:41,468 INFO MainThread:257611 [wandb_init.py:init():895] sending inform_init request
11
+ 2026-04-26 00:19:41,488 INFO MainThread:257611 [wandb_init.py:init():903] backend started and connected
12
+ 2026-04-26 00:19:41,491 INFO MainThread:257611 [wandb_init.py:init():973] updated telemetry
13
+ 2026-04-26 00:19:41,508 INFO MainThread:257611 [wandb_init.py:init():997] communicating run to backend with 90.0 second timeout
14
+ 2026-04-26 00:19:41,984 INFO MainThread:257611 [wandb_init.py:init():1044] starting run threads in backend
15
+ 2026-04-26 00:19:42,142 INFO MainThread:257611 [wandb_run.py:_console_start():2529] atexit reg
16
+ 2026-04-26 00:19:42,142 INFO MainThread:257611 [wandb_run.py:_redirect():2377] redirect: wrap_raw
17
+ 2026-04-26 00:19:42,142 INFO MainThread:257611 [wandb_run.py:_redirect():2446] Wrapping output streams.
18
+ 2026-04-26 00:19:42,142 INFO MainThread:257611 [wandb_run.py:_redirect():2469] Redirects installed.
19
+ 2026-04-26 00:19:42,145 INFO MainThread:257611 [wandb_init.py:init():1084] run started, returning control to user process
20
+ 2026-04-26 02:18:37,753 INFO MainThread:257611 [wandb_run.py:_finish():2295] finishing run nikita/code-completion_lr-sweep/i6lt8av0
21
+ 2026-04-26 02:18:37,754 INFO MainThread:257611 [wandb_run.py:_atexit_cleanup():2494] got exitcode: 0
22
+ 2026-04-26 02:18:37,754 INFO MainThread:257611 [wandb_run.py:_restore():2476] restore
23
+ 2026-04-26 02:18:37,754 INFO MainThread:257611 [wandb_run.py:_restore():2482] restore done
24
+ 2026-04-26 02:18:39,037 INFO MainThread:257611 [wandb_run.py:_footer_sync_info():3870] logging synced files
lr_sweep/hnet_xl_code_lr_5e-4/wandb/run-20260426_001941-i6lt8av0/files/code/code_completion_exp/train_hnet/train.py ADDED
@@ -0,0 +1,284 @@
1
+ """
2
+ Training pipeline for the HNet model on the code completion task.
3
+
4
+ Configuration via Hydra + OmegaConf, logging to Trackio.
5
+ DDP support via Accelerate for multi-GPU training.
6
+
7
+ Usage:
8
+ # Basic run (single GPU)
9
+ python train.py
10
+
11
+ # Multi-GPU с Accelerate
12
+ accelerate launch train.py
13
+
14
+ # Multi-GPU with an explicit number of GPUs
15
+ accelerate launch --num_processes=4 train.py
16
+
17
+ # Override parameters from the CLI
18
+ python train.py training.lr=1e-4 training.epochs=5
19
+
20
+ # Use a different model config
21
+ python train.py model=hnet_small
22
+
23
+ # Multirun (sweep)
24
+ python train.py --multirun training.lr=1e-4,3e-4,1e-3
25
+
26
+ # Disable logging
27
+ python train.py tracking.enabled=false
28
+ """
29
+
30
+ import os
31
+ import math
32
+ from pathlib import Path
33
+
34
+ import torch
35
+ import hydra
36
+ from hydra.core.hydra_config import HydraConfig
37
+ from omegaconf import DictConfig, OmegaConf
38
+ from accelerate import Accelerator
39
+ from accelerate.utils import set_seed as accelerate_set_seed
40
+
41
+ # HNet imports
42
+ from hnet.load_utils import load_from_pretrained, load_from_config
43
+ from hnet.utils.tokenizers import ByteTokenizer
44
+ from hnet.utils.train import group_params
45
+
46
+ # Ensure repo root is on sys.path (needed when running from subdirectory)
47
+ import sys
48
+ sys.path.insert(0, str(Path(__file__).resolve().parents[2]))
49
+
50
+ # Shared training library
51
+ from training_lib.utils import log_message
52
+ from training_lib.checkpointing import save_checkpoint, load_checkpoint
53
+ from training_lib.schedulers import get_lr_scheduler
54
+ from training_lib.tracking import init_tracking, finish_tracking
55
+ from training_lib.hnet.train_loop import train_epoch
56
+ from training_lib.hnet.data import create_dataloaders
57
+
58
+
59
+ @hydra.main(version_base=None, config_path="configs", config_name="config")
60
+ def main(cfg: DictConfig):
61
+ """Main training function with DDP support via Accelerate."""
62
+
63
+ # === Accelerator Setup ===
64
+ mixed_precision = "bf16" if cfg.training.use_amp else "no"
65
+
66
+ accelerator = Accelerator(
67
+ mixed_precision=mixed_precision,
68
+ gradient_accumulation_steps=cfg.training.gradient_accumulation_steps,
69
+ )
70
+
71
+ # === Setup ===
72
+ accelerate_set_seed(cfg.seed)
73
+
74
+ if cfg.paths.output_dir is None:
75
+ cfg.paths.output_dir = HydraConfig.get().runtime.output_dir
76
+
77
+ OmegaConf.resolve(cfg)
78
+
79
+ log_message(
80
+ f"CUDA_VISIBLE_DEVICES: {os.environ.get('CUDA_VISIBLE_DEVICES', 'not set')}",
81
+ cfg,
82
+ accelerator,
83
+ )
84
+ log_message(f"Number of processes: {accelerator.num_processes}", cfg, accelerator)
85
+ log_message(f"Process index: {accelerator.process_index}", cfg, accelerator)
86
+ log_message(f"Mixed precision: {mixed_precision}", cfg, accelerator)
87
+
88
+ log_message("=" * 60, cfg, accelerator)
89
+ log_message(
90
+ "HNet Training Pipeline (Hydra + Trackio + Accelerate)", cfg, accelerator
91
+ )
92
+ log_message("=" * 60, cfg, accelerator)
93
+ log_message(f"Config:\n{OmegaConf.to_yaml(cfg)}", cfg, accelerator)
94
+
95
+ # === Trackio Init ===
96
+ init_tracking(cfg, accelerator)
97
+
98
+ # === Tokenizer ===
99
+ log_message("Initializing tokenizer...", cfg, accelerator)
100
+ tokenizer = ByteTokenizer()
101
+
102
+ # === Model ===
103
+ log_message("Loading model...", cfg, accelerator)
104
+ if cfg.model.checkpoint_path:
105
+ model = load_from_pretrained(
106
+ model_path=cfg.model.checkpoint_path,
107
+ model_config_path=cfg.model.config_path,
108
+ )
109
+ log_message(f"Loaded pretrained: {cfg.model.checkpoint_path}", cfg, accelerator)
110
+ else:
111
+ model = load_from_config(
112
+ model_config_path=cfg.model.config_path,
113
+ device="cpu",
114
+ )
115
+ model.init_weights()
116
+ log_message("Initialized from scratch", cfg, accelerator)
117
+
118
+ model.train()
119
+
120
+ # Per-stage LR multipliers (must be applied before prepare!)
121
+ lr_multiplier = list(cfg.training.lr_multiplier)
122
+ model.apply_lr_multiplier(lr_multiplier)
123
+ log_message(f"Applied LR multipliers: {lr_multiplier}", cfg, accelerator)
124
+
125
+ # Warm up Triton kernels
126
+ if cfg.training.warmup_model:
127
+ log_message("Warming up model...", cfg, accelerator)
128
+ model = model.to(accelerator.device)
129
+ model.warmup(verbose=accelerator.is_main_process)
130
+
131
+ # Log model info
132
+ total_params = sum(p.numel() for p in model.parameters())
133
+ trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
134
+ log_message(f"Total params: {total_params:,}", cfg, accelerator)
135
+ log_message(f"Trainable params: {trainable_params:,}", cfg, accelerator)
136
+
137
+ # === Data ===
138
+ log_message("Creating dataloaders...", cfg, accelerator)
139
+ dataloaders = create_dataloaders(cfg, tokenizer)
140
+
141
+ train_dataloader = dataloaders["train"]
142
+ val_dataloader = dataloaders.get("validation", None)
143
+
144
+ log_message(
145
+ f"Train dataset size: {len(train_dataloader.dataset)}", cfg, accelerator
146
+ )
147
+ log_message(
148
+ f"Train batches per epoch (before DDP split): {len(train_dataloader)}",
149
+ cfg,
150
+ accelerator,
151
+ )
152
+
153
+ if val_dataloader:
154
+ log_message(
155
+ f"Validation dataset size: {len(val_dataloader.dataset)}", cfg, accelerator
156
+ )
157
+ log_message(f"Validation batches: {len(val_dataloader)}", cfg, accelerator)
158
+ else:
159
+ log_message("No validation dataset found", cfg, accelerator)
160
+
161
+ # === Optimizer ===
162
+ log_message("Creating optimizer...", cfg, accelerator)
163
+ param_groups = group_params(model)
164
+
165
+ for group in param_groups:
166
+ if "lr" not in group:
167
+ group["lr"] = cfg.training.lr
168
+ else:
169
+ group["lr"] = cfg.training.lr * group.get("lr_multiplier", 1.0)
170
+ if "weight_decay" not in group:
171
+ group["weight_decay"] = cfg.training.weight_decay
172
+
173
+ optimizer = torch.optim.AdamW(
174
+ param_groups,
175
+ lr=cfg.training.lr,
176
+ betas=tuple(cfg.training.betas),
177
+ eps=cfg.training.eps,
178
+ )
179
+
180
+ # === Scheduler ===
181
+ steps_per_epoch = math.ceil(len(train_dataloader) / accelerator.num_processes)
182
+ total_steps = (
183
+ cfg.training.epochs
184
+ * steps_per_epoch
185
+ // cfg.training.gradient_accumulation_steps
186
+ )
187
+ scheduler = get_lr_scheduler(optimizer, cfg, total_steps)
188
+
189
+ log_message(
190
+ f"Total steps: {total_steps}, Steps per epoch: {steps_per_epoch}",
191
+ cfg,
192
+ accelerator,
193
+ )
194
+
195
+ # === Accelerate Prepare ===
196
+ log_message(
197
+ "Preparing model, optimizer, and dataloaders with Accelerate...",
198
+ cfg,
199
+ accelerator,
200
+ )
201
+
202
+ if val_dataloader is not None:
203
+ model, optimizer, train_dataloader, val_dataloader, scheduler = (
204
+ accelerator.prepare(
205
+ model, optimizer, train_dataloader, val_dataloader, scheduler
206
+ )
207
+ )
208
+ else:
209
+ model, optimizer, train_dataloader, scheduler = accelerator.prepare(
210
+ model, optimizer, train_dataloader, scheduler
211
+ )
212
+
213
+ log_message(
214
+ f"Train batches per epoch (after DDP split): {len(train_dataloader)}",
215
+ cfg,
216
+ accelerator,
217
+ )
218
+
219
+ # === Resume ===
220
+ global_step = 0
221
+ start_epoch = 1
222
+
223
+ if cfg.training.resume and cfg.training.resume_checkpoint:
224
+ global_step, start_epoch = load_checkpoint(
225
+ model,
226
+ optimizer,
227
+ scheduler,
228
+ cfg.training.resume_checkpoint,
229
+ cfg,
230
+ accelerator,
231
+ )
232
+ start_epoch += 1
233
+
234
+ # === Training Loop ===
235
+ log_message("Starting training...", cfg, accelerator)
236
+
237
+ best_val_loss = float("inf")
238
+
239
+ try:
240
+ for epoch in range(start_epoch, cfg.training.epochs + 1):
241
+ log_message(f"\n{'=' * 60}", cfg, accelerator)
242
+ log_message(f"EPOCH {epoch}/{cfg.training.epochs}", cfg, accelerator)
243
+ log_message(f"{'=' * 60}", cfg, accelerator)
244
+
245
+ global_step, best_val_loss = train_epoch(
246
+ model=model,
247
+ dataloader=train_dataloader,
248
+ optimizer=optimizer,
249
+ scheduler=scheduler,
250
+ cfg=cfg,
251
+ epoch=epoch,
252
+ global_step=global_step,
253
+ accelerator=accelerator,
254
+ val_dataloader=val_dataloader,
255
+ best_val_loss=best_val_loss,
256
+ )
257
+
258
+ if cfg.logging.save_every_epoch:
259
+ save_checkpoint(
260
+ model, optimizer, scheduler, global_step, epoch, cfg, accelerator
261
+ )
262
+
263
+ except KeyboardInterrupt:
264
+ log_message("Training interrupted by user", cfg, accelerator)
265
+ save_checkpoint(
266
+ model, optimizer, scheduler, global_step, epoch, cfg, accelerator
267
+ )
268
+
269
+ # === Final Save ===
270
+ log_message("\nTraining completed!", cfg, accelerator)
271
+
272
+ if accelerator.is_main_process:
273
+ final_model_path = Path(cfg.paths.output_dir) / "model_final.pt"
274
+ unwrapped_model = accelerator.unwrap_model(model)
275
+ torch.save(unwrapped_model.state_dict(), final_model_path)
276
+ log_message(f"Final model: {final_model_path}", cfg, accelerator)
277
+
278
+ accelerator.wait_for_everyone()
279
+ accelerator.end_training()
280
+ finish_tracking()
281
+
282
+
283
+ if __name__ == "__main__":
284
+ main()
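The scheduler factory get_lr_scheduler is imported from training_lib, which is not part of this commit. For reference only, a minimal warmup-stable-decay ("wsd") schedule consistent with the config keys used here (warmup_steps, decay_ratio, min_lr_ratio) could be built as a LambdaLR; this is a sketch, not the committed implementation:

    from torch.optim.lr_scheduler import LambdaLR

    def wsd_lambda(total_steps, warmup_steps=100, decay_ratio=0.2, min_lr_ratio=0.1):
        decay_start = int(total_steps * (1.0 - decay_ratio))

        def fn(step):
            if step < warmup_steps:                      # linear warmup
                return step / max(1, warmup_steps)
            if step < decay_start:                       # stable phase at peak lr
                return 1.0
            frac = (step - decay_start) / max(1, total_steps - decay_start)
            return 1.0 - (1.0 - min_lr_ratio) * min(1.0, frac)  # linear decay to min_lr_ratio

        return fn

    # scheduler = LambdaLR(optimizer, lr_lambda=wsd_lambda(total_steps))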
lr_sweep/hnet_xl_code_lr_5e-4/wandb/run-20260426_001941-i6lt8av0/files/config.yaml ADDED
@@ -0,0 +1,151 @@
1
+ _wandb:
2
+ value:
3
+ cli_version: 0.24.0
4
+ code_path: code/code_completion_exp/train_hnet/train.py
5
+ e:
6
+ dyau2sb185lrum72o3jj1efzj126ey29:
7
+ args:
8
+ - tracking=wandb
9
+ - tracking.project=code-completion_lr-sweep
10
+ - tracking.run_name=hnet_xl_code_lr_5e-4
11
+ - training.lr=5e-4
12
+ - paths.output_dir=/workspace/byte-llms-code/outputs/lr_sweep/hnet_xl_code_lr_5e-4
13
+ - data.path=/workspace/byte-llms-code/code_completion_exp/datasets/data_V4_full
14
+ codePath: code_completion_exp/train_hnet/train.py
15
+ codePathLocal: train.py
16
+ cpu_count: 64
17
+ cpu_count_logical: 128
18
+ cudaVersion: "12.2"
19
+ disk:
20
+ /:
21
+ total: "265214230528"
22
+ used: "170465628160"
23
+ email: nikita@local.ru
24
+ executable: /venv/bytellm/bin/python
25
+ git:
26
+ commit: 0a7180b6ab9f63d2794494f09ec4918576d10fa2
27
+ remote: https://github.com/naryst/byte-llms-code.git
28
+ gpu: NVIDIA H100 80GB HBM3
29
+ gpu_count: 4
30
+ gpu_nvidia:
31
+ - architecture: Hopper
32
+ cudaCores: 16896
33
+ memoryTotal: "85520809984"
34
+ name: NVIDIA H100 80GB HBM3
35
+ uuid: GPU-b60cdcab-2033-2009-41de-be646c953a20
36
+ - architecture: Hopper
37
+ cudaCores: 16896
38
+ memoryTotal: "85520809984"
39
+ name: NVIDIA H100 80GB HBM3
40
+ uuid: GPU-9982b420-4520-4238-c378-ec5a46015474
41
+ - architecture: Hopper
42
+ cudaCores: 16896
43
+ memoryTotal: "85520809984"
44
+ name: NVIDIA H100 80GB HBM3
45
+ uuid: GPU-e26ebaac-aaa6-3eed-17ab-a3dce303a76f
46
+ - architecture: Hopper
47
+ cudaCores: 16896
48
+ memoryTotal: "85520809984"
49
+ name: NVIDIA H100 80GB HBM3
50
+ uuid: GPU-9dfc6dba-0be6-4a10-1027-336cc0e65134
51
+ host: 7504e518d24a
52
+ memory:
53
+ total: "1081679683584"
54
+ os: Linux-5.4.0-176-generic-x86_64-with-glibc2.35
55
+ program: /workspace/byte-llms-code/code_completion_exp/train_hnet/train.py
56
+ python: CPython 3.12.0
57
+ root: /workspace/byte-llms-code/outputs/lr_sweep/hnet_xl_code_lr_5e-4
58
+ startedAt: "2026-04-26T00:19:41.197293Z"
59
+ writerId: dyau2sb185lrum72o3jj1efzj126ey29
60
+ m: []
61
+ python_version: 3.12.0
62
+ t:
63
+ "1":
64
+ - 1
65
+ - 11
66
+ - 49
67
+ - 50
68
+ - 51
69
+ - 71
70
+ - 105
71
+ "2":
72
+ - 1
73
+ - 11
74
+ - 49
75
+ - 50
76
+ - 51
77
+ - 71
78
+ - 105
79
+ "3":
80
+ - 2
81
+ - 13
82
+ - 16
83
+ - 61
84
+ "4": 3.12.0
85
+ "5": 0.24.0
86
+ "6": 4.57.6
87
+ "12": 0.24.0
88
+ "13": linux-x86_64
89
+ data:
90
+ value:
91
+ max_context_len: 4096
92
+ max_target_len: 256
93
+ max_train_samples: null
94
+ max_val_samples: 2000
95
+ num_workers: 0
96
+ path: /workspace/byte-llms-code/code_completion_exp/datasets/data_V4_full
97
+ pin_memory: true
98
+ device:
99
+ value: cuda
100
+ logging:
101
+ value:
102
+ eval_interval: 2000
103
+ log_interval: 10
104
+ save_every_epoch: false
105
+ save_interval: 0
106
+ model:
107
+ value:
108
+ checkpoint_path: /workspace/byte-llms-code/hnet_project/checkpoints/hnet_2stage_XL_code.pt
109
+ config_path: /workspace/byte-llms-code/hnet_project/configs/hnet_2stage_XL_code.json
110
+ paths:
111
+ value:
112
+ output_dir: /workspace/byte-llms-code/outputs/lr_sweep/hnet_xl_code_lr_5e-4
113
+ seed:
114
+ value: 42
115
+ tracking:
116
+ value:
117
+ backend: wandb
118
+ base_url: https://wandb.platun0v.ru
119
+ enabled: true
120
+ entity: null
121
+ local_dir: /workspace/byte-llms-code/outputs/lr_sweep/hnet_xl_code_lr_5e-4
122
+ project: code-completion_lr-sweep
123
+ run_name: hnet_xl_code_lr_5e-4
124
+ training:
125
+ value:
126
+ batch_size: 4
127
+ betas:
128
+ - 0.9
129
+ - 0.95
130
+ decay_ratio: 0.2
131
+ epochs: 1
132
+ eps: 1e-08
133
+ eval_batch_size: 24
134
+ gradient_accumulation_steps: 4
135
+ load_balancing_N: 4
136
+ load_balancing_weight: 0.01
137
+ lr: 0.0005
138
+ lr_multiplier:
139
+ - 2
140
+ - 1.5
141
+ - 1
142
+ lr_scheduler: wsd
143
+ max_grad_norm: 1
144
+ min_lr_ratio: 0.1
145
+ resume: false
146
+ resume_checkpoint: null
147
+ use_amp: true
148
+ warmup_model: true
149
+ warmup_ratio: 0.1
150
+ warmup_steps: 100
151
+ weight_decay: 0.1
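For orientation, this logged config implies 4 (batch_size) x 4 (gradient_accumulation_steps) = 16 context/target pairs per optimizer step per process; if all four visible H100s were used via accelerate launch, that is 4 x 4 x 4 = 64 pairs per update. The actual process count is not recorded in this file.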
lr_sweep/hnet_xl_code_lr_5e-4/wandb/run-20260426_001941-i6lt8av0/files/output.log ADDED
The diff for this file is too large to render. See raw diff
 
lr_sweep/hnet_xl_code_lr_5e-4/wandb/run-20260426_001941-i6lt8av0/files/requirements.txt ADDED
@@ -0,0 +1,245 @@
1
+ setuptools==78.1.1
2
+ wheel==0.45.1
3
+ pip==25.2
4
+ webencodings==0.5.1
5
+ triton==3.2.0
6
+ pytz==2025.2
7
+ pydub==0.25.1
8
+ pure_eval==0.2.3
9
+ ptyprocess==0.7.0
10
+ nvidia-ml-py==13.590.48
11
+ nvidia-cusparselt-cu12==0.6.2
12
+ mpmath==1.3.0
13
+ ipython-genutils==0.2.0
14
+ fastjsonschema==2.21.2
15
+ brotli==1.2.0
16
+ antlr4-python3-runtime==4.9.3
17
+ xxhash==3.6.0
18
+ widgetsnbextension==4.0.14
19
+ websocket-client==1.9.0
20
+ webcolors==24.11.1
21
+ wcwidth==0.2.14
22
+ urllib3==2.5.0
23
+ uri-template==1.3.0
24
+ tzdata==2025.2
25
+ typing_extensions==4.15.0
26
+ types-python-dateutil==2.9.0.20251008
27
+ traitlets==5.14.3
28
+ tqdm==4.67.1
29
+ tornado==6.5.2
30
+ tomlkit==0.13.3
31
+ tinycss2==1.4.0
32
+ tabulate==0.9.0
33
+ sympy==1.13.1
34
+ soupsieve==2.8
35
+ sniffio==1.3.1
36
+ smmap==5.0.2
37
+ six==1.17.0
38
+ shellingham==1.5.4
39
+ Send2Trash==1.8.3
40
+ semantic-version==2.10.0
41
+ safetensors==0.6.2
42
+ rpds-py==0.27.1
43
+ rfc3986-validator==0.1.1
44
+ regex==2025.9.18
45
+ pyzmq==27.1.0
46
+ PyYAML==6.0.3
47
+ python-multipart==0.0.22
48
+ python-json-logger==4.0.0
49
+ python-dotenv==1.2.1
50
+ pyparsing==3.2.5
51
+ PyJWT==2.8.0
52
+ Pygments==2.19.2
53
+ pycparser==2.23
54
+ pyarrow==22.0.0
55
+ psutil==7.1.0
56
+ protobuf==6.33.4
57
+ propcache==0.4.1
58
+ prometheus_client==0.23.1
59
+ portalocker==3.2.0
60
+ platformdirs==4.5.0
61
+ pillow==11.3.0
62
+ pexpect==4.9.0
63
+ pathspec==1.0.4
64
+ parso==0.8.5
65
+ pandocfilters==1.5.1
66
+ packaging==25.0
67
+ orjson==3.11.6
68
+ opt_einsum==3.4.0
69
+ nvidia-nvtx-cu12==12.4.127
70
+ nvidia-nvjitlink-cu12==12.4.127
71
+ nvidia-nccl-cu12==2.21.5
72
+ nvidia-curand-cu12==10.3.5.147
73
+ nvidia-cufile-cu12==1.13.1.3
74
+ nvidia-cufft-cu12==11.2.1.3
75
+ nvidia-cuda-runtime-cu12==12.4.127
76
+ nvidia-cuda-nvrtc-cu12==12.4.127
77
+ nvidia-cuda-cupti-cu12==12.4.127
78
+ nvidia-cublas-cu12==12.4.5.8
79
+ numpy==2.3.3
80
+ ninja==1.13.0
81
+ networkx==3.5
82
+ nest-asyncio==1.6.0
83
+ narwhals==2.15.0
84
+ mypy_extensions==1.1.0
85
+ multidict==6.7.0
86
+ mistune==3.1.4
87
+ mdurl==0.1.2
88
+ MarkupSafe==3.0.3
89
+ lxml==6.0.2
90
+ librt==0.8.0
91
+ lark==1.3.0
92
+ kiwisolver==1.4.9
93
+ jupyterlab_widgets==3.0.15
94
+ jupyterlab_pygments==0.3.0
95
+ jsonpointer==3.0.0
96
+ json5==0.12.1
97
+ itsdangerous==2.2.0
98
+ idna==3.10
99
+ hf-xet==1.1.10
100
+ h11==0.16.0
101
+ groovy==0.1.2
102
+ fsspec==2025.9.0
103
+ frozenlist==1.8.0
104
+ fqdn==1.5.1
105
+ fonttools==4.60.1
106
+ filelock==3.19.1
107
+ ffmpy==1.0.0
108
+ executing==2.2.1
109
+ einops==0.8.1
110
+ dill==0.4.0
111
+ defusedxml==0.7.1
112
+ decorator==5.2.1
113
+ debugpy==1.8.17
114
+ dacite==1.9.2
115
+ cycler==0.12.1
116
+ comm==0.2.3
117
+ colorama==0.4.6
118
+ click==8.3.1
119
+ charset-normalizer==3.4.3
120
+ certifi==2025.10.5
121
+ bleach==6.2.0
122
+ babel==2.17.0
123
+ attrs==25.4.0
124
+ async-lru==2.0.5
125
+ asttokens==3.0.0
126
+ annotated-types==0.7.0
127
+ annotated-doc==0.0.4
128
+ aiohappyeyeballs==2.6.1
129
+ aiofiles==24.1.0
130
+ yarl==1.22.0
131
+ uvicorn==0.40.0
132
+ typing-inspection==0.4.2
133
+ terminado==0.18.1
134
+ stack-data==0.6.3
135
+ sentry-sdk==2.50.0
136
+ scipy==1.17.0
137
+ sacrebleu==2.6.0
138
+ rfc3987-syntax==1.1.0
139
+ rfc3339-validator==0.1.4
140
+ requests==2.32.5
141
+ reportlab==4.4.9
142
+ referencing==0.36.2
143
+ python-dateutil==2.9.0.post0
144
+ pydantic_core==2.41.5
145
+ prompt_toolkit==3.0.52
146
+ plotly==6.5.2
147
+ pathlib2==2.3.7.post1
148
+ orderedmultidict==1.0.2
149
+ optree==0.17.0
150
+ omegaconf==2.3.0
151
+ nvidia-cusparse-cu12==12.3.1.170
152
+ nvidia-cudnn-cu12==9.1.0.70
153
+ mypy==1.19.1
154
+ multiprocess==0.70.16
155
+ matplotlib-inline==0.1.7
156
+ markdown-it-py==4.0.0
157
+ jupyter_core==5.8.1
158
+ Jinja2==3.1.6
159
+ jedi==0.19.2
160
+ ipython_pygments_lexers==1.1.1
161
+ httpcore==1.0.9
162
+ gitdb==4.0.12
163
+ ftfy==6.3.1
164
+ contourpy==1.3.3
165
+ cffi==2.0.0
166
+ beautifulsoup4==4.14.2
167
+ anyio==4.11.0
168
+ aiosignal==1.4.0
169
+ starlette==0.50.0
170
+ rich==14.2.0
171
+ pydantic==2.12.5
172
+ pandas==2.3.3
173
+ nvidia-cusolver-cu12==11.6.1.9
174
+ matplotlib==3.10.7
175
+ jupyter_server_terminals==0.5.3
176
+ jupyter_client==8.6.3
177
+ jsonschema-specifications==2025.9.1
178
+ ipython==9.6.0
179
+ hydra-core==1.3.2
180
+ huggingface-hub==0.35.3
181
+ httpx==0.28.1
182
+ GitPython==3.1.46
183
+ furl==2.1.4
184
+ cryptography==46.0.4
185
+ arrow==1.3.0
186
+ argon2-cffi-bindings==25.1.0
187
+ aiohttp==3.13.1
188
+ wandb==0.24.0
189
+ typer==0.21.1
190
+ torch==2.6.0
191
+ tokenizers==0.22.1
192
+ seaborn==0.13.2
193
+ safehttpx==0.1.7
194
+ jsonschema==4.25.1
195
+ joypy==0.2.6
196
+ isoduration==20.11.0
197
+ ipywidgets==8.1.7
198
+ ipykernel==6.30.1
199
+ gradio_client==2.0.3
200
+ fastapi==0.128.0
201
+ Authlib==1.6.6
202
+ argon2-cffi==25.1.0
203
+ transformers==4.57.6
204
+ nbformat==5.10.4
205
+ mlstm_kernels==2.0.2
206
+ jupyter-console==6.6.3
207
+ gradio==6.5.1
208
+ datasets==4.3.0
209
+ clearml==1.16.4
210
+ accelerate==1.10.1
211
+ xlstm==2.0.4
212
+ nbclient==0.10.2
213
+ jupyter-events==0.12.0
214
+ trackio==0.15.0
215
+ nbconvert==7.16.6
216
+ jupyter_server==2.17.0
217
+ notebook_shim==0.2.4
218
+ jupyterlab_server==2.27.3
219
+ jupyter-lsp==2.3.0
220
+ nbclassic==1.3.3
221
+ jupyterlab==4.4.9
222
+ notebook==7.4.7
223
+ jupyter_contrib_core==0.4.2
224
+ jupyter==1.1.1
225
+ jupyter_nbextensions_configurator==0.6.4
226
+ causal-conv1d==1.5.0.post8
227
+ flash_attn==2.7.4.post1
228
+ mamba-ssm==2.2.4
229
+ hnet==0.0.1
230
+ autocommand==2.2.2
231
+ backports.tarfile==1.2.0
232
+ importlib_metadata==8.0.0
233
+ inflect==7.3.1
234
+ jaraco.collections==5.1.0
235
+ jaraco.context==5.3.0
236
+ jaraco.functools==4.0.1
237
+ jaraco.text==3.12.1
238
+ more-itertools==10.3.0
239
+ packaging==24.2
240
+ platformdirs==4.2.2
241
+ tomli==2.0.1
242
+ typeguard==4.3.0
243
+ typing_extensions==4.12.2
244
+ wheel==0.45.1
245
+ zipp==3.19.2
lr_sweep/hnet_xl_code_lr_5e-4/wandb/run-20260426_001941-i6lt8av0/files/wandb-metadata.json ADDED
@@ -0,0 +1,69 @@
1
+ {
2
+ "os": "Linux-5.4.0-176-generic-x86_64-with-glibc2.35",
3
+ "python": "CPython 3.12.0",
4
+ "startedAt": "2026-04-26T00:19:41.197293Z",
5
+ "args": [
6
+ "tracking=wandb",
7
+ "tracking.project=code-completion_lr-sweep",
8
+ "tracking.run_name=hnet_xl_code_lr_5e-4",
9
+ "training.lr=5e-4",
10
+ "paths.output_dir=/workspace/byte-llms-code/outputs/lr_sweep/hnet_xl_code_lr_5e-4",
11
+ "data.path=/workspace/byte-llms-code/code_completion_exp/datasets/data_V4_full"
12
+ ],
13
+ "program": "/workspace/byte-llms-code/code_completion_exp/train_hnet/train.py",
14
+ "codePath": "code_completion_exp/train_hnet/train.py",
15
+ "codePathLocal": "train.py",
16
+ "git": {
17
+ "remote": "https://github.com/naryst/byte-llms-code.git",
18
+ "commit": "0a7180b6ab9f63d2794494f09ec4918576d10fa2"
19
+ },
20
+ "email": "nikita@local.ru",
21
+ "root": "/workspace/byte-llms-code/outputs/lr_sweep/hnet_xl_code_lr_5e-4",
22
+ "host": "7504e518d24a",
23
+ "executable": "/venv/bytellm/bin/python",
24
+ "cpu_count": 64,
25
+ "cpu_count_logical": 128,
26
+ "gpu": "NVIDIA H100 80GB HBM3",
27
+ "gpu_count": 4,
28
+ "disk": {
29
+ "/": {
30
+ "total": "265214230528",
31
+ "used": "170465628160"
32
+ }
33
+ },
34
+ "memory": {
35
+ "total": "1081679683584"
36
+ },
37
+ "gpu_nvidia": [
38
+ {
39
+ "name": "NVIDIA H100 80GB HBM3",
40
+ "memoryTotal": "85520809984",
41
+ "cudaCores": 16896,
42
+ "architecture": "Hopper",
43
+ "uuid": "GPU-b60cdcab-2033-2009-41de-be646c953a20"
44
+ },
45
+ {
46
+ "name": "NVIDIA H100 80GB HBM3",
47
+ "memoryTotal": "85520809984",
48
+ "cudaCores": 16896,
49
+ "architecture": "Hopper",
50
+ "uuid": "GPU-9982b420-4520-4238-c378-ec5a46015474"
51
+ },
52
+ {
53
+ "name": "NVIDIA H100 80GB HBM3",
54
+ "memoryTotal": "85520809984",
55
+ "cudaCores": 16896,
56
+ "architecture": "Hopper",
57
+ "uuid": "GPU-e26ebaac-aaa6-3eed-17ab-a3dce303a76f"
58
+ },
59
+ {
60
+ "name": "NVIDIA H100 80GB HBM3",
61
+ "memoryTotal": "85520809984",
62
+ "cudaCores": 16896,
63
+ "architecture": "Hopper",
64
+ "uuid": "GPU-9dfc6dba-0be6-4a10-1027-336cc0e65134"
65
+ }
66
+ ],
67
+ "cudaVersion": "12.2",
68
+ "writerId": "dyau2sb185lrum72o3jj1efzj126ey29"
69
+ }
lr_sweep/hnet_xl_code_lr_5e-4/wandb/run-20260426_001941-i6lt8av0/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
1
+ {"train/step_time":0.7018253087997437,"best/val_perplexity":1.3968818580448776,"_runtime":7135,"train/soft_boundary_ratio_stage0":0.3369161636344325,"epoch/hard_boundary_ratio_stage0":0.35071097942532414,"_timestamp":1.7771699152059996e+09,"train/loss_avg":0.35818642927241806,"epoch/chunk_len_stage1":2.6923843128064373,"train/lb_loss":1.0790050029754639,"best/val_loss":0.34870333082619165,"train/loss":0.24225624278187752,"epoch/lm_loss":0.3488054759618164,"val/lm_loss":0.33815049202669234,"epoch/soft_boundary_ratio_stage0":0.3369219771080528,"epoch/loss":0.3581828390180251,"train/hard_boundary_ratio_stage0":0.3507020525501328,"val/time":4.524768352508545,"train/lm_loss":0.2673550844192505,"best/step":8000,"train/chunk_len_stage1":2.692232057831329,"val/loss":0.34870333082619165,"epoch/soft_boundary_ratio_stage1":0.36771885260966747,"train/hard_boundary_ratio_stage1":0.3750683158667268,"epoch/lb_loss":1.0680555066867534,"epoch/chunk_len_stage0":2.873913584180546,"train/lr":5e-05,"train/epoch":1,"val/perplexity":1.3968818580448776,"val/lb_loss":1.0552839296204704,"_wandb":{"runtime":7135},"_step":9880,"train/chunk_len_stage0":2.8739849724727216,"epoch/time":7083.666541099548,"train/soft_boundary_ratio_stage1":0.3677366381526144,"epoch/hard_boundary_ratio_stage1":0.3750465785576219}
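wandb-summary.json holds the final value of every logged metric for this run; for example, the best validation loss (~0.349) was reached at step 8000. A sketch of reading it back, with the path given relative to the outputs root:

    import json

    run_dir = "lr_sweep/hnet_xl_code_lr_5e-4/wandb/run-20260426_001941-i6lt8av0"
    with open(f"{run_dir}/files/wandb-summary.json") as f:
        summary = json.load(f)
    print(summary["best/step"], summary["best/val_loss"], summary["best/val_perplexity"])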
lr_sweep/hnet_xl_code_lr_5e-4/wandb/run-20260426_001941-i6lt8av0/logs/debug-core.log ADDED
@@ -0,0 +1,16 @@
1
+ {"time":"2026-04-26T00:19:41.280606372Z","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmpj_w4xo0a/port-257611.txt","pid":257611,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false}
2
+ {"time":"2026-04-26T00:19:41.281310681Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":257611}
3
+ {"time":"2026-04-26T00:19:41.281295344Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/tmp/wandb-257611-257664-4135181244/socket","Net":"unix"}}
4
+ {"time":"2026-04-26T00:19:41.468587343Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"}
5
+ {"time":"2026-04-26T00:19:41.489622086Z","level":"INFO","msg":"handleInformInit: received","streamId":"i6lt8av0","id":"1(@)"}
6
+ {"time":"2026-04-26T00:19:41.862752556Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"i6lt8av0","id":"1(@)"}
7
+ {"time":"2026-04-26T02:18:39.038091992Z","level":"INFO","msg":"handleInformFinish: finish message received","streamId":"i6lt8av0","id":"1(@)"}
8
+ {"time":"2026-04-26T02:18:39.041417993Z","level":"INFO","msg":"handleInformFinish: stream closed","streamId":"i6lt8av0","id":"1(@)"}
9
+ {"time":"2026-04-26T02:18:39.05126405Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"}
10
+ {"time":"2026-04-26T02:18:39.051282767Z","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"}
11
+ {"time":"2026-04-26T02:18:39.051287843Z","level":"INFO","msg":"server is shutting down"}
12
+ {"time":"2026-04-26T02:18:39.051291659Z","level":"INFO","msg":"connection: closing","id":"1(@)"}
13
+ {"time":"2026-04-26T02:18:39.051337316Z","level":"INFO","msg":"connection: closed successfully","id":"1(@)"}
14
+ {"time":"2026-04-26T02:18:39.051357455Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"}
15
+ {"time":"2026-04-26T02:18:39.051349453Z","level":"INFO","msg":"server: listener closed","addr":{"Name":"/tmp/wandb-257611-257664-4135181244/socket","Net":"unix"}}
16
+ {"time":"2026-04-26T02:18:39.051367174Z","level":"INFO","msg":"server is closed"}
lr_sweep/hnet_xl_code_lr_5e-4/wandb/run-20260426_001941-i6lt8av0/logs/debug-internal.log ADDED
@@ -0,0 +1,13 @@
1
+ {"time":"2026-04-26T00:19:41.489729754Z","level":"INFO","msg":"stream: starting","core version":"0.24.0"}
2
+ {"time":"2026-04-26T00:19:41.862602753Z","level":"INFO","msg":"stream: created new stream","id":"i6lt8av0"}
3
+ {"time":"2026-04-26T00:19:41.862649982Z","level":"INFO","msg":"handler: started","stream_id":"i6lt8av0"}
4
+ {"time":"2026-04-26T00:19:41.862747075Z","level":"INFO","msg":"stream: started","id":"i6lt8av0"}
5
+ {"time":"2026-04-26T00:19:41.862757167Z","level":"INFO","msg":"writer: started","stream_id":"i6lt8av0"}
6
+ {"time":"2026-04-26T00:19:41.862759475Z","level":"INFO","msg":"sender: started","stream_id":"i6lt8av0"}
7
+ {"time":"2026-04-26T00:19:41.985346468Z","level":"ERROR","msg":"git repo not found","error":"repository does not exist"}
8
+ {"time":"2026-04-26T02:18:38.932345179Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
9
+ {"time":"2026-04-26T02:18:39.035750391Z","level":"INFO","msg":"handler: operation stats","stats":{}}
10
+ {"time":"2026-04-26T02:18:39.038119733Z","level":"INFO","msg":"stream: closing","id":"i6lt8av0"}
11
+ {"time":"2026-04-26T02:18:39.038132189Z","level":"INFO","msg":"handler: closed","stream_id":"i6lt8av0"}
12
+ {"time":"2026-04-26T02:18:39.038193257Z","level":"INFO","msg":"sender: closed","stream_id":"i6lt8av0"}
13
+ {"time":"2026-04-26T02:18:39.038198898Z","level":"INFO","msg":"stream: closed","id":"i6lt8av0"}
lr_sweep/hnet_xl_code_lr_5e-4/wandb/run-20260426_001941-i6lt8av0/logs/debug.log ADDED
1
+ 2026-04-26 00:19:41,198 INFO MainThread:257611 [wandb_setup.py:_flush():81] Current SDK version is 0.24.0
2
+ 2026-04-26 00:19:41,198 INFO MainThread:257611 [wandb_setup.py:_flush():81] Configure stats pid to 257611
3
+ 2026-04-26 00:19:41,198 INFO MainThread:257611 [wandb_setup.py:_flush():81] Loading settings from environment variables
4
+ 2026-04-26 00:19:41,198 INFO MainThread:257611 [wandb_init.py:setup_run_log_directory():717] Logging user logs to /workspace/byte-llms-code/outputs/lr_sweep/hnet_xl_code_lr_5e-4/wandb/run-20260426_001941-i6lt8av0/logs/debug.log
5
+ 2026-04-26 00:19:41,198 INFO MainThread:257611 [wandb_init.py:setup_run_log_directory():718] Logging internal logs to /workspace/byte-llms-code/outputs/lr_sweep/hnet_xl_code_lr_5e-4/wandb/run-20260426_001941-i6lt8av0/logs/debug-internal.log
6
+ 2026-04-26 00:19:41,198 INFO MainThread:257611 [wandb_init.py:init():844] calling init triggers
7
+ 2026-04-26 00:19:41,198 INFO MainThread:257611 [wandb_init.py:init():849] wandb.init called with sweep_config: {}
8
+ config: {'model': {'config_path': '/workspace/byte-llms-code/hnet_project/configs/hnet_2stage_XL_code.json', 'checkpoint_path': '/workspace/byte-llms-code/hnet_project/checkpoints/hnet_2stage_XL_code.pt'}, 'training': {'epochs': 1, 'batch_size': 4, 'eval_batch_size': 24, 'gradient_accumulation_steps': 4, 'lr': 0.0005, 'weight_decay': 0.1, 'betas': [0.9, 0.95], 'eps': 1e-08, 'lr_scheduler': 'wsd', 'warmup_ratio': 0.1, 'decay_ratio': 0.2, 'warmup_steps': 100, 'min_lr_ratio': 0.1, 'lr_multiplier': [2.0, 1.5, 1.0], 'load_balancing_weight': 0.01, 'load_balancing_N': 4.0, 'max_grad_norm': 1.0, 'use_amp': True, 'resume': False, 'resume_checkpoint': None, 'warmup_model': True}, 'data': {'path': '/workspace/byte-llms-code/code_completion_exp/datasets/data_V4_full', 'max_context_len': 4096, 'max_target_len': 256, 'num_workers': 0, 'pin_memory': True, 'max_train_samples': None, 'max_val_samples': 2000}, 'logging': {'log_interval': 10, 'save_interval': 0, 'eval_interval': 2000, 'save_every_epoch': False}, 'tracking': {'enabled': True, 'backend': 'wandb', 'project': 'code-completion_lr-sweep', 'run_name': 'hnet_xl_code_lr_5e-4', 'entity': None, 'base_url': 'https://wandb.platun0v.ru', 'local_dir': '/workspace/byte-llms-code/outputs/lr_sweep/hnet_xl_code_lr_5e-4'}, 'paths': {'output_dir': '/workspace/byte-llms-code/outputs/lr_sweep/hnet_xl_code_lr_5e-4'}, 'seed': 42, 'device': 'cuda', '_wandb': {'code_path': 'code/code_completion_exp/train_hnet/train.py'}}
9
+ 2026-04-26 00:19:41,199 INFO MainThread:257611 [wandb_init.py:init():892] starting backend
10
+ 2026-04-26 00:19:41,468 INFO MainThread:257611 [wandb_init.py:init():895] sending inform_init request
11
+ 2026-04-26 00:19:41,488 INFO MainThread:257611 [wandb_init.py:init():903] backend started and connected
12
+ 2026-04-26 00:19:41,491 INFO MainThread:257611 [wandb_init.py:init():973] updated telemetry
13
+ 2026-04-26 00:19:41,508 INFO MainThread:257611 [wandb_init.py:init():997] communicating run to backend with 90.0 second timeout
14
+ 2026-04-26 00:19:41,984 INFO MainThread:257611 [wandb_init.py:init():1044] starting run threads in backend
15
+ 2026-04-26 00:19:42,142 INFO MainThread:257611 [wandb_run.py:_console_start():2529] atexit reg
16
+ 2026-04-26 00:19:42,142 INFO MainThread:257611 [wandb_run.py:_redirect():2377] redirect: wrap_raw
17
+ 2026-04-26 00:19:42,142 INFO MainThread:257611 [wandb_run.py:_redirect():2446] Wrapping output streams.
18
+ 2026-04-26 00:19:42,142 INFO MainThread:257611 [wandb_run.py:_redirect():2469] Redirects installed.
19
+ 2026-04-26 00:19:42,145 INFO MainThread:257611 [wandb_init.py:init():1084] run started, returning control to user process
20
+ 2026-04-26 02:18:37,753 INFO MainThread:257611 [wandb_run.py:_finish():2295] finishing run nikita/code-completion_lr-sweep/i6lt8av0
21
+ 2026-04-26 02:18:37,754 INFO MainThread:257611 [wandb_run.py:_atexit_cleanup():2494] got exitcode: 0
22
+ 2026-04-26 02:18:37,754 INFO MainThread:257611 [wandb_run.py:_restore():2476] restore
23
+ 2026-04-26 02:18:37,754 INFO MainThread:257611 [wandb_run.py:_restore():2482] restore done
24
+ 2026-04-26 02:18:39,037 INFO MainThread:257611 [wandb_run.py:_footer_sync_info():3870] logging synced files
lr_sweep/hnet_xl_code_lr_5e-4/wandb/run-20260426_001941-i6lt8av0/run-i6lt8av0.wandb ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4904ef01e885208188a41b5d7f41a4a1759d924be32288783eb848b49f89430b
3
+ size 3065241
lr_sweep/hnet_xl_code_lr_5e-5/model_best.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:310965a6147f6c3900fdda4b4054876d929624a57099569b4c715c336af405ec
3
+ size 3315165139
lr_sweep/pythia_1b_lr_1e-4/.hydra/config.yaml ADDED
@@ -0,0 +1,49 @@
1
+ model:
2
+ name: EleutherAI/pythia-1b
3
+ checkpoint_path: null
4
+ from_scratch: false
5
+ training:
6
+ epochs: 1
7
+ batch_size: 4
8
+ eval_batch_size: 12
9
+ gradient_accumulation_steps: 4
10
+ lr: 0.0001
11
+ weight_decay: 0.1
12
+ betas:
13
+ - 0.9
14
+ - 0.95
15
+ eps: 1.0e-08
16
+ lr_scheduler: wsd
17
+ warmup_ratio: 0.1
18
+ decay_ratio: 0.2
19
+ warmup_steps: 100
20
+ min_lr_ratio: 0.1
21
+ max_grad_norm: 1.0
22
+ use_amp: true
23
+ resume: false
24
+ resume_checkpoint: null
25
+ data:
26
+ path: /workspace/byte-llms-code/code_completion_exp/datasets/data_V4_full
27
+ max_context_len: 4096
28
+ max_target_len: 256
29
+ num_workers: 4
30
+ pin_memory: true
31
+ max_train_samples: null
32
+ max_val_samples: 2000
33
+ logging:
34
+ log_interval: 10
35
+ save_interval: 0
36
+ eval_interval: 2000
37
+ save_every_epoch: false
38
+ tracking:
39
+ enabled: true
40
+ backend: wandb
41
+ project: code-completion_lr-sweep
42
+ run_name: pythia_1b_lr_1e-4
43
+ entity: null
44
+ base_url: https://wandb.platun0v.ru
45
+ local_dir: ${paths.output_dir}
46
+ paths:
47
+ output_dir: /workspace/byte-llms-code/outputs/lr_sweep/pythia_1b_lr_1e-4
48
+ seed: 42
49
+ device: cuda
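Unlike the HNet runs, the pythia_1b baseline points at a Hugging Face checkpoint rather than a local config/checkpoint pair; the training script for these runs is not shown in this part of the diff. A sketch of loading the model named in this config, assuming the transformers version pinned in requirements.txt:

    from transformers import AutoModelForCausalLM, AutoTokenizer

    name = "EleutherAI/pythia-1b"
    tokenizer = AutoTokenizer.from_pretrained(name)
    model = AutoModelForCausalLM.from_pretrained(name)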
lr_sweep/pythia_1b_lr_1e-4/.hydra/hydra.yaml ADDED
@@ -0,0 +1,167 @@
1
+ hydra:
2
+ run:
3
+ dir: ${paths.output_dir}
4
+ sweep:
5
+ dir: outputs/multirun/${now:%Y-%m-%d}/${now:%H-%M-%S}
6
+ subdir: ${hydra.job.num}
7
+ launcher:
8
+ _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
9
+ sweeper:
10
+ _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
11
+ max_batch_size: null
12
+ params: null
13
+ help:
14
+ app_name: ${hydra.job.name}
15
+ header: '${hydra.help.app_name} is powered by Hydra.
16
+
17
+ '
18
+ footer: 'Powered by Hydra (https://hydra.cc)
19
+
20
+ Use --hydra-help to view Hydra specific help
21
+
22
+ '
23
+ template: '${hydra.help.header}
24
+
25
+ == Configuration groups ==
26
+
27
+ Compose your configuration from those groups (group=option)
28
+
29
+
30
+ $APP_CONFIG_GROUPS
31
+
32
+
33
+ == Config ==
34
+
35
+ Override anything in the config (foo.bar=value)
36
+
37
+
38
+ $CONFIG
39
+
40
+
41
+ ${hydra.help.footer}
42
+
43
+ '
44
+ hydra_help:
45
+ template: 'Hydra (${hydra.runtime.version})
46
+
47
+ See https://hydra.cc for more info.
48
+
49
+
50
+ == Flags ==
51
+
52
+ $FLAGS_HELP
53
+
54
+
55
+ == Configuration groups ==
56
+
57
+ Compose your configuration from those groups (For example, append hydra/job_logging=disabled
58
+ to command line)
59
+
60
+
61
+ $HYDRA_CONFIG_GROUPS
62
+
63
+
64
+ Use ''--cfg hydra'' to Show the Hydra config.
65
+
66
+ '
67
+ hydra_help: ???
68
+ hydra_logging:
69
+ version: 1
70
+ formatters:
71
+ simple:
72
+ format: '[%(asctime)s][HYDRA] %(message)s'
73
+ handlers:
74
+ console:
75
+ class: logging.StreamHandler
76
+ formatter: simple
77
+ stream: ext://sys.stdout
78
+ root:
79
+ level: INFO
80
+ handlers:
81
+ - console
82
+ loggers:
83
+ logging_example:
84
+ level: DEBUG
85
+ disable_existing_loggers: false
86
+ job_logging:
87
+ version: 1
88
+ formatters:
89
+ simple:
90
+ format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
91
+ handlers:
92
+ console:
93
+ class: logging.StreamHandler
94
+ formatter: simple
95
+ stream: ext://sys.stdout
96
+ file:
97
+ class: logging.FileHandler
98
+ formatter: simple
99
+ filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log
100
+ root:
101
+ level: INFO
102
+ handlers:
103
+ - console
104
+ - file
105
+ disable_existing_loggers: false
106
+ env: {}
107
+ mode: RUN
108
+ searchpath: []
109
+ callbacks: {}
110
+ output_subdir: .hydra
111
+ overrides:
112
+ hydra:
113
+ - hydra.mode=RUN
114
+ task:
115
+ - tracking=wandb
116
+ - tracking.project=code-completion_lr-sweep
117
+ - tracking.run_name=pythia_1b_lr_1e-4
118
+ - training.lr=1e-4
119
+ - paths.output_dir=/workspace/byte-llms-code/outputs/lr_sweep/pythia_1b_lr_1e-4
120
+ - model=pythia_1b
121
+ - data.path=/workspace/byte-llms-code/code_completion_exp/datasets/data_V4_full
122
+ job:
123
+ name: train
124
+ chdir: false
125
+ override_dirname: data.path=/workspace/byte-llms-code/code_completion_exp/datasets/data_V4_full,model=pythia_1b,paths.output_dir=/workspace/byte-llms-code/outputs/lr_sweep/pythia_1b_lr_1e-4,tracking.project=code-completion_lr-sweep,tracking.run_name=pythia_1b_lr_1e-4,tracking=wandb,training.lr=1e-4
126
+ id: ???
127
+ num: ???
128
+ config_name: config
129
+ env_set: {}
130
+ env_copy: []
131
+ config:
132
+ override_dirname:
133
+ kv_sep: '='
134
+ item_sep: ','
135
+ exclude_keys: []
136
+ runtime:
137
+ version: 1.3.2
138
+ version_base: '1.3'
139
+ cwd: /workspace/byte-llms-code/code_completion_exp/train_pythia
140
+ config_sources:
141
+ - path: hydra.conf
142
+ schema: pkg
143
+ provider: hydra
144
+ - path: /workspace/byte-llms-code/code_completion_exp/train_pythia/configs
145
+ schema: file
146
+ provider: main
147
+ - path: ''
148
+ schema: structured
149
+ provider: schema
150
+ output_dir: /workspace/byte-llms-code/outputs/lr_sweep/pythia_1b_lr_1e-4
151
+ choices:
152
+ paths: default
153
+ tracking: wandb
154
+ logging: default
155
+ data: default
156
+ training: default
157
+ model: pythia_1b
158
+ hydra/env: default
159
+ hydra/callbacks: null
160
+ hydra/job_logging: default
161
+ hydra/hydra_logging: default
162
+ hydra/hydra_help: default
163
+ hydra/help: default
164
+ hydra/sweeper: basic
165
+ hydra/launcher: basic
166
+ hydra/output: default
167
+ verbose: false