narySt committed on
Commit
f7b8074
·
verified ·
1 Parent(s): e98d476

Add files using upload-large-folder tool

Browse files
train_hnet_with_docstring_18_04/checkpoints/checkpoint_latest.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:961e914acf254f75d3accdac545096cd7e66da917bf6c963bea9be50aa32f8ed
3
+ size 9945483438
train_hnet_with_docstring_18_04/checkpoints/checkpoint_step_10591.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7ea9ea7277f4386806ae2b7d1b24fe9685a784ba1b67eb42de3289235b795f22
3
+ size 9945491982
train_hnet_with_docstring_18_04/checkpoints/checkpoint_step_12000.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d8366292499e86c4d205f93253a7d2637fb2906de8e007d91b761df58d4b3e73
3
+ size 9945491982
train_hnet_with_docstring_18_04/checkpoints/checkpoint_step_15000.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4e3552eccc68fefd58c733b162db0de202d60b76ae8e17478366e21a3e7ffa96
3
+ size 9945491982
train_hnet_with_docstring_18_04/checkpoints/checkpoint_step_18000.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:42018b9f036477e626fce515690532aa077d1c047a0a0f10fd2ebb543129b7a4
3
+ size 9945491982
train_hnet_with_docstring_18_04/checkpoints/checkpoint_step_21000.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1ae7bba9fbba6448d718d6409b669077cdcc0f8a64bf7a212b50ce321e6c6f2c
3
+ size 9945491982
train_hnet_with_docstring_18_04/checkpoints/checkpoint_step_21182.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:040b60ed6ec60b823c58a2b90120741e53a3c56f57288f5dfeb7e60e1e665670
3
+ size 9945491982
train_hnet_with_docstring_18_04/checkpoints/checkpoint_step_24000.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e2b3de0508edc501ab43c173056ecde4fcdb1768eb1d9ec409ff6c5389deaea
3
+ size 9945491982
train_hnet_with_docstring_18_04/checkpoints/checkpoint_step_27000.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:40892f5e0016e59f15f215e22cc48166c79f06b71171de4e21917e911efba4a3
3
+ size 9945491982
train_hnet_with_docstring_18_04/checkpoints/checkpoint_step_3000.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:26d89d1f75945923fb330722f732782c4eca586acf469e4cd5e870e8b06dd039
3
+ size 9945490614
train_hnet_with_docstring_18_04/checkpoints/checkpoint_step_30000.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:823dd047ef1656da439ac9357b55e474a12794282d6ebe8f5d2a51139f8ce0ba
3
+ size 9945491982
train_hnet_with_docstring_18_04/checkpoints/checkpoint_step_31773.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:86d505541cd3af319175317281d89138ff0f0f9101224b2a323ea1de4edc5a8e
3
+ size 9945491982
train_hnet_with_docstring_18_04/checkpoints/checkpoint_step_6000.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a9201a7652769dbd11bb72591759a279f796a3fcd5a6bf5117e97727ce561bf5
3
+ size 9945490614
train_hnet_with_docstring_18_04/checkpoints/checkpoint_step_9000.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:130d0d1c2703407af109e8dde865191f4968766ebf0dc68e42890363c8f9c43e
3
+ size 9945490614
train_hnet_with_docstring_18_04/wandb/run-20260417_085757-sa79g3yl/files/code/code_completion_exp/train_hnet/train.py ADDED
@@ -0,0 +1,284 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Training Pipeline для HNet модели на задаче Code Completion.
3
+
4
+ Конфигурация через Hydra + OmegaConf, логирование в Trackio.
5
+ Поддержка DDP через Accelerate для multi-GPU тренировки.
6
+
7
+ Использование:
8
+ # Базовый запуск (single GPU)
9
+ python train.py
10
+
11
+ # Multi-GPU с Accelerate
12
+ accelerate launch train.py
13
+
14
+ # Multi-GPU с указанием количества GPU
15
+ accelerate launch --num_processes=4 train.py
16
+
17
+ # Переопределение параметров через CLI
18
+ python train.py training.lr=1e-4 training.epochs=5
19
+
20
+ # Выбор другого конфига модели
21
+ python train.py model=hnet_small
22
+
23
+ # Multirun (sweep)
24
+ python train.py --multirun training.lr=1e-4,3e-4,1e-3
25
+
26
+ # Без логирования
27
+ python train.py tracking.enabled=false
28
+ """
29
+
30
+ import os
31
+ import math
32
+ from pathlib import Path
33
+
34
+ import torch
35
+ import hydra
36
+ from hydra.core.hydra_config import HydraConfig
37
+ from omegaconf import DictConfig, OmegaConf
38
+ from accelerate import Accelerator
39
+ from accelerate.utils import set_seed as accelerate_set_seed
40
+
41
+ # HNet imports
42
+ from hnet.load_utils import load_from_pretrained, load_from_config
43
+ from hnet.utils.tokenizers import ByteTokenizer
44
+ from hnet.utils.train import group_params
45
+
46
+ # Ensure repo root is on sys.path (needed when running from subdirectory)
47
+ import sys
48
+ sys.path.insert(0, str(Path(__file__).resolve().parents[2]))
49
+
50
+ # Shared training library
51
+ from training_lib.utils import log_message
52
+ from training_lib.checkpointing import save_checkpoint, load_checkpoint
53
+ from training_lib.schedulers import get_lr_scheduler
54
+ from training_lib.tracking import init_tracking, finish_tracking
55
+ from training_lib.hnet.train_loop import train_epoch
56
+ from training_lib.hnet.data import create_dataloaders
57
+
58
+
59
@hydra.main(version_base=None, config_path="configs", config_name="config")
def main(cfg: DictConfig):
    """Run the HNet training pipeline, with DDP support via Accelerate.

    Builds the Accelerator, tokenizer, model, dataloaders, optimizer and LR
    scheduler from ``cfg``, optionally resumes from a checkpoint, calls
    ``train_epoch`` once per epoch, and finally writes the unwrapped model's
    ``state_dict`` to ``<cfg.paths.output_dir>/model_final.pt`` on the main
    process.

    Args:
        cfg: Hydra/OmegaConf configuration. This function reads
            ``cfg.training``, ``cfg.model``, ``cfg.paths``, ``cfg.logging``
            and ``cfg.seed`` directly and passes the whole object on to the
            shared helpers (logging, tracking, dataloaders, checkpointing).

    Returns:
        None.
    """

    # === Accelerator Setup ===
    mixed_precision = "bf16" if cfg.training.use_amp else "no"

    accelerator = Accelerator(
        mixed_precision=mixed_precision,
        gradient_accumulation_steps=cfg.training.gradient_accumulation_steps,
    )

    # === Setup ===
    accelerate_set_seed(cfg.seed)

    # Fall back to Hydra's per-run output directory when none is configured.
    if cfg.paths.output_dir is None:
        cfg.paths.output_dir = HydraConfig.get().runtime.output_dir

    OmegaConf.resolve(cfg)

    log_message(
        f"CUDA_VISIBLE_DEVICES: {os.environ.get('CUDA_VISIBLE_DEVICES', 'not set')}",
        cfg,
        accelerator,
    )
    log_message(f"Number of processes: {accelerator.num_processes}", cfg, accelerator)
    log_message(f"Process index: {accelerator.process_index}", cfg, accelerator)
    log_message(f"Mixed precision: {mixed_precision}", cfg, accelerator)

    log_message("=" * 60, cfg, accelerator)
    log_message(
        "HNet Training Pipeline (Hydra + Trackio + Accelerate)", cfg, accelerator
    )
    log_message("=" * 60, cfg, accelerator)
    log_message(f"Config:\n{OmegaConf.to_yaml(cfg)}", cfg, accelerator)

    # === Tracking Init ===
    init_tracking(cfg, accelerator)

    # === Tokenizer ===
    log_message("Initializing tokenizer...", cfg, accelerator)
    tokenizer = ByteTokenizer()

    # === Model ===
    log_message("Loading model...", cfg, accelerator)
    if cfg.model.checkpoint_path:
        model = load_from_pretrained(
            model_path=cfg.model.checkpoint_path,
            model_config_path=cfg.model.config_path,
        )
        log_message(f"Loaded pretrained: {cfg.model.checkpoint_path}", cfg, accelerator)
    else:
        model = load_from_config(
            model_config_path=cfg.model.config_path,
            device="cpu",
        )
        model.init_weights()
        log_message("Initialized from scratch", cfg, accelerator)

    model.train()

    # Per-stage LR multipliers; must be applied before accelerator.prepare().
    lr_multiplier = list(cfg.training.lr_multiplier)
    model.apply_lr_multiplier(lr_multiplier)
    log_message(f"Applied LR multipliers: {lr_multiplier}", cfg, accelerator)

    # Warm up the model (the original comment says this is for Triton kernels).
    if cfg.training.warmup_model:
        log_message("Warming up model...", cfg, accelerator)
        model = model.to(accelerator.device)
        model.warmup(verbose=accelerator.is_main_process)

    # Log model info
    total_params = sum(p.numel() for p in model.parameters())
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    log_message(f"Total params: {total_params:,}", cfg, accelerator)
    log_message(f"Trainable params: {trainable_params:,}", cfg, accelerator)

    # === Data ===
    log_message("Creating dataloaders...", cfg, accelerator)
    dataloaders = create_dataloaders(cfg, tokenizer)

    train_dataloader = dataloaders["train"]
    val_dataloader = dataloaders.get("validation", None)

    log_message(
        f"Train dataset size: {len(train_dataloader.dataset)}", cfg, accelerator
    )
    log_message(
        f"Train batches per epoch (before DDP split): {len(train_dataloader)}",
        cfg,
        accelerator,
    )

    # Use the same `is not None` test as the prepare() branch below; plain
    # truthiness would call len() and treat an empty loader as "missing".
    if val_dataloader is not None:
        log_message(
            f"Validation dataset size: {len(val_dataloader.dataset)}", cfg, accelerator
        )
        log_message(f"Validation batches: {len(val_dataloader)}", cfg, accelerator)
    else:
        log_message("No validation dataset found", cfg, accelerator)

    # === Optimizer ===
    log_message("Creating optimizer...", cfg, accelerator)
    param_groups = group_params(model)

    for group in param_groups:
        # Scale the base LR by the group's multiplier whether or not the group
        # already carries an "lr" entry. Previously the multiplier was only
        # honoured when "lr" was present, so groups tagged solely with
        # "lr_multiplier" silently trained at the plain base LR.
        # NOTE(review): assumes group_params() emits "lr_multiplier" rather
        # than "lr" - confirm against hnet.utils.train.
        group["lr"] = cfg.training.lr * group.get("lr_multiplier", 1.0)
        if "weight_decay" not in group:
            group["weight_decay"] = cfg.training.weight_decay

    optimizer = torch.optim.AdamW(
        param_groups,
        lr=cfg.training.lr,
        betas=tuple(cfg.training.betas),
        eps=cfg.training.eps,
    )

    # === Scheduler ===
    # The loader is not yet sharded here, so divide by the process count.
    steps_per_epoch = math.ceil(len(train_dataloader) / accelerator.num_processes)
    total_steps = (
        cfg.training.epochs
        * steps_per_epoch
        // cfg.training.gradient_accumulation_steps
    )
    scheduler = get_lr_scheduler(optimizer, cfg, total_steps)

    log_message(
        f"Total steps: {total_steps}, Steps per epoch: {steps_per_epoch}",
        cfg,
        accelerator,
    )

    # === Accelerate Prepare ===
    log_message(
        "Preparing model, optimizer, and dataloaders with Accelerate...",
        cfg,
        accelerator,
    )

    if val_dataloader is not None:
        model, optimizer, train_dataloader, val_dataloader, scheduler = (
            accelerator.prepare(
                model, optimizer, train_dataloader, val_dataloader, scheduler
            )
        )
    else:
        model, optimizer, train_dataloader, scheduler = accelerator.prepare(
            model, optimizer, train_dataloader, scheduler
        )

    log_message(
        f"Train batches per epoch (after DDP split): {len(train_dataloader)}",
        cfg,
        accelerator,
    )

    # === Resume ===
    global_step = 0
    start_epoch = 1

    if cfg.training.resume and cfg.training.resume_checkpoint:
        global_step, start_epoch = load_checkpoint(
            model,
            optimizer,
            scheduler,
            cfg.training.resume_checkpoint,
            cfg,
            accelerator,
        )
        # Continue with the epoch after the one returned by load_checkpoint.
        start_epoch += 1

    # === Training Loop ===
    log_message("Starting training...", cfg, accelerator)

    best_val_loss = float("inf")

    # Pre-bind so the KeyboardInterrupt handler can never hit an unbound name.
    epoch = start_epoch

    try:
        for epoch in range(start_epoch, cfg.training.epochs + 1):
            log_message(f"\n{'=' * 60}", cfg, accelerator)
            log_message(f"EPOCH {epoch}/{cfg.training.epochs}", cfg, accelerator)
            log_message(f"{'=' * 60}", cfg, accelerator)

            global_step, best_val_loss = train_epoch(
                model=model,
                dataloader=train_dataloader,
                optimizer=optimizer,
                scheduler=scheduler,
                cfg=cfg,
                epoch=epoch,
                global_step=global_step,
                accelerator=accelerator,
                val_dataloader=val_dataloader,
                best_val_loss=best_val_loss,
            )

            if cfg.logging.save_every_epoch:
                save_checkpoint(
                    model, optimizer, scheduler, global_step, epoch, cfg, accelerator
                )

    except KeyboardInterrupt:
        # Best effort: persist progress before falling through to the final save.
        log_message("Training interrupted by user", cfg, accelerator)
        save_checkpoint(
            model, optimizer, scheduler, global_step, epoch, cfg, accelerator
        )

    # === Final Save ===
    log_message("\nTraining completed!", cfg, accelerator)

    if accelerator.is_main_process:
        final_model_path = Path(cfg.paths.output_dir) / "model_final.pt"
        # A custom output_dir may not exist yet; create it so torch.save
        # cannot fail at the very end of the run.
        final_model_path.parent.mkdir(parents=True, exist_ok=True)
        unwrapped_model = accelerator.unwrap_model(model)
        torch.save(unwrapped_model.state_dict(), final_model_path)
        log_message(f"Final model: {final_model_path}", cfg, accelerator)

    accelerator.wait_for_everyone()
    accelerator.end_training()
    finish_tracking()


if __name__ == "__main__":
    main()
train_hnet_with_docstring_18_04/wandb/run-20260417_085757-sa79g3yl/files/config.yaml ADDED
@@ -0,0 +1,149 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ wandb_version: 1
2
+
3
+ _wandb:
4
+ desc: null
5
+ value:
6
+ code_path: code/code_completion_exp/train_hnet/train.py
7
+ python_version: 3.12.0
8
+ cli_version: 0.24.0
9
+ framework: huggingface
10
+ huggingface_version: 4.57.6
11
+ is_jupyter_run: false
12
+ is_kaggle_kernel: false
13
+ start_time: 1776416277
14
+ t:
15
+ 1:
16
+ - 1
17
+ - 11
18
+ - 49
19
+ - 50
20
+ - 51
21
+ - 71
22
+ - 105
23
+ 2:
24
+ - 1
25
+ - 11
26
+ - 49
27
+ - 50
28
+ - 51
29
+ - 71
30
+ - 105
31
+ 3:
32
+ - 2
33
+ - 13
34
+ - 16
35
+ - 37
36
+ - 42
37
+ - 61
38
+ 4: 3.12.0
39
+ 5: 0.24.0
40
+ 6: 4.57.6
41
+ 13: linux-x86_64
42
+ e:
43
+ 1enfm68bplbg421e1aqnc3guby2j3hk6:
44
+ os: Linux-5.15.0-173-generic-x86_64-with-glibc2.39
45
+ python: CPython 3.12.0
46
+ started_at: '2026-04-17T08:57:57.464191Z'
47
+ program: /workspace/byte-llms-code/code_completion_exp/train_hnet/train.py
48
+ code_path: code_completion_exp/train_hnet/train.py
49
+ code_path_local: train.py
50
+ git:
51
+ remote_url: https://github.com/naryst/byte-llms-code.git
52
+ commit: 056a135fbb34bc28ed3adfeeb2f4ac97cbf12a89
53
+ email: nikita@local.ru
54
+ root: /workspace/byte-llms-code/code_completion_exp/train_hnet
55
+ host: 3e675e030992
56
+ executable: /venv/bytellm/bin/python
57
+ cpu_count: 112
58
+ cpu_count_logical: 224
59
+ gpu_type: NVIDIA H100 80GB HBM3
60
+ gpu_count: 2
61
+ disk:
62
+ /:
63
+ total: '244813135872'
64
+ used: '36382741504'
65
+ memory:
66
+ total: '1622968434688'
67
+ gpu_nvidia:
68
+ - name: NVIDIA H100 80GB HBM3
69
+ memory_total: '85520809984'
70
+ cuda_cores: 16896
71
+ architecture: Hopper
72
+ uuid: GPU-3c87d2f8-c595-49bd-bb1d-1ebfd19c6fb0
73
+ - name: NVIDIA H100 80GB HBM3
74
+ memory_total: '85520809984'
75
+ cuda_cores: 16896
76
+ architecture: Hopper
77
+ uuid: GPU-beb9a6b0-ebef-1f4c-d886-465c96f57ca4
78
+ cuda_version: '12.9'
79
+ writer_id: 1enfm68bplbg421e1aqnc3guby2j3hk6
80
+ model:
81
+ desc: null
82
+ value:
83
+ config_path: /workspace/byte-llms-code/hnet_project/configs/hnet_2stage_XL_code.json
84
+ checkpoint_path: /workspace/byte-llms-code/hnet_project/checkpoints/hnet_2stage_XL_code.pt
85
+ training:
86
+ desc: null
87
+ value:
88
+ epochs: 3
89
+ batch_size: 4
90
+ eval_batch_size: 24
91
+ gradient_accumulation_steps: 4
92
+ lr: 0.0001
93
+ weight_decay: 0.1
94
+ betas:
95
+ - 0.9
96
+ - 0.95
97
+ eps: 1.0e-08
98
+ lr_scheduler: wsd
99
+ warmup_ratio: 0.1
100
+ decay_ratio: 0.2
101
+ warmup_steps: 100
102
+ min_lr_ratio: 0.1
103
+ lr_multiplier:
104
+ - 2.0
105
+ - 1.5
106
+ - 1.0
107
+ load_balancing_weight: 0.01
108
+ load_balancing_N: 4.0
109
+ max_grad_norm: 1.0
110
+ use_amp: true
111
+ resume: false
112
+ resume_checkpoint: null
113
+ warmup_model: true
114
+ data:
115
+ desc: null
116
+ value:
117
+ path: /workspace/byte-llms-code/code_completion_exp/datasets/data_V5_full
118
+ max_context_len: 4096
119
+ max_target_len: 256
120
+ num_workers: 0
121
+ pin_memory: true
122
+ max_train_samples: null
123
+ max_val_samples: null
124
+ logging:
125
+ desc: null
126
+ value:
127
+ log_interval: 10
128
+ save_interval: 3000
129
+ eval_interval: 1000
130
+ save_every_epoch: true
131
+ tracking:
132
+ desc: null
133
+ value:
134
+ enabled: true
135
+ backend: wandb
136
+ project: code-completion-full-docstring
137
+ run_name: hnet_train
138
+ entity: null
139
+ base_url: https://wandb.platun0v.ru
140
+ paths:
141
+ desc: null
142
+ value:
143
+ output_dir: outputs/2026-04-17/08-57-56
144
+ seed:
145
+ desc: null
146
+ value: 42
147
+ device:
148
+ desc: null
149
+ value: cuda
train_hnet_with_docstring_18_04/wandb/run-20260417_085757-sa79g3yl/files/output.log ADDED
The diff for this file is too large to render. See raw diff
 
train_hnet_with_docstring_18_04/wandb/run-20260417_085757-sa79g3yl/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"_runtime": 24431, "train/loss": 0.2280647549778223, "train/loss_avg": 0.20581580642644212, "train/lb_loss": 1.0840826034545898, "train/epoch": 3, "train/lm_loss": 0.17879723012447357, "train/lr": 1e-05, "train/step_time": 0.6833733320236206, "train/chunk_len_stage0": 2.8951674623156975, "train/hard_boundary_ratio_stage1": 0.4156030081584442, "train/soft_boundary_ratio_stage0": 0.3467458609387149, "train/chunk_len_stage1": 2.415668312621829, "train/hard_boundary_ratio_stage0": 0.3473884895159008, "train/soft_boundary_ratio_stage1": 0.38436046590206835, "_timestamp": 1776440679.1860769, "_step": 31770, "best/val_loss": 0.3120614947675138, "val/perplexity": 1.3806528571733947, "val/loss": 0.3327806241316151, "best/val_perplexity": 1.352240898860508, "best/step": 10000, "val/lm_loss": 0.3219491058048241, "val/lb_loss": 1.0831518805756861, "val/time": 107.17485237121582, "epoch/lm_loss": 0.19445337861685316, "epoch/chunk_len_stage1": 2.4157022583211734, "epoch/time": 7926.80414223671, "epoch/chunk_len_stage0": 2.895147230253075, "epoch/hard_boundary_ratio_stage0": 0.34739050784790954, "epoch/soft_boundary_ratio_stage1": 0.38435656324014733, "epoch/lb_loss": 1.0874687482092606, "epoch/hard_boundary_ratio_stage1": 0.41559792626858155, "epoch/soft_boundary_ratio_stage0": 0.3467473082940784, "epoch/loss": 0.20583635369858655}