Add files using upload-large-folder tool
Browse files
- train_hnet_with_docstring_18_04/checkpoints/checkpoint_latest.pt +3 -0
- train_hnet_with_docstring_18_04/checkpoints/checkpoint_step_10591.pt +3 -0
- train_hnet_with_docstring_18_04/checkpoints/checkpoint_step_12000.pt +3 -0
- train_hnet_with_docstring_18_04/checkpoints/checkpoint_step_15000.pt +3 -0
- train_hnet_with_docstring_18_04/checkpoints/checkpoint_step_18000.pt +3 -0
- train_hnet_with_docstring_18_04/checkpoints/checkpoint_step_21000.pt +3 -0
- train_hnet_with_docstring_18_04/checkpoints/checkpoint_step_21182.pt +3 -0
- train_hnet_with_docstring_18_04/checkpoints/checkpoint_step_24000.pt +3 -0
- train_hnet_with_docstring_18_04/checkpoints/checkpoint_step_27000.pt +3 -0
- train_hnet_with_docstring_18_04/checkpoints/checkpoint_step_3000.pt +3 -0
- train_hnet_with_docstring_18_04/checkpoints/checkpoint_step_30000.pt +3 -0
- train_hnet_with_docstring_18_04/checkpoints/checkpoint_step_31773.pt +3 -0
- train_hnet_with_docstring_18_04/checkpoints/checkpoint_step_6000.pt +3 -0
- train_hnet_with_docstring_18_04/checkpoints/checkpoint_step_9000.pt +3 -0
- train_hnet_with_docstring_18_04/wandb/run-20260417_085757-sa79g3yl/files/code/code_completion_exp/train_hnet/train.py +284 -0
- train_hnet_with_docstring_18_04/wandb/run-20260417_085757-sa79g3yl/files/config.yaml +149 -0
- train_hnet_with_docstring_18_04/wandb/run-20260417_085757-sa79g3yl/files/output.log +0 -0
- train_hnet_with_docstring_18_04/wandb/run-20260417_085757-sa79g3yl/files/wandb-summary.json +1 -0
train_hnet_with_docstring_18_04/checkpoints/checkpoint_latest.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:961e914acf254f75d3accdac545096cd7e66da917bf6c963bea9be50aa32f8ed
|
| 3 |
+
size 9945483438
|
train_hnet_with_docstring_18_04/checkpoints/checkpoint_step_10591.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7ea9ea7277f4386806ae2b7d1b24fe9685a784ba1b67eb42de3289235b795f22
|
| 3 |
+
size 9945491982
|
train_hnet_with_docstring_18_04/checkpoints/checkpoint_step_12000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d8366292499e86c4d205f93253a7d2637fb2906de8e007d91b761df58d4b3e73
|
| 3 |
+
size 9945491982
|
train_hnet_with_docstring_18_04/checkpoints/checkpoint_step_15000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4e3552eccc68fefd58c733b162db0de202d60b76ae8e17478366e21a3e7ffa96
|
| 3 |
+
size 9945491982
|
train_hnet_with_docstring_18_04/checkpoints/checkpoint_step_18000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:42018b9f036477e626fce515690532aa077d1c047a0a0f10fd2ebb543129b7a4
|
| 3 |
+
size 9945491982
|
train_hnet_with_docstring_18_04/checkpoints/checkpoint_step_21000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1ae7bba9fbba6448d718d6409b669077cdcc0f8a64bf7a212b50ce321e6c6f2c
|
| 3 |
+
size 9945491982
|
train_hnet_with_docstring_18_04/checkpoints/checkpoint_step_21182.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:040b60ed6ec60b823c58a2b90120741e53a3c56f57288f5dfeb7e60e1e665670
|
| 3 |
+
size 9945491982
|
train_hnet_with_docstring_18_04/checkpoints/checkpoint_step_24000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9e2b3de0508edc501ab43c173056ecde4fcdb1768eb1d9ec409ff6c5389deaea
|
| 3 |
+
size 9945491982
|
train_hnet_with_docstring_18_04/checkpoints/checkpoint_step_27000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:40892f5e0016e59f15f215e22cc48166c79f06b71171de4e21917e911efba4a3
|
| 3 |
+
size 9945491982
|
train_hnet_with_docstring_18_04/checkpoints/checkpoint_step_3000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:26d89d1f75945923fb330722f732782c4eca586acf469e4cd5e870e8b06dd039
|
| 3 |
+
size 9945490614
|
train_hnet_with_docstring_18_04/checkpoints/checkpoint_step_30000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:823dd047ef1656da439ac9357b55e474a12794282d6ebe8f5d2a51139f8ce0ba
|
| 3 |
+
size 9945491982
|
train_hnet_with_docstring_18_04/checkpoints/checkpoint_step_31773.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:86d505541cd3af319175317281d89138ff0f0f9101224b2a323ea1de4edc5a8e
|
| 3 |
+
size 9945491982
|
train_hnet_with_docstring_18_04/checkpoints/checkpoint_step_6000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a9201a7652769dbd11bb72591759a279f796a3fcd5a6bf5117e97727ce561bf5
|
| 3 |
+
size 9945490614
|
train_hnet_with_docstring_18_04/checkpoints/checkpoint_step_9000.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:130d0d1c2703407af109e8dde865191f4968766ebf0dc68e42890363c8f9c43e
|
| 3 |
+
size 9945490614
|
train_hnet_with_docstring_18_04/wandb/run-20260417_085757-sa79g3yl/files/code/code_completion_exp/train_hnet/train.py
ADDED
|
@@ -0,0 +1,284 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Training Pipeline для HNet модели на задаче Code Completion.
|
| 3 |
+
|
| 4 |
+
Конфигурация через Hydra + OmegaConf, логирование в Trackio.
|
| 5 |
+
Поддержка DDP через Accelerate для multi-GPU тренировки.
|
| 6 |
+
|
| 7 |
+
Использование:
|
| 8 |
+
# Базовый запуск (single GPU)
|
| 9 |
+
python train.py
|
| 10 |
+
|
| 11 |
+
# Multi-GPU с Accelerate
|
| 12 |
+
accelerate launch train.py
|
| 13 |
+
|
| 14 |
+
# Multi-GPU с указанием количества GPU
|
| 15 |
+
accelerate launch --num_processes=4 train.py
|
| 16 |
+
|
| 17 |
+
# Переопределение параметров через CLI
|
| 18 |
+
python train.py training.lr=1e-4 training.epochs=5
|
| 19 |
+
|
| 20 |
+
# Выбор другого конфига модели
|
| 21 |
+
python train.py model=hnet_small
|
| 22 |
+
|
| 23 |
+
# Multirun (sweep)
|
| 24 |
+
python train.py --multirun training.lr=1e-4,3e-4,1e-3
|
| 25 |
+
|
| 26 |
+
# Без логирования
|
| 27 |
+
python train.py tracking.enabled=false
|
| 28 |
+
"""
|
| 29 |
+
|
| 30 |
+
import os
|
| 31 |
+
import math
|
| 32 |
+
from pathlib import Path
|
| 33 |
+
|
| 34 |
+
import torch
|
| 35 |
+
import hydra
|
| 36 |
+
from hydra.core.hydra_config import HydraConfig
|
| 37 |
+
from omegaconf import DictConfig, OmegaConf
|
| 38 |
+
from accelerate import Accelerator
|
| 39 |
+
from accelerate.utils import set_seed as accelerate_set_seed
|
| 40 |
+
|
| 41 |
+
# HNet imports
|
| 42 |
+
from hnet.load_utils import load_from_pretrained, load_from_config
|
| 43 |
+
from hnet.utils.tokenizers import ByteTokenizer
|
| 44 |
+
from hnet.utils.train import group_params
|
| 45 |
+
|
| 46 |
+
# Ensure repo root is on sys.path (needed when running from subdirectory)
|
| 47 |
+
import sys
|
| 48 |
+
sys.path.insert(0, str(Path(__file__).resolve().parents[2]))
|
| 49 |
+
|
| 50 |
+
# Shared training library
|
| 51 |
+
from training_lib.utils import log_message
|
| 52 |
+
from training_lib.checkpointing import save_checkpoint, load_checkpoint
|
| 53 |
+
from training_lib.schedulers import get_lr_scheduler
|
| 54 |
+
from training_lib.tracking import init_tracking, finish_tracking
|
| 55 |
+
from training_lib.hnet.train_loop import train_epoch
|
| 56 |
+
from training_lib.hnet.data import create_dataloaders
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
@hydra.main(version_base=None, config_path="configs", config_name="config")
|
| 60 |
+
def main(cfg: DictConfig):
|
| 61 |
+
"""Главная функция тренировки с поддержкой DDP через Accelerate."""
|
| 62 |
+
|
| 63 |
+
# === Accelerator Setup ===
|
| 64 |
+
mixed_precision = "bf16" if cfg.training.use_amp else "no"
|
| 65 |
+
|
| 66 |
+
accelerator = Accelerator(
|
| 67 |
+
mixed_precision=mixed_precision,
|
| 68 |
+
gradient_accumulation_steps=cfg.training.gradient_accumulation_steps,
|
| 69 |
+
)
|
| 70 |
+
|
| 71 |
+
# === Setup ===
|
| 72 |
+
accelerate_set_seed(cfg.seed)
|
| 73 |
+
|
| 74 |
+
if cfg.paths.output_dir is None:
|
| 75 |
+
cfg.paths.output_dir = HydraConfig.get().runtime.output_dir
|
| 76 |
+
|
| 77 |
+
OmegaConf.resolve(cfg)
|
| 78 |
+
|
| 79 |
+
log_message(
|
| 80 |
+
f"CUDA_VISIBLE_DEVICES: {os.environ.get('CUDA_VISIBLE_DEVICES', 'not set')}",
|
| 81 |
+
cfg,
|
| 82 |
+
accelerator,
|
| 83 |
+
)
|
| 84 |
+
log_message(f"Number of processes: {accelerator.num_processes}", cfg, accelerator)
|
| 85 |
+
log_message(f"Process index: {accelerator.process_index}", cfg, accelerator)
|
| 86 |
+
log_message(f"Mixed precision: {mixed_precision}", cfg, accelerator)
|
| 87 |
+
|
| 88 |
+
log_message("=" * 60, cfg, accelerator)
|
| 89 |
+
log_message(
|
| 90 |
+
"HNet Training Pipeline (Hydra + Trackio + Accelerate)", cfg, accelerator
|
| 91 |
+
)
|
| 92 |
+
log_message("=" * 60, cfg, accelerator)
|
| 93 |
+
log_message(f"Config:\n{OmegaConf.to_yaml(cfg)}", cfg, accelerator)
|
| 94 |
+
|
| 95 |
+
# === Trackio Init ===
|
| 96 |
+
init_tracking(cfg, accelerator)
|
| 97 |
+
|
| 98 |
+
# === Tokenizer ===
|
| 99 |
+
log_message("Initializing tokenizer...", cfg, accelerator)
|
| 100 |
+
tokenizer = ByteTokenizer()
|
| 101 |
+
|
| 102 |
+
# === Model ===
|
| 103 |
+
log_message("Loading model...", cfg, accelerator)
|
| 104 |
+
if cfg.model.checkpoint_path:
|
| 105 |
+
model = load_from_pretrained(
|
| 106 |
+
model_path=cfg.model.checkpoint_path,
|
| 107 |
+
model_config_path=cfg.model.config_path,
|
| 108 |
+
)
|
| 109 |
+
log_message(f"Loaded pretrained: {cfg.model.checkpoint_path}", cfg, accelerator)
|
| 110 |
+
else:
|
| 111 |
+
model = load_from_config(
|
| 112 |
+
model_config_path=cfg.model.config_path,
|
| 113 |
+
device="cpu",
|
| 114 |
+
)
|
| 115 |
+
model.init_weights()
|
| 116 |
+
log_message("Initialized from scratch", cfg, accelerator)
|
| 117 |
+
|
| 118 |
+
model.train()
|
| 119 |
+
|
| 120 |
+
# LR multiplier для разных стадий (до prepare!)
|
| 121 |
+
lr_multiplier = list(cfg.training.lr_multiplier)
|
| 122 |
+
model.apply_lr_multiplier(lr_multiplier)
|
| 123 |
+
log_message(f"Applied LR multipliers: {lr_multiplier}", cfg, accelerator)
|
| 124 |
+
|
| 125 |
+
# Warmup для Triton kernels
|
| 126 |
+
if cfg.training.warmup_model:
|
| 127 |
+
log_message("Warming up model...", cfg, accelerator)
|
| 128 |
+
model = model.to(accelerator.device)
|
| 129 |
+
model.warmup(verbose=accelerator.is_main_process)
|
| 130 |
+
|
| 131 |
+
# Log model info
|
| 132 |
+
total_params = sum(p.numel() for p in model.parameters())
|
| 133 |
+
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
|
| 134 |
+
log_message(f"Total params: {total_params:,}", cfg, accelerator)
|
| 135 |
+
log_message(f"Trainable params: {trainable_params:,}", cfg, accelerator)
|
| 136 |
+
|
| 137 |
+
# === Data ===
|
| 138 |
+
log_message("Creating dataloaders...", cfg, accelerator)
|
| 139 |
+
dataloaders = create_dataloaders(cfg, tokenizer)
|
| 140 |
+
|
| 141 |
+
train_dataloader = dataloaders["train"]
|
| 142 |
+
val_dataloader = dataloaders.get("validation", None)
|
| 143 |
+
|
| 144 |
+
log_message(
|
| 145 |
+
f"Train dataset size: {len(train_dataloader.dataset)}", cfg, accelerator
|
| 146 |
+
)
|
| 147 |
+
log_message(
|
| 148 |
+
f"Train batches per epoch (before DDP split): {len(train_dataloader)}",
|
| 149 |
+
cfg,
|
| 150 |
+
accelerator,
|
| 151 |
+
)
|
| 152 |
+
|
| 153 |
+
if val_dataloader:
|
| 154 |
+
log_message(
|
| 155 |
+
f"Validation dataset size: {len(val_dataloader.dataset)}", cfg, accelerator
|
| 156 |
+
)
|
| 157 |
+
log_message(f"Validation batches: {len(val_dataloader)}", cfg, accelerator)
|
| 158 |
+
else:
|
| 159 |
+
log_message("No validation dataset found", cfg, accelerator)
|
| 160 |
+
|
| 161 |
+
# === Optimizer ===
|
| 162 |
+
log_message("Creating optimizer...", cfg, accelerator)
|
| 163 |
+
param_groups = group_params(model)
|
| 164 |
+
|
| 165 |
+
for group in param_groups:
|
| 166 |
+
if "lr" not in group:
|
| 167 |
+
group["lr"] = cfg.training.lr
|
| 168 |
+
else:
|
| 169 |
+
group["lr"] = cfg.training.lr * group.get("lr_multiplier", 1.0)
|
| 170 |
+
if "weight_decay" not in group:
|
| 171 |
+
group["weight_decay"] = cfg.training.weight_decay
|
| 172 |
+
|
| 173 |
+
optimizer = torch.optim.AdamW(
|
| 174 |
+
param_groups,
|
| 175 |
+
lr=cfg.training.lr,
|
| 176 |
+
betas=tuple(cfg.training.betas),
|
| 177 |
+
eps=cfg.training.eps,
|
| 178 |
+
)
|
| 179 |
+
|
| 180 |
+
# === Scheduler ===
|
| 181 |
+
steps_per_epoch = math.ceil(len(train_dataloader) / accelerator.num_processes)
|
| 182 |
+
total_steps = (
|
| 183 |
+
cfg.training.epochs
|
| 184 |
+
* steps_per_epoch
|
| 185 |
+
// cfg.training.gradient_accumulation_steps
|
| 186 |
+
)
|
| 187 |
+
scheduler = get_lr_scheduler(optimizer, cfg, total_steps)
|
| 188 |
+
|
| 189 |
+
log_message(
|
| 190 |
+
f"Total steps: {total_steps}, Steps per epoch: {steps_per_epoch}",
|
| 191 |
+
cfg,
|
| 192 |
+
accelerator,
|
| 193 |
+
)
|
| 194 |
+
|
| 195 |
+
# === Accelerate Prepare ===
|
| 196 |
+
log_message(
|
| 197 |
+
"Preparing model, optimizer, and dataloaders with Accelerate...",
|
| 198 |
+
cfg,
|
| 199 |
+
accelerator,
|
| 200 |
+
)
|
| 201 |
+
|
| 202 |
+
if val_dataloader is not None:
|
| 203 |
+
model, optimizer, train_dataloader, val_dataloader, scheduler = (
|
| 204 |
+
accelerator.prepare(
|
| 205 |
+
model, optimizer, train_dataloader, val_dataloader, scheduler
|
| 206 |
+
)
|
| 207 |
+
)
|
| 208 |
+
else:
|
| 209 |
+
model, optimizer, train_dataloader, scheduler = accelerator.prepare(
|
| 210 |
+
model, optimizer, train_dataloader, scheduler
|
| 211 |
+
)
|
| 212 |
+
|
| 213 |
+
log_message(
|
| 214 |
+
f"Train batches per epoch (after DDP split): {len(train_dataloader)}",
|
| 215 |
+
cfg,
|
| 216 |
+
accelerator,
|
| 217 |
+
)
|
| 218 |
+
|
| 219 |
+
# === Resume ===
|
| 220 |
+
global_step = 0
|
| 221 |
+
start_epoch = 1
|
| 222 |
+
|
| 223 |
+
if cfg.training.resume and cfg.training.resume_checkpoint:
|
| 224 |
+
global_step, start_epoch = load_checkpoint(
|
| 225 |
+
model,
|
| 226 |
+
optimizer,
|
| 227 |
+
scheduler,
|
| 228 |
+
cfg.training.resume_checkpoint,
|
| 229 |
+
cfg,
|
| 230 |
+
accelerator,
|
| 231 |
+
)
|
| 232 |
+
start_epoch += 1
|
| 233 |
+
|
| 234 |
+
# === Training Loop ===
|
| 235 |
+
log_message("Starting training...", cfg, accelerator)
|
| 236 |
+
|
| 237 |
+
best_val_loss = float("inf")
|
| 238 |
+
|
| 239 |
+
try:
|
| 240 |
+
for epoch in range(start_epoch, cfg.training.epochs + 1):
|
| 241 |
+
log_message(f"\n{'=' * 60}", cfg, accelerator)
|
| 242 |
+
log_message(f"EPOCH {epoch}/{cfg.training.epochs}", cfg, accelerator)
|
| 243 |
+
log_message(f"{'=' * 60}", cfg, accelerator)
|
| 244 |
+
|
| 245 |
+
global_step, best_val_loss = train_epoch(
|
| 246 |
+
model=model,
|
| 247 |
+
dataloader=train_dataloader,
|
| 248 |
+
optimizer=optimizer,
|
| 249 |
+
scheduler=scheduler,
|
| 250 |
+
cfg=cfg,
|
| 251 |
+
epoch=epoch,
|
| 252 |
+
global_step=global_step,
|
| 253 |
+
accelerator=accelerator,
|
| 254 |
+
val_dataloader=val_dataloader,
|
| 255 |
+
best_val_loss=best_val_loss,
|
| 256 |
+
)
|
| 257 |
+
|
| 258 |
+
if cfg.logging.save_every_epoch:
|
| 259 |
+
save_checkpoint(
|
| 260 |
+
model, optimizer, scheduler, global_step, epoch, cfg, accelerator
|
| 261 |
+
)
|
| 262 |
+
|
| 263 |
+
except KeyboardInterrupt:
|
| 264 |
+
log_message("Training interrupted by user", cfg, accelerator)
|
| 265 |
+
save_checkpoint(
|
| 266 |
+
model, optimizer, scheduler, global_step, epoch, cfg, accelerator
|
| 267 |
+
)
|
| 268 |
+
|
| 269 |
+
# === Final Save ===
|
| 270 |
+
log_message("\nTraining completed!", cfg, accelerator)
|
| 271 |
+
|
| 272 |
+
if accelerator.is_main_process:
|
| 273 |
+
final_model_path = Path(cfg.paths.output_dir) / "model_final.pt"
|
| 274 |
+
unwrapped_model = accelerator.unwrap_model(model)
|
| 275 |
+
torch.save(unwrapped_model.state_dict(), final_model_path)
|
| 276 |
+
log_message(f"Final model: {final_model_path}", cfg, accelerator)
|
| 277 |
+
|
| 278 |
+
accelerator.wait_for_everyone()
|
| 279 |
+
accelerator.end_training()
|
| 280 |
+
finish_tracking()
|
| 281 |
+
|
| 282 |
+
|
| 283 |
+
if __name__ == "__main__":
|
| 284 |
+
main()
|
train_hnet_with_docstring_18_04/wandb/run-20260417_085757-sa79g3yl/files/config.yaml
ADDED
|
@@ -0,0 +1,149 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
wandb_version: 1
|
| 2 |
+
|
| 3 |
+
_wandb:
|
| 4 |
+
desc: null
|
| 5 |
+
value:
|
| 6 |
+
code_path: code/code_completion_exp/train_hnet/train.py
|
| 7 |
+
python_version: 3.12.0
|
| 8 |
+
cli_version: 0.24.0
|
| 9 |
+
framework: huggingface
|
| 10 |
+
huggingface_version: 4.57.6
|
| 11 |
+
is_jupyter_run: false
|
| 12 |
+
is_kaggle_kernel: false
|
| 13 |
+
start_time: 1776416277
|
| 14 |
+
t:
|
| 15 |
+
1:
|
| 16 |
+
- 1
|
| 17 |
+
- 11
|
| 18 |
+
- 49
|
| 19 |
+
- 50
|
| 20 |
+
- 51
|
| 21 |
+
- 71
|
| 22 |
+
- 105
|
| 23 |
+
2:
|
| 24 |
+
- 1
|
| 25 |
+
- 11
|
| 26 |
+
- 49
|
| 27 |
+
- 50
|
| 28 |
+
- 51
|
| 29 |
+
- 71
|
| 30 |
+
- 105
|
| 31 |
+
3:
|
| 32 |
+
- 2
|
| 33 |
+
- 13
|
| 34 |
+
- 16
|
| 35 |
+
- 37
|
| 36 |
+
- 42
|
| 37 |
+
- 61
|
| 38 |
+
4: 3.12.0
|
| 39 |
+
5: 0.24.0
|
| 40 |
+
6: 4.57.6
|
| 41 |
+
13: linux-x86_64
|
| 42 |
+
e:
|
| 43 |
+
1enfm68bplbg421e1aqnc3guby2j3hk6:
|
| 44 |
+
os: Linux-5.15.0-173-generic-x86_64-with-glibc2.39
|
| 45 |
+
python: CPython 3.12.0
|
| 46 |
+
started_at: '2026-04-17T08:57:57.464191Z'
|
| 47 |
+
program: /workspace/byte-llms-code/code_completion_exp/train_hnet/train.py
|
| 48 |
+
code_path: code_completion_exp/train_hnet/train.py
|
| 49 |
+
code_path_local: train.py
|
| 50 |
+
git:
|
| 51 |
+
remote_url: https://github.com/naryst/byte-llms-code.git
|
| 52 |
+
commit: 056a135fbb34bc28ed3adfeeb2f4ac97cbf12a89
|
| 53 |
+
email: nikita@local.ru
|
| 54 |
+
root: /workspace/byte-llms-code/code_completion_exp/train_hnet
|
| 55 |
+
host: 3e675e030992
|
| 56 |
+
executable: /venv/bytellm/bin/python
|
| 57 |
+
cpu_count: 112
|
| 58 |
+
cpu_count_logical: 224
|
| 59 |
+
gpu_type: NVIDIA H100 80GB HBM3
|
| 60 |
+
gpu_count: 2
|
| 61 |
+
disk:
|
| 62 |
+
/:
|
| 63 |
+
total: '244813135872'
|
| 64 |
+
used: '36382741504'
|
| 65 |
+
memory:
|
| 66 |
+
total: '1622968434688'
|
| 67 |
+
gpu_nvidia:
|
| 68 |
+
- name: NVIDIA H100 80GB HBM3
|
| 69 |
+
memory_total: '85520809984'
|
| 70 |
+
cuda_cores: 16896
|
| 71 |
+
architecture: Hopper
|
| 72 |
+
uuid: GPU-3c87d2f8-c595-49bd-bb1d-1ebfd19c6fb0
|
| 73 |
+
- name: NVIDIA H100 80GB HBM3
|
| 74 |
+
memory_total: '85520809984'
|
| 75 |
+
cuda_cores: 16896
|
| 76 |
+
architecture: Hopper
|
| 77 |
+
uuid: GPU-beb9a6b0-ebef-1f4c-d886-465c96f57ca4
|
| 78 |
+
cuda_version: '12.9'
|
| 79 |
+
writer_id: 1enfm68bplbg421e1aqnc3guby2j3hk6
|
| 80 |
+
model:
|
| 81 |
+
desc: null
|
| 82 |
+
value:
|
| 83 |
+
config_path: /workspace/byte-llms-code/hnet_project/configs/hnet_2stage_XL_code.json
|
| 84 |
+
checkpoint_path: /workspace/byte-llms-code/hnet_project/checkpoints/hnet_2stage_XL_code.pt
|
| 85 |
+
training:
|
| 86 |
+
desc: null
|
| 87 |
+
value:
|
| 88 |
+
epochs: 3
|
| 89 |
+
batch_size: 4
|
| 90 |
+
eval_batch_size: 24
|
| 91 |
+
gradient_accumulation_steps: 4
|
| 92 |
+
lr: 0.0001
|
| 93 |
+
weight_decay: 0.1
|
| 94 |
+
betas:
|
| 95 |
+
- 0.9
|
| 96 |
+
- 0.95
|
| 97 |
+
eps: 1.0e-08
|
| 98 |
+
lr_scheduler: wsd
|
| 99 |
+
warmup_ratio: 0.1
|
| 100 |
+
decay_ratio: 0.2
|
| 101 |
+
warmup_steps: 100
|
| 102 |
+
min_lr_ratio: 0.1
|
| 103 |
+
lr_multiplier:
|
| 104 |
+
- 2.0
|
| 105 |
+
- 1.5
|
| 106 |
+
- 1.0
|
| 107 |
+
load_balancing_weight: 0.01
|
| 108 |
+
load_balancing_N: 4.0
|
| 109 |
+
max_grad_norm: 1.0
|
| 110 |
+
use_amp: true
|
| 111 |
+
resume: false
|
| 112 |
+
resume_checkpoint: null
|
| 113 |
+
warmup_model: true
|
| 114 |
+
data:
|
| 115 |
+
desc: null
|
| 116 |
+
value:
|
| 117 |
+
path: /workspace/byte-llms-code/code_completion_exp/datasets/data_V5_full
|
| 118 |
+
max_context_len: 4096
|
| 119 |
+
max_target_len: 256
|
| 120 |
+
num_workers: 0
|
| 121 |
+
pin_memory: true
|
| 122 |
+
max_train_samples: null
|
| 123 |
+
max_val_samples: null
|
| 124 |
+
logging:
|
| 125 |
+
desc: null
|
| 126 |
+
value:
|
| 127 |
+
log_interval: 10
|
| 128 |
+
save_interval: 3000
|
| 129 |
+
eval_interval: 1000
|
| 130 |
+
save_every_epoch: true
|
| 131 |
+
tracking:
|
| 132 |
+
desc: null
|
| 133 |
+
value:
|
| 134 |
+
enabled: true
|
| 135 |
+
backend: wandb
|
| 136 |
+
project: code-completion-full-docstring
|
| 137 |
+
run_name: hnet_train
|
| 138 |
+
entity: null
|
| 139 |
+
base_url: https://wandb.platun0v.ru
|
| 140 |
+
paths:
|
| 141 |
+
desc: null
|
| 142 |
+
value:
|
| 143 |
+
output_dir: outputs/2026-04-17/08-57-56
|
| 144 |
+
seed:
|
| 145 |
+
desc: null
|
| 146 |
+
value: 42
|
| 147 |
+
device:
|
| 148 |
+
desc: null
|
| 149 |
+
value: cuda
|
train_hnet_with_docstring_18_04/wandb/run-20260417_085757-sa79g3yl/files/output.log
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
train_hnet_with_docstring_18_04/wandb/run-20260417_085757-sa79g3yl/files/wandb-summary.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"_runtime": 24431, "train/loss": 0.2280647549778223, "train/loss_avg": 0.20581580642644212, "train/lb_loss": 1.0840826034545898, "train/epoch": 3, "train/lm_loss": 0.17879723012447357, "train/lr": 1e-05, "train/step_time": 0.6833733320236206, "train/chunk_len_stage0": 2.8951674623156975, "train/hard_boundary_ratio_stage1": 0.4156030081584442, "train/soft_boundary_ratio_stage0": 0.3467458609387149, "train/chunk_len_stage1": 2.415668312621829, "train/hard_boundary_ratio_stage0": 0.3473884895159008, "train/soft_boundary_ratio_stage1": 0.38436046590206835, "_timestamp": 1776440679.1860769, "_step": 31770, "best/val_loss": 0.3120614947675138, "val/perplexity": 1.3806528571733947, "val/loss": 0.3327806241316151, "best/val_perplexity": 1.352240898860508, "best/step": 10000, "val/lm_loss": 0.3219491058048241, "val/lb_loss": 1.0831518805756861, "val/time": 107.17485237121582, "epoch/lm_loss": 0.19445337861685316, "epoch/chunk_len_stage1": 2.4157022583211734, "epoch/time": 7926.80414223671, "epoch/chunk_len_stage0": 2.895147230253075, "epoch/hard_boundary_ratio_stage0": 0.34739050784790954, "epoch/soft_boundary_ratio_stage1": 0.38435656324014733, "epoch/lb_loss": 1.0874687482092606, "epoch/hard_boundary_ratio_stage1": 0.41559792626858155, "epoch/soft_boundary_ratio_stage0": 0.3467473082940784, "epoch/loss": 0.20583635369858655}
|