narySt committed on Apr 18

Commit

f7b8074

verified ·

1 Parent(s): e98d476

Add files using upload-large-folder tool

Browse files

Files changed (18) hide show

train_hnet_with_docstring_18_04/checkpoints/checkpoint_latest.pt +3 -0
train_hnet_with_docstring_18_04/checkpoints/checkpoint_step_10591.pt +3 -0
train_hnet_with_docstring_18_04/checkpoints/checkpoint_step_12000.pt +3 -0
train_hnet_with_docstring_18_04/checkpoints/checkpoint_step_15000.pt +3 -0
train_hnet_with_docstring_18_04/checkpoints/checkpoint_step_18000.pt +3 -0
train_hnet_with_docstring_18_04/checkpoints/checkpoint_step_21000.pt +3 -0
train_hnet_with_docstring_18_04/checkpoints/checkpoint_step_21182.pt +3 -0
train_hnet_with_docstring_18_04/checkpoints/checkpoint_step_24000.pt +3 -0
train_hnet_with_docstring_18_04/checkpoints/checkpoint_step_27000.pt +3 -0
train_hnet_with_docstring_18_04/checkpoints/checkpoint_step_3000.pt +3 -0
train_hnet_with_docstring_18_04/checkpoints/checkpoint_step_30000.pt +3 -0
train_hnet_with_docstring_18_04/checkpoints/checkpoint_step_31773.pt +3 -0
train_hnet_with_docstring_18_04/checkpoints/checkpoint_step_6000.pt +3 -0
train_hnet_with_docstring_18_04/checkpoints/checkpoint_step_9000.pt +3 -0
train_hnet_with_docstring_18_04/wandb/run-20260417_085757-sa79g3yl/files/code/code_completion_exp/train_hnet/train.py +284 -0
train_hnet_with_docstring_18_04/wandb/run-20260417_085757-sa79g3yl/files/config.yaml +149 -0
train_hnet_with_docstring_18_04/wandb/run-20260417_085757-sa79g3yl/files/output.log +0 -0
train_hnet_with_docstring_18_04/wandb/run-20260417_085757-sa79g3yl/files/wandb-summary.json +1 -0

train_hnet_with_docstring_18_04/checkpoints/checkpoint_latest.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:961e914acf254f75d3accdac545096cd7e66da917bf6c963bea9be50aa32f8ed
+size 9945483438

train_hnet_with_docstring_18_04/checkpoints/checkpoint_step_10591.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7ea9ea7277f4386806ae2b7d1b24fe9685a784ba1b67eb42de3289235b795f22
+size 9945491982

train_hnet_with_docstring_18_04/checkpoints/checkpoint_step_12000.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d8366292499e86c4d205f93253a7d2637fb2906de8e007d91b761df58d4b3e73
+size 9945491982

train_hnet_with_docstring_18_04/checkpoints/checkpoint_step_15000.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4e3552eccc68fefd58c733b162db0de202d60b76ae8e17478366e21a3e7ffa96
+size 9945491982

train_hnet_with_docstring_18_04/checkpoints/checkpoint_step_18000.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:42018b9f036477e626fce515690532aa077d1c047a0a0f10fd2ebb543129b7a4
+size 9945491982

train_hnet_with_docstring_18_04/checkpoints/checkpoint_step_21000.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1ae7bba9fbba6448d718d6409b669077cdcc0f8a64bf7a212b50ce321e6c6f2c
+size 9945491982

train_hnet_with_docstring_18_04/checkpoints/checkpoint_step_21182.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:040b60ed6ec60b823c58a2b90120741e53a3c56f57288f5dfeb7e60e1e665670
+size 9945491982

train_hnet_with_docstring_18_04/checkpoints/checkpoint_step_24000.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9e2b3de0508edc501ab43c173056ecde4fcdb1768eb1d9ec409ff6c5389deaea
+size 9945491982

train_hnet_with_docstring_18_04/checkpoints/checkpoint_step_27000.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:40892f5e0016e59f15f215e22cc48166c79f06b71171de4e21917e911efba4a3
+size 9945491982

train_hnet_with_docstring_18_04/checkpoints/checkpoint_step_3000.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:26d89d1f75945923fb330722f732782c4eca586acf469e4cd5e870e8b06dd039
+size 9945490614

train_hnet_with_docstring_18_04/checkpoints/checkpoint_step_30000.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:823dd047ef1656da439ac9357b55e474a12794282d6ebe8f5d2a51139f8ce0ba
+size 9945491982

train_hnet_with_docstring_18_04/checkpoints/checkpoint_step_31773.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:86d505541cd3af319175317281d89138ff0f0f9101224b2a323ea1de4edc5a8e
+size 9945491982

train_hnet_with_docstring_18_04/checkpoints/checkpoint_step_6000.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a9201a7652769dbd11bb72591759a279f796a3fcd5a6bf5117e97727ce561bf5
+size 9945490614

train_hnet_with_docstring_18_04/checkpoints/checkpoint_step_9000.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:130d0d1c2703407af109e8dde865191f4968766ebf0dc68e42890363c8f9c43e
+size 9945490614

train_hnet_with_docstring_18_04/wandb/run-20260417_085757-sa79g3yl/files/code/code_completion_exp/train_hnet/train.py ADDED Viewed

	@@ -0,0 +1,284 @@

+"""
+Training Pipeline для HNet модели на задаче Code Completion.
+Конфигурация через Hydra + OmegaConf, логирование в Trackio.
+Поддержка DDP через Accelerate для multi-GPU тренировки.
+Использование:
+    # Базовый запуск (single GPU)
+    python train.py
+    # Multi-GPU с Accelerate
+    accelerate launch train.py
+    # Multi-GPU с указанием количества GPU
+    accelerate launch --num_processes=4 train.py
+    # Переопределение параметров через CLI
+    python train.py training.lr=1e-4 training.epochs=5
+    # Выбор другого конфига модели
+    python train.py model=hnet_small
+    # Multirun (sweep)
+    python train.py --multirun training.lr=1e-4,3e-4,1e-3
+    # Без логирования
+    python train.py tracking.enabled=false
+"""
+import os
+import math
+from pathlib import Path
+import torch
+import hydra
+from hydra.core.hydra_config import HydraConfig
+from omegaconf import DictConfig, OmegaConf
+from accelerate import Accelerator
+from accelerate.utils import set_seed as accelerate_set_seed
+# HNet imports
+from hnet.load_utils import load_from_pretrained, load_from_config
+from hnet.utils.tokenizers import ByteTokenizer
+from hnet.utils.train import group_params
+# Ensure repo root is on sys.path (needed when running from subdirectory)
+import sys
+sys.path.insert(0, str(Path(__file__).resolve().parents[2]))
+# Shared training library
+from training_lib.utils import log_message
+from training_lib.checkpointing import save_checkpoint, load_checkpoint
+from training_lib.schedulers import get_lr_scheduler
+from training_lib.tracking import init_tracking, finish_tracking
+from training_lib.hnet.train_loop import train_epoch
+from training_lib.hnet.data import create_dataloaders
+@hydra.main(version_base=None, config_path="configs", config_name="config")
+def main(cfg: DictConfig):
+    """Run HNet training end to end, with DDP support via Accelerate.
+
+    Builds the Accelerator, tokenizer, model, dataloaders, optimizer and LR
+    scheduler from ``cfg``, runs the epoch loop (optionally resuming from a
+    checkpoint), and finally writes the unwrapped model's state dict to
+    ``model_final.pt`` inside ``cfg.paths.output_dir``.
+
+    Args:
+        cfg: Configuration composed by Hydra from ``configs/config``. This
+            function reads the ``training``, ``model``, ``paths``, ``logging``
+            and ``seed`` entries and passes ``cfg`` on to the project helpers.
+    """
+    # === Accelerator Setup ===
+    # bf16 mixed precision when training.use_amp is set; otherwise disabled.
+    mixed_precision = "bf16" if cfg.training.use_amp else "no"
+    accelerator = Accelerator(
+        mixed_precision=mixed_precision,
+        gradient_accumulation_steps=cfg.training.gradient_accumulation_steps,
+    )
+    # === Setup ===
+    accelerate_set_seed(cfg.seed)
+    # Fall back to Hydra's per-run output directory when none is configured.
+    if cfg.paths.output_dir is None:
+        cfg.paths.output_dir = HydraConfig.get().runtime.output_dir
+    # Resolve interpolations before the config is logged and passed around.
+    OmegaConf.resolve(cfg)
+    log_message(
+        f"CUDA_VISIBLE_DEVICES: {os.environ.get('CUDA_VISIBLE_DEVICES', 'not set')}",
+        cfg,
+        accelerator,
+    )
+    log_message(f"Number of processes: {accelerator.num_processes}", cfg, accelerator)
+    log_message(f"Process index: {accelerator.process_index}", cfg, accelerator)
+    log_message(f"Mixed precision: {mixed_precision}", cfg, accelerator)
+    log_message("=" * 60, cfg, accelerator)
+    log_message(
+        "HNet Training Pipeline (Hydra + Trackio + Accelerate)", cfg, accelerator
+    )
+    log_message("=" * 60, cfg, accelerator)
+    log_message(f"Config:\n{OmegaConf.to_yaml(cfg)}", cfg, accelerator)
+    # === Trackio Init ===
+    init_tracking(cfg, accelerator)
+    # === Tokenizer ===
+    log_message("Initializing tokenizer...", cfg, accelerator)
+    tokenizer = ByteTokenizer()
+    # === Model ===
+    log_message("Loading model...", cfg, accelerator)
+    # A configured checkpoint path selects pretrained weights; otherwise the
+    # model is built from its config on CPU and its weights are initialised.
+    if cfg.model.checkpoint_path:
+        model = load_from_pretrained(
+            model_path=cfg.model.checkpoint_path,
+            model_config_path=cfg.model.config_path,
+        )
+        log_message(f"Loaded pretrained: {cfg.model.checkpoint_path}", cfg, accelerator)
+    else:
+        model = load_from_config(
+            model_config_path=cfg.model.config_path,
+            device="cpu",
+        )
+        model.init_weights()
+        log_message("Initialized from scratch", cfg, accelerator)
+    model.train()
+    # LR multipliers for the different stages (apply before prepare!)
+    lr_multiplier = list(cfg.training.lr_multiplier)
+    model.apply_lr_multiplier(lr_multiplier)
+    log_message(f"Applied LR multipliers: {lr_multiplier}", cfg, accelerator)
+    # Warmup for Triton kernels
+    if cfg.training.warmup_model:
+        log_message("Warming up model...", cfg, accelerator)
+        model = model.to(accelerator.device)
+        model.warmup(verbose=accelerator.is_main_process)
+    # Log model info
+    total_params = sum(p.numel() for p in model.parameters())
+    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
+    log_message(f"Total params: {total_params:,}", cfg, accelerator)
+    log_message(f"Trainable params: {trainable_params:,}", cfg, accelerator)
+    # === Data ===
+    log_message("Creating dataloaders...", cfg, accelerator)
+    dataloaders = create_dataloaders(cfg, tokenizer)
+    train_dataloader = dataloaders["train"]
+    # The validation split is optional; None disables the validation branches.
+    val_dataloader = dataloaders.get("validation", None)
+    log_message(
+        f"Train dataset size: {len(train_dataloader.dataset)}", cfg, accelerator
+    )
+    log_message(
+        f"Train batches per epoch (before DDP split): {len(train_dataloader)}",
+        cfg,
+        accelerator,
+    )
+    if val_dataloader:
+        log_message(
+            f"Validation dataset size: {len(val_dataloader.dataset)}", cfg, accelerator
+        )
+        log_message(f"Validation batches: {len(val_dataloader)}", cfg, accelerator)
+    else:
+        log_message("No validation dataset found", cfg, accelerator)
+    # === Optimizer ===
+    log_message("Creating optimizer...", cfg, accelerator)
+    param_groups = group_params(model)
+    # Fill in per-group defaults. A group that already carries "lr" has it
+    # recomputed as base LR * its "lr_multiplier" (1.0 when absent).
+    # NOTE(review): a group that has "lr_multiplier" but no "lr" key takes the
+    # first branch and gets the unscaled base LR - confirm that group_params()
+    # emits an "lr" key, otherwise the multipliers above are never applied.
+    for group in param_groups:
+        if "lr" not in group:
+            group["lr"] = cfg.training.lr
+        else:
+            group["lr"] = cfg.training.lr * group.get("lr_multiplier", 1.0)
+        if "weight_decay" not in group:
+            group["weight_decay"] = cfg.training.weight_decay
+    optimizer = torch.optim.AdamW(
+        param_groups,
+        lr=cfg.training.lr,
+        betas=tuple(cfg.training.betas),
+        eps=cfg.training.eps,
+    )
+    # === Scheduler ===
+    # steps_per_epoch counts batches per process (pre-prepare length divided by
+    # the process count); total_steps divides by the accumulation factor.
+    # NOTE(review): assumes prepare() shards the train loader evenly across
+    # processes - compare with the "after DDP split" length logged below.
+    steps_per_epoch = math.ceil(len(train_dataloader) / accelerator.num_processes)
+    total_steps = (
+        cfg.training.epochs
+        * steps_per_epoch
+        // cfg.training.gradient_accumulation_steps
+    )
+    scheduler = get_lr_scheduler(optimizer, cfg, total_steps)
+    log_message(
+        f"Total steps: {total_steps}, Steps per epoch: {steps_per_epoch}",
+        cfg,
+        accelerator,
+    )
+    # === Accelerate Prepare ===
+    log_message(
+        "Preparing model, optimizer, and dataloaders with Accelerate...",
+        cfg,
+        accelerator,
+    )
+    # Only hand the validation loader to prepare() when one exists.
+    if val_dataloader is not None:
+        model, optimizer, train_dataloader, val_dataloader, scheduler = (
+            accelerator.prepare(
+                model, optimizer, train_dataloader, val_dataloader, scheduler
+            )
+        )
+    else:
+        model, optimizer, train_dataloader, scheduler = accelerator.prepare(
+            model, optimizer, train_dataloader, scheduler
+        )
+    log_message(
+        f"Train batches per epoch (after DDP split): {len(train_dataloader)}",
+        cfg,
+        accelerator,
+    )
+    # === Resume ===
+    global_step = 0
+    start_epoch = 1
+    # Resuming requires both the flag and a checkpoint path.
+    # NOTE(review): start_epoch is bumped by one after loading - presumably the
+    # checkpoint records the last completed epoch; confirm in load_checkpoint.
+    if cfg.training.resume and cfg.training.resume_checkpoint:
+        global_step, start_epoch = load_checkpoint(
+            model,
+            optimizer,
+            scheduler,
+            cfg.training.resume_checkpoint,
+            cfg,
+            accelerator,
+        )
+        start_epoch += 1
+    # === Training Loop ===
+    log_message("Starting training...", cfg, accelerator)
+    best_val_loss = float("inf")
+    try:
+        # Epochs are 1-based and inclusive of cfg.training.epochs.
+        for epoch in range(start_epoch, cfg.training.epochs + 1):
+            log_message(f"\n{'=' * 60}", cfg, accelerator)
+            log_message(f"EPOCH {epoch}/{cfg.training.epochs}", cfg, accelerator)
+            log_message(f"{'=' * 60}", cfg, accelerator)
+            global_step, best_val_loss = train_epoch(
+                model=model,
+                dataloader=train_dataloader,
+                optimizer=optimizer,
+                scheduler=scheduler,
+                cfg=cfg,
+                epoch=epoch,
+                global_step=global_step,
+                accelerator=accelerator,
+                val_dataloader=val_dataloader,
+                best_val_loss=best_val_loss,
+            )
+            if cfg.logging.save_every_epoch:
+                save_checkpoint(
+                    model, optimizer, scheduler, global_step, epoch, cfg, accelerator
+                )
+    except KeyboardInterrupt:
+        # Checkpoint on Ctrl+C; the exception is not re-raised, so execution
+        # continues to the final save below.
+        # NOTE(review): `epoch` is unbound if the interrupt arrives before the
+        # first loop iteration has started.
+        log_message("Training interrupted by user", cfg, accelerator)
+        save_checkpoint(
+            model, optimizer, scheduler, global_step, epoch, cfg, accelerator
+        )
+    # === Final Save ===
+    log_message("\nTraining completed!", cfg, accelerator)
+    # Only the main process writes the unwrapped model's state dict.
+    if accelerator.is_main_process:
+        final_model_path = Path(cfg.paths.output_dir) / "model_final.pt"
+        unwrapped_model = accelerator.unwrap_model(model)
+        torch.save(unwrapped_model.state_dict(), final_model_path)
+        log_message(f"Final model: {final_model_path}", cfg, accelerator)
+    accelerator.wait_for_everyone()
+    accelerator.end_training()
+    finish_tracking()
+if __name__ == "__main__":
+    main()

train_hnet_with_docstring_18_04/wandb/run-20260417_085757-sa79g3yl/files/config.yaml ADDED Viewed

	@@ -0,0 +1,149 @@

+wandb_version: 1
+_wandb:
+  desc: null
+  value:
+    code_path: code/code_completion_exp/train_hnet/train.py
+    python_version: 3.12.0
+    cli_version: 0.24.0
+    framework: huggingface
+    huggingface_version: 4.57.6
+    is_jupyter_run: false
+    is_kaggle_kernel: false
+    start_time: 1776416277
+    t:
+      1:
+      - 1
+      - 11
+      - 49
+      - 50
+      - 51
+      - 71
+      - 105
+      2:
+      - 1
+      - 11
+      - 49
+      - 50
+      - 51
+      - 71
+      - 105
+      3:
+      - 2
+      - 13
+      - 16
+      - 37
+      - 42
+      - 61
+      4: 3.12.0
+      5: 0.24.0
+      6: 4.57.6
+      13: linux-x86_64
+    e:
+      1enfm68bplbg421e1aqnc3guby2j3hk6:
+        os: Linux-5.15.0-173-generic-x86_64-with-glibc2.39
+        python: CPython 3.12.0
+        started_at: '2026-04-17T08:57:57.464191Z'
+        program: /workspace/byte-llms-code/code_completion_exp/train_hnet/train.py
+        code_path: code_completion_exp/train_hnet/train.py
+        code_path_local: train.py
+        git:
+          remote_url: https://github.com/naryst/byte-llms-code.git
+          commit: 056a135fbb34bc28ed3adfeeb2f4ac97cbf12a89
+        email: nikita@local.ru
+        root: /workspace/byte-llms-code/code_completion_exp/train_hnet
+        host: 3e675e030992
+        executable: /venv/bytellm/bin/python
+        cpu_count: 112
+        cpu_count_logical: 224
+        gpu_type: NVIDIA H100 80GB HBM3
+        gpu_count: 2
+        disk:
+          /:
+            total: '244813135872'
+            used: '36382741504'
+        memory:
+          total: '1622968434688'
+        gpu_nvidia:
+        - name: NVIDIA H100 80GB HBM3
+          memory_total: '85520809984'
+          cuda_cores: 16896
+          architecture: Hopper
+          uuid: GPU-3c87d2f8-c595-49bd-bb1d-1ebfd19c6fb0
+        - name: NVIDIA H100 80GB HBM3
+          memory_total: '85520809984'
+          cuda_cores: 16896
+          architecture: Hopper
+          uuid: GPU-beb9a6b0-ebef-1f4c-d886-465c96f57ca4
+        cuda_version: '12.9'
+        writer_id: 1enfm68bplbg421e1aqnc3guby2j3hk6
+model:
+  desc: null
+  value:
+    config_path: /workspace/byte-llms-code/hnet_project/configs/hnet_2stage_XL_code.json
+    checkpoint_path: /workspace/byte-llms-code/hnet_project/checkpoints/hnet_2stage_XL_code.pt
+training:
+  desc: null
+  value:
+    epochs: 3
+    batch_size: 4
+    eval_batch_size: 24
+    gradient_accumulation_steps: 4
+    lr: 0.0001
+    weight_decay: 0.1
+    betas:
+    - 0.9
+    - 0.95
+    eps: 1.0e-08
+    lr_scheduler: wsd
+    warmup_ratio: 0.1
+    decay_ratio: 0.2
+    warmup_steps: 100
+    min_lr_ratio: 0.1
+    lr_multiplier:
+    - 2.0
+    - 1.5
+    - 1.0
+    load_balancing_weight: 0.01
+    load_balancing_N: 4.0
+    max_grad_norm: 1.0
+    use_amp: true
+    resume: false
+    resume_checkpoint: null
+    warmup_model: true
+data:
+  desc: null
+  value:
+    path: /workspace/byte-llms-code/code_completion_exp/datasets/data_V5_full
+    max_context_len: 4096
+    max_target_len: 256
+    num_workers: 0
+    pin_memory: true
+    max_train_samples: null
+    max_val_samples: null
+logging:
+  desc: null
+  value:
+    log_interval: 10
+    save_interval: 3000
+    eval_interval: 1000
+    save_every_epoch: true
+tracking:
+  desc: null
+  value:
+    enabled: true
+    backend: wandb
+    project: code-completion-full-docstring
+    run_name: hnet_train
+    entity: null
+    base_url: https://wandb.platun0v.ru
+paths:
+  desc: null
+  value:
+    output_dir: outputs/2026-04-17/08-57-56
+seed:
+  desc: null
+  value: 42
+device:
+  desc: null
+  value: cuda

train_hnet_with_docstring_18_04/wandb/run-20260417_085757-sa79g3yl/files/output.log ADDED Viewed

The diff for this file is too large to render. See raw diff

train_hnet_with_docstring_18_04/wandb/run-20260417_085757-sa79g3yl/files/wandb-summary.json ADDED Viewed

	@@ -0,0 +1 @@

+ {"_runtime": 24431, "train/loss": 0.2280647549778223, "train/loss_avg": 0.20581580642644212, "train/lb_loss": 1.0840826034545898, "train/epoch": 3, "train/lm_loss": 0.17879723012447357, "train/lr": 1e-05, "train/step_time": 0.6833733320236206, "train/chunk_len_stage0": 2.8951674623156975, "train/hard_boundary_ratio_stage1": 0.4156030081584442, "train/soft_boundary_ratio_stage0": 0.3467458609387149, "train/chunk_len_stage1": 2.415668312621829, "train/hard_boundary_ratio_stage0": 0.3473884895159008, "train/soft_boundary_ratio_stage1": 0.38436046590206835, "_timestamp": 1776440679.1860769, "_step": 31770, "best/val_loss": 0.3120614947675138, "val/perplexity": 1.3806528571733947, "val/loss": 0.3327806241316151, "best/val_perplexity": 1.352240898860508, "best/step": 10000, "val/lm_loss": 0.3219491058048241, "val/lb_loss": 1.0831518805756861, "val/time": 107.17485237121582, "epoch/lm_loss": 0.19445337861685316, "epoch/chunk_len_stage1": 2.4157022583211734, "epoch/time": 7926.80414223671, "epoch/chunk_len_stage0": 2.895147230253075, "epoch/hard_boundary_ratio_stage0": 0.34739050784790954, "epoch/soft_boundary_ratio_stage1": 0.38435656324014733, "epoch/lb_loss": 1.0874687482092606, "epoch/hard_boundary_ratio_stage1": 0.41559792626858155, "epoch/soft_boundary_ratio_stage0": 0.3467473082940784, "epoch/loss": 0.20583635369858655}