Instructions to use dkumar15/aria-1b-chat with libraries, inference providers, notebooks, and local apps. Follow these links to get started.

Libraries

How to use dkumar15/aria-1b-chat with Transformers:

# Use a pipeline as a high-level helper
from transformers import pipeline

# Build a text-generation pipeline backed by the dkumar15/aria-1b-chat checkpoint.
pipe = pipeline("text-generation", model="dkumar15/aria-1b-chat")
# Chat-style input: a list of messages, each a dict with "role" and "content".
messages = [
    {"role": "user", "content": "Who are you?"},
]
# Run generation on the conversation. The return value is not captured here,
# so assign it (or run this in a notebook/REPL) to inspect the output.
pipe(messages)

# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load the tokenizer and the causal-LM weights from the same model id.
tokenizer = AutoTokenizer.from_pretrained("dkumar15/aria-1b-chat")
model = AutoModelForCausalLM.from_pretrained("dkumar15/aria-1b-chat")
# Chat-style input: a list of messages, each a dict with "role" and "content".
messages = [
    {"role": "user", "content": "Who are you?"},
]
# Render the conversation through the tokenizer's chat template: append the
# generation prompt, tokenize, and return a dict of PyTorch tensors ("pt"),
# then move the result to the device the model lives on.
inputs = tokenizer.apply_chat_template(
	messages,
	add_generation_prompt=True,
	tokenize=True,
	return_dict=True,
	return_tensors="pt",
).to(model.device)

# Generate at most 40 new tokens from the prepared inputs.
outputs = model.generate(**inputs, max_new_tokens=40)
# Decode only the newly generated tokens by slicing off the prompt length.
print(tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:]))

Notebooks
Google Colab
Kaggle
Local Apps

vLLM

How to use dkumar15/aria-1b-chat with vLLM:

Install from pip and serve model

# Install vLLM from pip:
pip install vllm
# Start the vLLM server:
vllm serve "dkumar15/aria-1b-chat"
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "dkumar15/aria-1b-chat",
		"messages": [
			{
				"role": "user",
				"content": "What is the capital of France?"
			}
		]
	}'

Use Docker

docker model run hf.co/dkumar15/aria-1b-chat

SGLang

How to use dkumar15/aria-1b-chat with SGLang:

Install from pip and serve model

# Install SGLang from pip:
pip install sglang
# Start the SGLang server:
python3 -m sglang.launch_server \
    --model-path "dkumar15/aria-1b-chat" \
    --host 0.0.0.0 \
    --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "dkumar15/aria-1b-chat",
		"messages": [
			{
				"role": "user",
				"content": "What is the capital of France?"
			}
		]
	}'

Use Docker images

docker run --gpus all \
    --shm-size 32g \
    -p 30000:30000 \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=<secret>" \
    --ipc=host \
    lmsysorg/sglang:latest \
    python3 -m sglang.launch_server \
        --model-path "dkumar15/aria-1b-chat" \
        --host 0.0.0.0 \
        --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "dkumar15/aria-1b-chat",
		"messages": [
			{
				"role": "user",
				"content": "What is the capital of France?"
			}
		]
	}'

Docker Model Runner
How to use dkumar15/aria-1b-chat with Docker Model Runner:
```
docker model run hf.co/dkumar15/aria-1b-chat
```

dkumar15 committed on Mar 5

Commit

d42a1f3

verified ·

1 Parent(s): 72372ef

Upload training_code/train.py with huggingface_hub

Browse files

Files changed (1) hide show

training_code/train.py +257 -0

training_code/train.py ADDED Viewed

	@@ -0,0 +1,257 @@

+"""
+Distributed training script for 1B parameter Transformer.
+Launch: torchrun --nproc_per_node=8 train.py
+Stack: PyTorch DDP + BF16 autocast + 8x H100 80GB
+"""
+import os
+import sys
+import math
+import time
+import json
+import datetime
+import torch
+import torch.distributed as dist
+from torch.nn.parallel import DistributedDataParallel as DDP
+sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+from model.config import ModelConfig, TrainConfig
+from model.transformer import Transformer
+from model.data import get_tokenizer, create_dataloader
def get_wsd_lr(step, warmup_steps, total_steps, max_lr, min_lr):
    """Return the learning rate for ``step`` under a Warmup-Stable-Decay schedule.

    The schedule has three phases:

    1. Warmup: linear ramp from 0 towards ``max_lr`` while
       ``step < warmup_steps`` (step 0 therefore yields 0.0).
    2. Stable: constant ``max_lr`` until 80% of ``total_steps``.
    3. Decay: half-cosine from ``max_lr`` down to ``min_lr`` over the
       final 20% of ``total_steps``.

    Args:
        step: Zero-based optimizer step whose learning rate is wanted.
        warmup_steps: Number of warmup steps (0 disables warmup).
        total_steps: Total number of steps in the schedule.
        max_lr: Peak learning rate used in the stable phase.
        min_lr: Floor reached at the end of the decay phase.

    Returns:
        The learning rate for ``step``. Steps at or beyond ``total_steps``
        return ``min_lr``.
    """
    stable_end = int(total_steps * 0.8)

    # The warmup test comes first, so a warmup longer than the stable boundary
    # keeps ramping until it completes.
    if step < warmup_steps:
        return max_lr * step / max(warmup_steps, 1)
    if step < stable_end:
        return max_lr

    decay_span = max(total_steps - stable_end, 1)
    # Clamp to 1.0: without this, steps past total_steps push the cosine into
    # its next half-period and the learning rate would climb back up.
    progress = min((step - stable_end) / decay_span, 1.0)
    return min_lr + 0.5 * (max_lr - min_lr) * (1 + math.cos(math.pi * progress))
def find_latest_checkpoint(checkpoint_dir):
    """Locate the ``step_<N>.pt`` checkpoint with the highest step number.

    Only files whose name is exactly ``step_`` followed by decimal digits and
    ``.pt`` are considered. Other files that happen to match the
    ``step_*.pt`` glob (for example ``step_best.pt``) are ignored instead of
    aborting the search with a ``ValueError``.

    Args:
        checkpoint_dir: Directory to search (not recursive).

    Returns:
        ``(path, step)`` for the newest checkpoint, or ``(None, 0)`` when the
        directory holds no matching checkpoint.
    """
    import glob
    import re

    step_name = re.compile(r"step_(\d+)\.pt")
    latest_path, latest_step = None, -1
    for path in glob.glob(os.path.join(checkpoint_dir, "step_*.pt")):
        match = step_name.fullmatch(os.path.basename(path))
        if match is None:
            # Not a numbered checkpoint; skip rather than crash on int().
            continue
        step = int(match.group(1))
        if step > latest_step:
            latest_path, latest_step = path, step

    if latest_path is None:
        return None, 0
    return latest_path, latest_step
def main():
    """Train the Transformer with DDP, resuming from the newest checkpoint if any.

    Flow:
      1. Join the NCCL process group and bind this process to its local GPU
         (``RANK`` / ``LOCAL_RANK`` / ``WORLD_SIZE`` are read from the
         environment, as set by the launcher).
      2. Derive the step budget:
         ``total_tokens // (batch_size_per_gpu * world_size *
         gradient_accumulation_steps * max_seq_len)``.
      3. Build the model, wrap it in DDP, and create a fused AdamW optimizer
         with weight decay applied only to parameters with ``dim() >= 2``.
      4. Restore model and optimizer state from the latest ``step_*.pt`` file
         in the checkpoint directory, if one exists.
      5. Run the training loop: gradient accumulation under BF16 autocast,
         gradient clipping, WSD learning-rate schedule, periodic logging
         (rank 0 only) and periodic checkpointing (rank 0, fenced by barriers).
      6. Write ``final.pt`` plus a summary, then destroy the process group.
    """
    from contextlib import nullcontext

    # ----- Distributed setup -----
    dist.init_process_group("nccl", timeout=datetime.timedelta(minutes=30))
    rank = int(os.environ.get("RANK", 0))
    local_rank = int(os.environ.get("LOCAL_RANK", 0))
    world_size = int(os.environ.get("WORLD_SIZE", 1))
    torch.cuda.set_device(local_rank)
    device = torch.device(f"cuda:{local_rank}")
    is_main = rank == 0

    # ----- Configuration and step budget -----
    model_config = ModelConfig()
    train_config = TrainConfig()
    accum_steps = train_config.gradient_accumulation_steps
    eff_batch = train_config.batch_size_per_gpu * world_size * accum_steps
    tokens_per_step = eff_batch * model_config.max_seq_len
    total_steps = train_config.total_tokens // tokens_per_step

    if is_main:
        os.makedirs(train_config.log_dir, exist_ok=True)
        os.makedirs(train_config.checkpoint_dir, exist_ok=True)
        print("=" * 70)
        print("  TRAINING 1B TRANSFORMER FROM SCRATCH")
        print(f"  Arch: {model_config.num_layers}L / {model_config.hidden_dim}D / "
              f"{model_config.num_attention_heads}H / GQA-{model_config.num_kv_heads}KV / "
              f"SwiGLU-{model_config.intermediate_dim}")
        print(f"  Seq: {model_config.max_seq_len} | Vocab: {model_config.vocab_size}")
        print(f"  GPUs: {world_size}x H100 80GB | Backend: DDP + BF16 autocast")
        print(f"  Batch: {eff_batch} seqs = {tokens_per_step:,} tok/step")
        print(f"  Steps: {total_steps:,} | Target: {train_config.total_tokens:,} tokens")
        print("=" * 70)

    # ----- Tokenizer -----
    tokenizer = get_tokenizer()

    # ----- Model -----
    # Seed before construction so every rank builds the same initial weights.
    torch.manual_seed(train_config.seed)
    model = Transformer(model_config).to(device)
    if is_main:
        n_params = sum(p.numel() for p in model.parameters())
        print(f"[Init] Params: {n_params:,} ({n_params/1e9:.3f}B)")
    model = DDP(model, device_ids=[local_rank])

    # ----- Optimizer -----
    # Tensors with 2+ dims get weight decay; 1-D tensors are exempt.
    trainable = [p for p in model.parameters() if p.requires_grad]
    decay_params = [p for p in trainable if p.dim() >= 2]
    nodecay_params = [p for p in trainable if p.dim() < 2]
    optimizer = torch.optim.AdamW([
        {"params": decay_params, "weight_decay": train_config.weight_decay},
        {"params": nodecay_params, "weight_decay": 0.0},
    ], lr=train_config.learning_rate, betas=(train_config.beta1, train_config.beta2), fused=True)
    if is_main:
        dp = sum(p.numel() for p in decay_params)
        ndp = sum(p.numel() for p in nodecay_params)
        print(f"[Init] Optimizer: {dp:,} decay + {ndp:,} no-decay params")

    # ----- Resume from checkpoint -----
    resume_step = 0
    ckpt_path, ckpt_step = find_latest_checkpoint(train_config.checkpoint_dir)
    if ckpt_path is not None:
        if is_main:
            print(f"[Resume] Loading checkpoint: {ckpt_path} (step {ckpt_step})")
        # NOTE(security): weights_only=False unpickles arbitrary objects. Only
        # load checkpoints written by this script / from a trusted directory.
        ckpt = torch.load(ckpt_path, map_location=device, weights_only=False)
        model.module.load_state_dict(ckpt["model"])
        optimizer.load_state_dict(ckpt["optimizer"])
        resume_step = ckpt["step"]
        if is_main:
            print(f"[Resume] Restored model + optimizer at step {resume_step}, "
                  f"loss was {ckpt.get('loss', 'N/A')}")
        # Drop the loaded copy before training allocates activations.
        del ckpt
        torch.cuda.empty_cache()
    elif is_main:
        print("[Init] No checkpoint found, starting from scratch")

    # ----- Data -----
    # Use (seed + resume_step) so resumed runs see different shuffled data.
    effective_seed = train_config.seed + resume_step
    dataloader = create_dataloader(tokenizer, train_config, rank=rank, world_size=world_size,
                                   seed_override=effective_seed)
    data_iter = iter(dataloader)
    if is_main:
        print("[Init] Dataloader ready (streaming FineWeb-Edu 10BT)")
        print(f"[Schedule] WSD: warmup {train_config.warmup_steps} -> "
              f"stable {int(total_steps*0.8)} -> decay {total_steps}")
        if resume_step > 0:
            remaining = total_steps - resume_step
            print(f"[Resume] Continuing from step {resume_step}, {remaining:,} steps remaining")
        print("-" * 70)
        sys.stdout.flush()

    # ===== TRAINING LOOP =====
    model.train()
    global_step = resume_step
    running_loss = 0.0
    best_loss = float("inf")
    resumed_tokens = resume_step * tokens_per_step
    tokens_done = resumed_tokens
    t0 = time.time()
    step_t0 = time.time()
    log_file = open(os.path.join(train_config.log_dir, "train_log.jsonl"), "a") if is_main else None
    try:
        while global_step < total_steps:
            optimizer.zero_grad(set_to_none=True)
            micro_loss = 0.0
            for micro in range(accum_steps):
                try:
                    input_ids, labels = next(data_iter)
                except StopIteration:
                    # Data exhausted: restart the loader and keep going.
                    data_iter = iter(dataloader)
                    input_ids, labels = next(data_iter)
                input_ids = input_ids.to(device, non_blocking=True)
                labels = labels.to(device, non_blocking=True)

                # Only the last micro-batch needs the gradient all-reduce;
                # earlier ones accumulate locally under no_sync().
                is_last_micro = micro == accum_steps - 1
                sync_ctx = nullcontext() if is_last_micro else model.no_sync()
                with sync_ctx:
                    # BF16 autocast: no scaler needed (BF16 has enough dynamic range).
                    with torch.autocast(device_type="cuda", dtype=torch.bfloat16):
                        _, loss = model(input_ids, labels)
                        # Scale so the accumulated gradient is the micro-batch mean.
                        loss = loss / accum_steps
                    loss.backward()
                micro_loss += loss.item()

            # Gradient clipping
            torch.nn.utils.clip_grad_norm_(model.parameters(), train_config.grad_clip)

            # LR schedule (set manually on every param group before stepping)
            lr = get_wsd_lr(global_step, train_config.warmup_steps, total_steps,
                            train_config.learning_rate, train_config.min_lr)
            for pg in optimizer.param_groups:
                pg["lr"] = lr
            optimizer.step()

            global_step += 1
            running_loss += micro_loss
            tokens_done += tokens_per_step

            # Log
            if global_step % train_config.log_interval == 0:
                dt = time.time() - step_t0
                tps = (train_config.log_interval * tokens_per_step) / max(dt, 1e-9)
                avg = running_loss / train_config.log_interval
                elapsed = time.time() - t0
                pct = 100.0 * global_step / total_steps
                # `elapsed` covers only this session, so divide by the steps
                # run in this session; otherwise a resumed run under-reports ETA.
                session_steps = global_step - resume_step
                eta = (elapsed / max(session_steps, 1)) * (total_steps - global_step)
                if is_main:
                    gpu_mem = torch.cuda.max_memory_allocated(device) / 1e9
                    print(
                        f"[Step {global_step:>6d}/{total_steps}] "
                        f"loss={avg:.4f} | lr={lr:.2e} | "
                        f"tok/s={tps:,.0f} | GPU={gpu_mem:.1f}GB | "
                        f"{pct:.1f}% | ETA={eta/3600:.1f}h",
                        flush=True,
                    )
                    if log_file:
                        log_file.write(json.dumps({
                            "step": global_step, "loss": round(avg, 4), "lr": lr,
                            "tps": round(tps), "tokens": tokens_done,
                            "gpu_gb": round(gpu_mem, 1), "elapsed_s": round(elapsed, 1),
                        }) + "\n")
                        log_file.flush()
                if avg < best_loss:
                    best_loss = avg
                running_loss = 0.0
                step_t0 = time.time()

            # Checkpoint
            if global_step % train_config.save_interval == 0:
                dist.barrier()
                if is_main:
                    save_path = os.path.join(train_config.checkpoint_dir, f"step_{global_step}.pt")
                    torch.save({
                        "step": global_step,
                        "model": model.module.state_dict(),
                        "optimizer": optimizer.state_dict(),
                        # `avg` is only fresh on a logging step; otherwise fall
                        # back to this step's accumulated loss.
                        "loss": avg if global_step % train_config.log_interval == 0 else micro_loss,
                        "config": {"model": model_config.__dict__, "train": train_config.__dict__},
                    }, save_path)
                    print(f"  >> Checkpoint: {save_path}", flush=True)
                dist.barrier()

        # Final
        dist.barrier()
        if is_main:
            final_path = os.path.join(train_config.checkpoint_dir, "final.pt")
            torch.save({
                "step": global_step,
                "model": model.module.state_dict(),
                "config": {"model": model_config.__dict__, "train": train_config.__dict__},
            }, final_path)
            total_time = time.time() - t0
            # Throughput counts only tokens processed in this session, matching
            # the wall time it is divided by.
            session_tokens = tokens_done - resumed_tokens
            throughput = session_tokens / max(total_time, 1e-9)
            print("=" * 70)
            print("  TRAINING COMPLETE")
            print(f"  Steps: {global_step:,} | Tokens: {tokens_done:,}")
            print(f"  Time: {total_time/3600:.2f}h | Throughput: {throughput:,.0f} tok/s")
            print(f"  Best loss: {best_loss:.4f}")
            print(f"  Final model: {final_path}")
            print("=" * 70)
    finally:
        # Close the log even if training raised, so buffered lines are not lost.
        if log_file is not None:
            log_file.close()
    dist.destroy_process_group()


if __name__ == "__main__":
    main()