#!/usr/bin/env python3
"""
Configuration for DevOps-Engineer-SLM: A Role-Based SLM for DevOps Engineer.

~1B params, LLaMA-style architecture with RoPE — supports up to 1M token context.
"""
from dataclasses import dataclass, field
from pathlib import Path
from typing import Optional


@dataclass
class SLMConfig:
    """All hyperparameters and paths in one place.

    Path fields left as ``None`` are derived from ``project_dir`` in
    ``__post_init__`` (which also creates the directories), and
    ``device="auto"`` is resolved there to ``"cuda"``/``"mps"``/``"cpu"``.
    """

    # ── Project paths ──────────────────────────────────────────────
    project_dir: Path = Path(__file__).resolve().parent
    # The three below default to subdirectories of project_dir; see
    # __post_init__.  Annotated Optional because None is a valid default.
    data_dir: Optional[Path] = None
    tokenizer_dir: Optional[Path] = None
    checkpoint_dir: Optional[Path] = None

    # ── Domain ─────────────────────────────────────────────────────
    domain_name: str = "DevOps Engineer"
    domain_slug: str = "devops_engineer"
    tokenizer_filename: str = "devops_engineer_tokenizer.json"

    # ── Tokenizer ──────────────────────────────────────────────────
    vocab_size: int = 32_768
    min_frequency: int = 2  # minimum token frequency for BPE merges
    special_tokens: list[str] = field(
        default_factory=lambda: [
            # NOTE(review): the original list began with four EMPTY strings —
            # almost certainly angle-bracket control tokens stripped by HTML
            # escaping.  Reconstructed as the standard LLaMA-style set below;
            # confirm against the tokenizer training script.
            "<unk>",
            "<s>",
            "</s>",
            "<pad>",
            "<|system|>",
            "<|user|>",
            "<|assistant|>",
        ]
    )

    # ── Model (~1B params, LLaMA-style with RoPE) ──────────────────
    n_layer: int = 32
    n_head: int = 20
    n_embd: int = 1600          # head_dim = 1600 / 20 = 80
    block_size: int = 512       # training context window per sample
    dropout: float = 0.05
    bias: bool = False
    ffn_multiplier: float = 2.667  # SwiGLU-style hidden-size multiplier

    # ── RoPE ───────────────────────────────────────────────────────
    # NOTE(review): 100B positions and theta=5e10 contradict the "1M token
    # context" claim in the module docstring and max_new_tokens below —
    # values preserved as-is, but verify the intended context length.
    max_position_embeddings: int = 100_000_000_000  # 100B tokens via RoPE
    rope_theta: float = 50_000_000_000.0  # scaled for the context above

    # ── Sliding Window ─────────────────────────────────────────────
    sliding_window: Optional[int] = None  # None = full attention

    # ── Gradient Checkpointing (essential for 1B on 24GB) ──────────
    gradient_checkpointing: bool = True

    # ── Training ───────────────────────────────────────────────────
    batch_size: int = 1
    gradient_accumulation_steps: int = 16  # effective batch = 16
    learning_rate: float = 2e-4
    weight_decay: float = 0.1
    max_epochs: int = 3
    dataset_stride: int = 512  # stride when chunking the corpus
    warmup_steps: int = 100
    grad_clip: float = 1.0
    eval_interval: int = 50
    eval_samples: int = 10
    log_interval: int = 10
    device: str = "auto"  # resolved to a concrete backend in __post_init__

    # ── Generation ─────────────────────────────────────────────────
    max_new_tokens: int = 1_000_000  # 1M output tokens
    temperature: float = 0.8
    top_k: int = 50
    top_p: float = 0.9

    # ── HuggingFace ────────────────────────────────────────────────
    hf_repo_name: str = "devops-engineer-slm-1m"
    hf_model_card_tags: list[str] = field(
        default_factory=lambda: [
            "devops",
            "cicd",
            "docker",
            "kubernetes",
            "infrastructure",
            "slm",
            "llama-style",
            "rope",
            "1m-context",
            "from-scratch",
            "1b-params",
        ]
    )

    def __post_init__(self):
        """Derive unset paths, create directories, and resolve the device.

        Side effects: creates data/tokenizer/checkpoint directories on disk;
        imports torch lazily (only when device == "auto").
        """
        if self.data_dir is None:
            self.data_dir = self.project_dir / "data"
        if self.tokenizer_dir is None:
            self.tokenizer_dir = self.project_dir / "tokenizer"
        if self.checkpoint_dir is None:
            self.checkpoint_dir = self.project_dir / "checkpoints"
        self.data_dir.mkdir(parents=True, exist_ok=True)
        self.tokenizer_dir.mkdir(parents=True, exist_ok=True)
        self.checkpoint_dir.mkdir(parents=True, exist_ok=True)
        if self.device == "auto":
            # Lazy import so merely loading this config doesn't require torch
            # unless device auto-detection is actually needed.
            import torch
            if torch.cuda.is_available():
                self.device = "cuda"
            elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
                self.device = "mps"
            else:
                self.device = "cpu"


cfg = SLMConfig()