#!/usr/bin/env python3
"""
Configuration for DevOps-Engineer-SLM: A Role-Based SLM for DevOps Engineer.
~1B params, LLaMA-style architecture with RoPE β supports up to 5M token context.
"""
from dataclasses import dataclass, field
from pathlib import Path
from typing import Optional
@dataclass
class SLMConfig:
    """All hyperparameters and paths in one place.

    Instantiating the config derives any unset directory paths from
    ``project_dir`` and creates them on disk (see ``__post_init__``).
    """

    # -- Project paths ---------------------------------------------------
    project_dir: Path = Path(__file__).resolve().parent
    # The three directories below default to None and are filled in from
    # project_dir in __post_init__ (data/, tokenizer/, checkpoints/).
    data_dir: Optional[Path] = None
    tokenizer_dir: Optional[Path] = None
    checkpoint_dir: Optional[Path] = None

    # -- Domain ----------------------------------------------------------
    domain_name: str = "DevOps Engineer"
    domain_slug: str = "devops_engineer"
    tokenizer_filename: str = "devops_engineer_tokenizer.json"

    # -- Tokenizer -------------------------------------------------------
    vocab_size: int = 32_768
    min_frequency: int = 2  # presumably the BPE min-merge frequency — confirm in trainer
    special_tokens: list = field(
        default_factory=lambda: [
            "<pad>", "<unk>", "<bos>", "<eos>",
            "<|system|>", "<|user|>", "<|assistant|>",
        ]
    )

    # -- Model (~1B params, LLaMA-style with RoPE) -----------------------
    n_layer: int = 32
    n_head: int = 20
    n_embd: int = 1600              # divisible by n_head (1600 / 20 = 80 per head)
    block_size: int = 1_000_000     # 1M input token context window
    dropout: float = 0.05
    bias: bool = False
    ffn_multiplier: float = 2.667   # ~8/3, the usual LLaMA/SwiGLU hidden-size ratio

    # -- RoPE ------------------------------------------------------------
    # NOTE(review): block_size (1M) and max_position_embeddings (5M)
    # disagree — confirm which one the model code treats as the hard limit.
    max_position_embeddings: int = 5_000_000  # 5M context window via RoPE
    rope_theta: float = 5_000_000.0           # Scaled for 5M context window

    # -- Sliding Window --------------------------------------------------
    sliding_window: Optional[int] = None      # None presumably means full attention

    # -- Gradient Checkpointing (essential for 1B on 24GB) ---------------
    gradient_checkpointing: bool = True

    # -- Training --------------------------------------------------------
    batch_size: int = 1
    gradient_accumulation_steps: int = 16     # effective batch size = 16
    learning_rate: float = 2e-4
    weight_decay: float = 0.1
    max_epochs: int = 3
    dataset_stride: int = 512                 # Training stride
    warmup_steps: int = 100
    grad_clip: float = 1.0
    eval_interval: int = 50
    eval_samples: int = 10
    log_interval: int = 10
    device: str = "auto"                      # "auto" is resolved in __post_init__

    # -- Generation ------------------------------------------------------
    max_new_tokens: int = 5_000_000           # 5M max output tokens
    temperature: float = 0.8
    top_k: int = 50
    top_p: float = 0.9

    # -- HuggingFace -----------------------------------------------------
    hf_repo_name: str = "devops-engineer-slm-5m"
    hf_model_card_tags: list = field(
        default_factory=lambda: [
            'devops', 'cicd', 'docker', 'kubernetes', 'infrastructure',
            'slm', 'llama-style', 'rope', '5m-context', 'from-scratch',
            '1b-params',
        ]
    )

    def __post_init__(self) -> None:
        """Derive unset paths, create the directories, and resolve the device.

        Side effects: creates data/tokenizer/checkpoint directories on disk;
        imports torch only when device resolution is actually needed.
        """
        if self.data_dir is None:
            self.data_dir = self.project_dir / "data"
        if self.tokenizer_dir is None:
            self.tokenizer_dir = self.project_dir / "tokenizer"
        if self.checkpoint_dir is None:
            self.checkpoint_dir = self.project_dir / "checkpoints"
        self.data_dir.mkdir(parents=True, exist_ok=True)
        self.tokenizer_dir.mkdir(parents=True, exist_ok=True)
        self.checkpoint_dir.mkdir(parents=True, exist_ok=True)
        if self.device == "auto":
            # Lazy import: an explicit device keeps the config usable
            # without torch installed.
            import torch
            if torch.cuda.is_available():
                self.device = "cuda"
            elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
                self.device = "mps"
            else:
                self.device = "cpu"
# Module-level shared config instance. Note: importing this module therefore
# runs __post_init__ — it creates the data/tokenizer/checkpoint directories
# and, with device="auto", imports torch to pick cuda/mps/cpu.
cfg = SLMConfig()