Omk07
/

CyberLLM-350M

+# ============================================================================
+# CyberLLM — 350M Parameter Configuration (Stretch Goal / v0.5 Target)
+# Architecture: LLaMA-3 Style (Decoder-Only Transformer)
+# ============================================================================
+# Transformer architecture hyperparameters for the "cyberllm-350m" model.
+model:
+  # --- Identity ---
+  name: "cyberllm-350m"
+  architecture: "llama"
+  # --- Width, depth, feed-forward ---
+  vocab_size: 32000
+  hidden_size: 1024
+  num_layers: 24
+  intermediate_size: 2816          # SwiGLU FFN dim
+  hidden_act: "silu"
+  # --- Attention ---
+  num_attention_heads: 16
+  num_kv_heads: 4                  # GQA ratio: 16 query heads / 4 KV heads = 4
+  head_dim: 64                     # hidden_size / num_attention_heads = 1024 / 16
+  attention_dropout: 0.0
+  # --- Positional encoding ---
+  position_encoding: "rope"
+  rope_theta: 10000.0
+  rope_scaling: null
+  max_position_embeddings: 4096
+  # --- Normalization ---
+  norm_type: "rmsnorm"
+  rms_norm_eps: 1.0e-5
+  # --- Embeddings, regularization, initialization ---
+  tie_word_embeddings: true
+  hidden_dropout: 0.0
+  initializer_range: 0.02
+# Optimizer, schedule, batching, checkpointing, and data-mix settings.
+training:
+  # Floats below keep an explicit decimal point and a signed exponent so that
+  # YAML 1.1 loaders (e.g. PyYAML) resolve them as numbers, not strings.
+  optimizer: "adamw"
+  learning_rate: 3.0e-4
+  min_learning_rate: 3.0e-5
+  weight_decay: 0.1
+  adam_beta1: 0.9
+  adam_beta2: 0.95
+  adam_epsilon: 1.0e-8
+  max_grad_norm: 1.0
+  lr_scheduler: "cosine"
+  warmup_steps: 500
+  # Written without `_` digit separators: `5_000_000_000` is an integer only
+  # under YAML 1.1; YAML 1.2 core-schema loaders read it as a string.
+  total_tokens: 5000000000         # 5B tokens (~2 epochs)
+  micro_batch_size: 2              # Fits A40 48GB
+  gradient_accumulation_steps: 64  # Effective batch = 2 * 64 * 2048 = 262,144 tokens/step
+  sequence_length: 2048            # 2048 for A40 memory
+  mixed_precision: "bf16"
+  # Checkpoint / evaluation / logging cadence, in optimizer steps.
+  save_interval_steps: 500
+  eval_interval_steps: 250
+  log_interval_steps: 10
+  keep_last_n_checkpoints: 3
+  # Per-stage data mixture; the three fractions in each stage sum to 1.0.
+  # NOTE(review): stage boundaries are not defined in this file — confirm
+  # where the trainer switches from one stage to the next.
+  data_mix:
+    stage_1:
+      general_text: 0.60
+      security_text: 0.30
+      code: 0.10
+    stage_2:
+      general_text: 0.30
+      security_text: 0.55
+      code: 0.15
+    stage_3:
+      general_text: 0.15
+      security_text: 0.70
+      code: 0.15
+# Hardware target, run estimates, and local test settings.
+infrastructure:
+  # --- Local test settings ---
+  local_device: "mps"
+  local_test_batch_size: 1
+  local_test_seq_length: 128
+  # --- Training hardware ---
+  target_gpu: "a40_48gb"
+  num_gpus: 1
+  # --- Run estimates ---
+  # NOTE(review): 50 h for the 5B-token budget corresponds to ~27.8K tok/s
+  # sustained; 10 USD over 50 h implies ~0.20 USD per GPU-hour — confirm both.
+  estimated_time_hours: 50
+  estimated_cost_usd: 10
+# Expected parameter count: ~303M
+# Tokens per step: 2 * 64 * 2048 = 262,144
+# Total steps: ~19,073
+# A40 throughput: ~20-30K tok/s → ~46-69 hours for 5B tokens (~50 h needs ~28K tok/s)
+# A100 throughput: ~50-70K tok/s → ~20-28 hours for 5B tokens