Omk07 committed on
Commit
5e143aa
·
verified ·
1 Parent(s): bb7d82e

Upload config.yaml with huggingface_hub

Browse files
Files changed (1) hide show
  1. config.yaml +90 -0
config.yaml ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ============================================================================
2
+ # CyberLLM — 350M Parameter Configuration (Stretch Goal / v0.5 Target)
3
+ # Architecture: LLaMA-3 Style (Decoder-Only Transformer)
4
+ # ============================================================================
5
+
6
+ model:
7
+ name: "cyberllm-350m"
8
+ architecture: "llama"
9
+
10
+ vocab_size: 32000
11
+ hidden_size: 1024
12
+ num_layers: 24
13
+ max_position_embeddings: 4096
14
+
15
+ num_attention_heads: 16
16
+ num_kv_heads: 4 # GQA ratio: 16/4 = 4
17
+ head_dim: 64
18
+
19
+ intermediate_size: 2816 # SwiGLU FFN dim
20
+ hidden_act: "silu"
21
+
22
+ norm_type: "rmsnorm"
23
+ rms_norm_eps: 1.0e-5
24
+
25
+ position_encoding: "rope"
26
+ rope_theta: 10000.0
27
+ rope_scaling: null
28
+
29
+ tie_word_embeddings: true
30
+ attention_dropout: 0.0
31
+ hidden_dropout: 0.0
32
+ initializer_range: 0.02
33
+
34
+
35
+ training:
36
+ optimizer: "adamw"
37
+ learning_rate: 3.0e-4
38
+ min_learning_rate: 3.0e-5
39
+ weight_decay: 0.1
40
+ adam_beta1: 0.9
41
+ adam_beta2: 0.95
42
+ adam_epsilon: 1.0e-8
43
+ max_grad_norm: 1.0
44
+
45
+ lr_scheduler: "cosine"
46
+ warmup_steps: 500
47
+ total_tokens: 5_000_000_000 # 5B tokens (~2 epochs)
48
+
49
+ micro_batch_size: 2 # Fits A40 48GB
50
+ gradient_accumulation_steps: 64 # Effective batch = 262K tokens/step
51
+ sequence_length: 2048 # 2048 for A40 memory
52
+
53
+ mixed_precision: "bf16"
54
+
55
+ save_interval_steps: 500
56
+ eval_interval_steps: 250
57
+ log_interval_steps: 10
58
+ keep_last_n_checkpoints: 3
59
+
60
+ data_mix:
61
+ stage_1:
62
+ general_text: 0.60
63
+ security_text: 0.30
64
+ code: 0.10
65
+ stage_2:
66
+ general_text: 0.30
67
+ security_text: 0.55
68
+ code: 0.15
69
+ stage_3:
70
+ general_text: 0.15
71
+ security_text: 0.70
72
+ code: 0.15
73
+
74
+
75
+ infrastructure:
76
+ target_gpu: "a40_48gb"
77
+ num_gpus: 1
78
+ estimated_time_hours: 50
79
+ estimated_cost_usd: 10
80
+
81
+ local_device: "mps"
82
+ local_test_batch_size: 1
83
+ local_test_seq_length: 128
84
+
85
+
86
+ # Expected parameter count: ~303M with tied embeddings (below the nominal 350M in the config name)
87
+ # Tokens per step: 2 * 64 * 2048 = 262,144
88
+ # Total steps: ~19,073
89
+ # A40 throughput: ~20-30K tok/s → ~46-69 hours for 5B tokens (~50 hours near the upper end)
90
+ # A100 throughput: ~50-70K tok/s → ~20-28 hours for 5B tokens