Chennnnn committed on
Commit
2f59567
·
1 Parent(s): 73734a5

Initial model upload

Browse files
Files changed (3) hide show
  1. .gitattributes +2 -0
  2. checkpoint_step_30000.pt +3 -0
  3. config.py +87 -0
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ *.gguf filter=lfs diff=lfs merge=lfs -text
37
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
checkpoint_step_30000.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2bcd8e66d5157c6f1efac76c6acb83394fb3d276c2e716fecbde0f61413df9e9
3
+ size 4865779390
config.py ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
from dataclasses import dataclass


@dataclass
class TrainConfig:
    """Hyperparameters for an ALBERT-style shared-block MoE transformer.

    All fields are plain defaults; override per run with keyword arguments,
    e.g. ``TrainConfig(batch_size=8, max_steps=100000)``.
    """

    # --- Model core ---
    vocab_size: int = 128010       # Llama3 tokenizer vocabulary size (incl. all special tokens)
    embedding_dim: int = 1024      # factorized embedding dim (can be < d_model)
    d_model: int = 1024
    num_heads: int = 32
    num_layers: int = 24
    max_seq_len: int = 1024

    # --- Parameter sharing (ALBERT-style) ---
    share_transformer_block: bool = True   # share attention/FFN weights across layers
    share_layernorms: bool = False         # each layer keeps its own RMSNorm

    # --- Positional encoding (RoPE) ---
    rope_base: float = 10000.0
    rope_scale: float = 1.0

    # --- Attention specifics ---
    use_flash_attention: bool = True
    qk_norm: bool = True                   # L2-normalize q and k before the dot product

    # --- FFN / MoE ---
    ffn_intermediate_dim: int = 4096       # shared-expert intermediate dim
    moe_num_experts: int = 64
    moe_top_k: int = 2                     # experts routed per token
    moe_shared_expert: bool = True         # always include the shared expert
    moe_expert_hidden_dim: int = 1024      # expert intermediate dim
    moe_router_dropout: float = 0.1
    moe_router_temperature: float = 1.0
    # Aux-free load balancing (Expert-Choice style router).
    # NOTE(review): the original file declared moe_capacity_factor twice
    # (1.0, then 1.25); the later annotation wins in a dataclass, so the
    # effective default 1.25 is kept as the single declaration here.
    aux_free_balance: bool = True
    moe_capacity_factor: float = 1.25      # capacity factor for aux-free routing
    # Aux-loss coefficients are zeroed because aux_free_balance is enabled.
    moe_router_zloss_coef: float = 0.0
    moe_load_balance_coef: float = 0.0

    # --- MoQ for the query projection (mirrors the MoE settings) ---
    moq_num_experts: int = 64
    moq_top_k: int = 2                     # experts routed per query token
    moq_shared_expert: bool = True
    moq_expert_hidden_dim: int = 1024      # MoQ hidden dimension
    moq_router_temperature: float = 1.0

    # --- Training ---
    learning_rate: float = 3e-4
    weight_decay: float = 0.01
    betas: tuple = (0.9, 0.95)             # AdamW betas
    eps: float = 1e-8
    batch_size: int = 4                    # reduced for max_seq_len=1024 to prevent OOM
    grad_accum_steps: int = 8              # keeps effective batch ~32 sequences per step
    max_steps: int = 50000
    warmup_steps: int = 10
    clip_grad_norm: float = 1.0
    mixed_precision: bool = True           # enable for GPU training

    # --- Checkpointing / logging ---
    log_interval: int = 50
    eval_interval: int = 5000
    save_interval: int = 10000             # save a checkpoint every 10,000 steps
    output_dir: str = "./outputs"

    # --- Memory optimization ---
    gradient_checkpointing: bool = True
    cpu_offload: bool = False
    flash_attention: bool = True

    # --- Expert loading optimization (sized for an RTX 4090) ---
    max_loaded_experts: int = 40
    expert_cache_strategy: str = "lru"     # LRU eviction for the expert cache
    expert_preload_threshold: int = 5      # preload threshold for smart caching

    # --- Early stopping ---
    early_stopping_patience: int = 100
    early_stopping_min_delta: float = 0.0001
    early_stopping_monitor: str = "loss"

    # --- Misc ---
    seed: int = 42
    device: str = "cuda"
    dtype: str = "bfloat16"                # autocast target: float16 | bfloat16 | float32