Chennnnn
/

ChenZuoLM

Model card Files Files and versions

xet

Community

Chennnnn committed on Aug 31, 2025

Commit

2f59567

1 Parent(s): 73734a5

Initial model upload

Browse files

Files changed (3) hide show

.gitattributes +2 -0
checkpoint_step_30000.pt +3 -0
config.py +87 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.gguf filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text

checkpoint_step_30000.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2bcd8e66d5157c6f1efac76c6acb83394fb3d276c2e716fecbde0f61413df9e9
+size 4865779390

config.py ADDED Viewed

	@@ -0,0 +1,87 @@

from dataclasses import dataclass


@dataclass
class TrainConfig:
    """Default hyperparameters and runtime settings for a training run.

    Every field has a default, so ``TrainConfig()`` yields a complete
    configuration; override individual values by keyword, e.g.
    ``TrainConfig(batch_size=8)``.
    """

    # --- Model core ---
    vocab_size: int = 128010  # Llama3 tokenizer vocabulary size (including all special tokens)
    embedding_dim: int = 1024  # factorized embedding dim (can be < d_model)
    d_model: int = 1024
    num_heads: int = 32
    num_layers: int = 24  # previously 12
    max_seq_len: int = 1024

    # --- Parameter sharing (ALBERT-style) ---
    share_transformer_block: bool = True  # share attention/FFN across layers
    share_layernorms: bool = False  # each layer keeps its own RMSNorm

    # --- Positional encoding (RoPE) ---
    rope_base: float = 10000.0
    rope_scale: float = 1.0

    # --- Attention specifics ---
    # NOTE(review): `flash_attention` under "Memory optimization" below looks
    # like a second switch for the same feature -- confirm which one is read.
    use_flash_attention: bool = True
    qk_norm: bool = True  # L2-normalize q and k before the dot product

    # --- FFN / MoE ---
    ffn_intermediate_dim: int = 4096  # shared expert intermediate dim
    moe_num_experts: int = 64  # reduced from 128
    # NOTE(review): the previous comment claimed this was raised from 2 to 3,
    # but the value is 2 -- confirm the intended top-k.
    moe_top_k: int = 2
    moe_shared_expert: bool = True  # always include shared expert
    moe_expert_hidden_dim: int = 1024  # expert intermediate dim
    moe_router_dropout: float = 0.1
    # This field used to be declared twice (1.0 here, then 1.25 in the
    # aux-free section); the later value won, so 1.25 is the effective
    # default. It is the capacity factor for aux-free routing.
    moe_capacity_factor: float = 1.25
    moe_router_temperature: float = 1.0

    # --- Aux-free load balancing ---
    aux_free_balance: bool = True  # use auxiliary-loss-free router (Expert-Choice style)
    # Aux coefficients are set to 0 when aux_free_balance is enabled.
    moe_router_zloss_coef: float = 0.0
    moe_load_balance_coef: float = 0.0

    # --- MoQ for query projection (mirrors MoE settings) ---
    moq_num_experts: int = 64  # reduced from 128
    # NOTE(review): the previous comment claimed this was raised from 2 to 3,
    # but the value is 2 -- confirm the intended top-k.
    moq_top_k: int = 2
    moq_shared_expert: bool = True
    moq_expert_hidden_dim: int = 1024  # MoQ hidden dimension
    moq_router_temperature: float = 1.0

    # --- Training ---
    learning_rate: float = 3e-4
    weight_decay: float = 0.01
    betas: tuple = (0.9, 0.95)
    eps: float = 1e-8
    batch_size: int = 4  # reduced for max_seq_len=1024 to prevent OOM
    # batch_size * grad_accum_steps = 32 samples per optimizer step
    # (the previous comment said "tokens"; presumably sequences -- confirm).
    grad_accum_steps: int = 8
    max_steps: int = 50000
    warmup_steps: int = 10
    clip_grad_norm: float = 1.0
    mixed_precision: bool = True  # enable for GPU training

    # --- Checkpointing / logging ---
    log_interval: int = 50
    eval_interval: int = 5000
    save_interval: int = 10000  # save the model every 10,000 steps
    output_dir: str = "./outputs"

    # --- Memory optimization ---
    gradient_checkpointing: bool = True
    cpu_offload: bool = False
    flash_attention: bool = True  # see NOTE(review) on use_flash_attention

    # --- Expert loading optimization for RTX 4090 ---
    max_loaded_experts: int = 40  # sized for RTX 4090 memory
    expert_cache_strategy: str = "lru"  # LRU cache strategy
    expert_preload_threshold: int = 5  # preload threshold for smart caching

    # --- Early stopping ---
    early_stopping_patience: int = 100
    early_stopping_min_delta: float = 0.0001
    early_stopping_monitor: str = "loss"

    # --- Misc ---
    seed: int = 42
    device: str = "cuda"
    dtype: str = "bfloat16"  # autocast target (choices: float16|bfloat16|float32)