File size: 2,932 Bytes
97e312a
84f0b80
f45427d
97e312a
f45427d
 
 
 
 
 
 
 
b79954f
97e312a
f45427d
 
 
 
 
 
 
 
b79954f
97e312a
f45427d
 
 
 
 
 
 
 
b79954f
97e312a
f45427d
 
 
 
 
 
 
 
b79954f
97e312a
f45427d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b79954f
84f0b80
 
 
 
 
 
b79954f
f45427d
 
 
 
 
84f0b80
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
from state import Model

# Gemma 3 270M — https://huggingface.co/google/gemma-3-270m/blob/main/config.json
GEMMA3_270M = Model(
    num_layers=18,
    hidden_dim=640,
    intermediate_size=2048,
    vocab_size=262144,
    # Gemma models tie the input embedding and output (lm_head) weights.
    weight_tied_embeddings=True,
    # Dense model: the expert fields degenerate to a single always-active expert.
    is_moe=False,
    active_experts=1,
    total_experts=1,
)
# Gemma 3 1B — https://huggingface.co/google/gemma-3-1b-it/blob/main/config.json
GEMMA3_1B = Model(
    num_layers=26,
    hidden_dim=1152,
    intermediate_size=6912,
    vocab_size=262144,
    # Gemma models tie the input embedding and output (lm_head) weights.
    weight_tied_embeddings=True,
    # Dense model: the expert fields degenerate to a single always-active expert.
    is_moe=False,
    active_experts=1,
    total_experts=1,
)
# Gemma 3 4B — https://huggingface.co/google/gemma-3-4b-it/blob/main/config.json
GEMMA3_4B = Model(
    num_layers=34,
    hidden_dim=2560,
    intermediate_size=10240,
    vocab_size=262144,
    # Gemma models tie the input embedding and output (lm_head) weights.
    weight_tied_embeddings=True,
    # Dense model: the expert fields degenerate to a single always-active expert.
    is_moe=False,
    active_experts=1,
    total_experts=1,
)
# Gemma 3 12B — https://huggingface.co/google/gemma-3-12b-it/blob/main/config.json
GEMMA3_12B = Model(
    num_layers=48,
    hidden_dim=3840,
    intermediate_size=15360,
    vocab_size=262144,
    # Gemma models tie the input embedding and output (lm_head) weights.
    weight_tied_embeddings=True,
    # Dense model: the expert fields degenerate to a single always-active expert.
    is_moe=False,
    active_experts=1,
    total_experts=1,
)
# Gemma 3 27B — https://huggingface.co/google/gemma-3-27b-it/blob/main/config.json
GEMMA3_27B = Model(
    num_layers=62,
    hidden_dim=5376,
    intermediate_size=21504,
    vocab_size=262144,
    # Gemma models tie the input embedding and output (lm_head) weights.
    weight_tied_embeddings=True,
    # Dense model: the expert fields degenerate to a single always-active expert.
    is_moe=False,
    active_experts=1,
    total_experts=1,
)
# No Maverick: non-homogeneous layer stacks are not supported yet.

# Llama 4 Scout (17B active / 16 experts)
# https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct/blob/main/config.json
LLAMA4_SCOUT = Model(
    num_layers=48,
    hidden_dim=5120,
    # Per-expert FFN width (config `intermediate_size`, not `intermediate_size_mlp`).
    intermediate_size=8192,
    vocab_size=202048,
    # NOTE(review): verify against config.json `tie_word_embeddings` — large
    # Llama models typically do not tie embeddings.
    weight_tied_embeddings=True,
    is_moe=True,
    # 2 active / 17 total — presumably 16 routed experts plus the shared
    # expert counted as one; confirm against the MoE block definition.
    active_experts=2,
    total_experts=17,
)

# Llama 3.2 1B — https://huggingface.co/unsloth/Llama-3.2-1B-Instruct/blob/main/config.json
LLAMA3_1B = Model(
    num_layers=16,
    hidden_dim=2048,
    intermediate_size=8192,
    vocab_size=128256,
    # The small Llama 3.2 models (1B/3B) tie input/output embeddings.
    weight_tied_embeddings=True,
    # Dense model: the expert fields degenerate to a single always-active expert.
    is_moe=False,
    active_experts=1,
    total_experts=1,
)

# Llama 3.2 3B — https://huggingface.co/unsloth/Llama-3.2-3B-Instruct/blob/main/config.json
LLAMA3_3B = Model(
    num_layers=28,
    hidden_dim=3072,
    intermediate_size=8192,
    vocab_size=128256,
    # The small Llama 3.2 models (1B/3B) tie input/output embeddings.
    weight_tied_embeddings=True,
    # Dense model: the expert fields degenerate to a single always-active expert.
    is_moe=False,
    active_experts=1,
    total_experts=1,
)

# Llama 3 8B — https://huggingface.co/unsloth/llama-3-8b-Instruct/blob/main/config.json
LLAMA3_8B = Model(
    vocab_size=128256,
    num_layers=32,
    hidden_dim=4096,
    intermediate_size=14336,
    # FIX: the referenced config.json has "tie_word_embeddings": false —
    # unlike the 3.2 1B/3B models, Llama 3 8B keeps a separate lm_head.
    weight_tied_embeddings=False,
    # Dense model: the expert fields degenerate to a single always-active expert.
    active_experts=1,
    total_experts=1,
    is_moe=False,
)

# Llama 3.3 70B — https://huggingface.co/unsloth/Llama-3.3-70B-Instruct/blob/main/config.json
LLAMA3_70B = Model(
    vocab_size=128256,
    num_layers=80,
    hidden_dim=8192,
    intermediate_size=28672,
    # FIX: the referenced config.json has "tie_word_embeddings": false —
    # the 70B model keeps a separate lm_head rather than tying embeddings.
    weight_tied_embeddings=False,
    # Dense model: the expert fields degenerate to a single always-active expert.
    active_experts=1,
    total_experts=1,
    is_moe=False,
)

# Display-name -> preset config registry; insertion order is the menu order.
DEFAULTS = {
    name: preset
    for name, preset in [
        ("Gemma3 270M", GEMMA3_270M),
        ("Gemma3 1B", GEMMA3_1B),
        ("Gemma3 4B", GEMMA3_4B),
        ("Gemma3 12B", GEMMA3_12B),
        ("Gemma3 27B", GEMMA3_27B),
        ("Llama3 1B", LLAMA3_1B),
        ("Llama3 3B", LLAMA3_3B),
        ("Llama3 8B", LLAMA3_8B),
        ("Llama3 70B", LLAMA3_70B),
        ("Llama4 Scout", LLAMA4_SCOUT),
    ]
}