# Architecture presets for common open-weight LLMs (Gemma 3, Llama 3, Llama 4).
from state import Model
# https://huggingface.co/google/gemma-3-270m/blob/main/config.json
# Gemma 3 270M — smallest Gemma 3 variant. Dense model, so the MoE fields
# are filled with the degenerate 1-of-1 expert configuration.
GEMMA3_270M = Model(
    vocab_size=262144,  # 262,144-token vocab shared by every Gemma 3 size below
    num_layers=18,
    hidden_dim=640,
    intermediate_size=2048,  # FFN inner width
    weight_tied_embeddings=True,  # all Gemma 3 sizes here tie input/output embeddings
    active_experts=1,  # dense: 1 expert, always active
    total_experts=1,
    is_moe=False,
)
# Gemma 3 1B — dense; MoE fields use the degenerate 1-of-1 configuration.
GEMMA3_1B = Model(
    vocab_size=262144,  # same vocab as the other Gemma 3 presets in this file
    num_layers=26,
    hidden_dim=1152,
    intermediate_size=6912,  # FFN inner width
    weight_tied_embeddings=True,
    active_experts=1,  # dense: 1 expert, always active
    total_experts=1,
    is_moe=False,
)
# Gemma 3 4B — dense; MoE fields use the degenerate 1-of-1 configuration.
GEMMA3_4B = Model(
    vocab_size=262144,
    num_layers=34,
    hidden_dim=2560,
    intermediate_size=10240,  # FFN inner width (4x hidden_dim)
    weight_tied_embeddings=True,
    active_experts=1,  # dense: 1 expert, always active
    total_experts=1,
    is_moe=False,
)
# Gemma 3 12B — dense; MoE fields use the degenerate 1-of-1 configuration.
GEMMA3_12B = Model(
    vocab_size=262144,
    num_layers=48,
    hidden_dim=3840,
    intermediate_size=15360,  # FFN inner width (4x hidden_dim)
    weight_tied_embeddings=True,
    active_experts=1,  # dense: 1 expert, always active
    total_experts=1,
    is_moe=False,
)
# Gemma 3 27B — largest Gemma 3 preset here; dense, degenerate 1-of-1 MoE fields.
GEMMA3_27B = Model(
    vocab_size=262144,
    num_layers=62,
    hidden_dim=5376,
    intermediate_size=21504,  # FFN inner width (4x hidden_dim)
    weight_tied_embeddings=True,
    active_experts=1,  # dense: 1 expert, always active
    total_experts=1,
    is_moe=False,
)
# No maverick, don't support non-homogenous layers yet
# https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct/blob/main/config.json
# Llama 4 Scout 17B-16E — the only mixture-of-experts preset in this file.
# NOTE(review): active_experts=2 / total_experts=17 presumably counts the
# shared expert on top of the 16 routed experts (1 routed + 1 shared active
# per token) — TODO confirm against the linked config.
# NOTE(review): weight_tied_embeddings=True looks suspect for a model of this
# size — verify "tie_word_embeddings" in the linked config.json.
LLAMA4_SCOUT = Model(
    vocab_size=202048,
    num_layers=48,
    hidden_dim=5120,
    intermediate_size=8192,  # presumably the per-expert FFN width — verify
    weight_tied_embeddings=True,
    active_experts=2,
    total_experts=17,
    is_moe=True,
)
# https://huggingface.co/unsloth/Llama-3.2-1B-Instruct/blob/main/config.json
# Llama 3.2 1B — dense; the small 3.2 checkpoints tie input/output embeddings.
LLAMA3_1B = Model(
    vocab_size=128256,  # Llama 3 family tiktoken-based vocab
    num_layers=16,
    hidden_dim=2048,
    intermediate_size=8192,  # FFN inner width
    weight_tied_embeddings=True,
    active_experts=1,  # dense: 1 expert, always active
    total_experts=1,
    is_moe=False,
)
# https://huggingface.co/unsloth/Llama-3.2-3B-Instruct/blob/main/config.json
# Llama 3.2 3B — dense; like the 1B, it ties input/output embeddings.
LLAMA3_3B = Model(
    vocab_size=128256,  # same vocab as the other Llama 3 presets
    num_layers=28,
    hidden_dim=3072,
    intermediate_size=8192,  # FFN inner width
    weight_tied_embeddings=True,
    active_experts=1,  # dense: 1 expert, always active
    total_experts=1,
    is_moe=False,
)
# https://huggingface.co/unsloth/llama-3-8b-Instruct/blob/main/config.json
# Llama 3 8B — dense decoder. Unlike the small Llama 3.2 1B/3B checkpoints,
# the 8B model does NOT share input/output embedding weights: the linked
# config.json sets "tie_word_embeddings": false.
LLAMA3_8B = Model(
    vocab_size=128256,
    num_layers=32,
    hidden_dim=4096,
    intermediate_size=14336,  # FFN inner width
    weight_tied_embeddings=False,  # fixed: was True; config says false
    active_experts=1,  # dense: 1 expert, always active
    total_experts=1,
    is_moe=False,
)
# https://huggingface.co/unsloth/Llama-3.3-70B-Instruct/blob/main/config.json
# Llama 3.3 70B — dense decoder. Like the 8B, it does NOT tie input/output
# embeddings: the linked config.json sets "tie_word_embeddings": false
# (only the small Llama 3.2 1B/3B variants tie).
LLAMA3_70B = Model(
    vocab_size=128256,
    num_layers=80,
    hidden_dim=8192,
    intermediate_size=28672,  # FFN inner width
    weight_tied_embeddings=False,  # fixed: was True; config says false
    active_experts=1,  # dense: 1 expert, always active
    total_experts=1,
    is_moe=False,
)
# Registry mapping a human-readable preset label to its Model constants.
# Insertion order is preserved by dict, so this also fixes the order in which
# presets are listed — presumably used as display labels by a caller; verify.
DEFAULTS = {
    "Gemma3 270M": GEMMA3_270M,
    "Gemma3 1B": GEMMA3_1B,
    "Gemma3 4B": GEMMA3_4B,
    "Gemma3 12B": GEMMA3_12B,
    "Gemma3 27B": GEMMA3_27B,
    "Llama3 1B": LLAMA3_1B,
    "Llama3 3B": LLAMA3_3B,
    "Llama3 8B": LLAMA3_8B,
    "Llama3 70B": LLAMA3_70B,
    "Llama4 Scout": LLAMA4_SCOUT,
}
|