Alyosha11 lckr bartowski committed on
Commit
40d10aa
·
verified ·
0 Parent(s):

Super-squash branch 'main' using huggingface_hub

Browse files

Co-authored-by: lckr <lckr@users.noreply.huggingface.co>
Co-authored-by: bartowski <bartowski@users.noreply.huggingface.co>

This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50) hide show
  1. .gitattributes +36 -0
  2. README.md +224 -0
  3. chat_template.jinja +65 -0
  4. config.json +511 -0
  5. configuration_afmoe.py +133 -0
  6. generation_config.json +7 -0
  7. model-00001-of-00081.safetensors +3 -0
  8. model-00002-of-00081.safetensors +3 -0
  9. model-00003-of-00081.safetensors +3 -0
  10. model-00004-of-00081.safetensors +3 -0
  11. model-00005-of-00081.safetensors +3 -0
  12. model-00006-of-00081.safetensors +3 -0
  13. model-00007-of-00081.safetensors +3 -0
  14. model-00008-of-00081.safetensors +3 -0
  15. model-00009-of-00081.safetensors +3 -0
  16. model-00010-of-00081.safetensors +3 -0
  17. model-00011-of-00081.safetensors +3 -0
  18. model-00012-of-00081.safetensors +3 -0
  19. model-00013-of-00081.safetensors +3 -0
  20. model-00014-of-00081.safetensors +3 -0
  21. model-00015-of-00081.safetensors +3 -0
  22. model-00016-of-00081.safetensors +3 -0
  23. model-00017-of-00081.safetensors +3 -0
  24. model-00018-of-00081.safetensors +3 -0
  25. model-00019-of-00081.safetensors +3 -0
  26. model-00020-of-00081.safetensors +3 -0
  27. model-00021-of-00081.safetensors +3 -0
  28. model-00022-of-00081.safetensors +3 -0
  29. model-00023-of-00081.safetensors +3 -0
  30. model-00024-of-00081.safetensors +3 -0
  31. model-00025-of-00081.safetensors +3 -0
  32. model-00026-of-00081.safetensors +3 -0
  33. model-00027-of-00081.safetensors +3 -0
  34. model-00028-of-00081.safetensors +3 -0
  35. model-00029-of-00081.safetensors +3 -0
  36. model-00030-of-00081.safetensors +3 -0
  37. model-00031-of-00081.safetensors +3 -0
  38. model-00032-of-00081.safetensors +3 -0
  39. model-00033-of-00081.safetensors +3 -0
  40. model-00034-of-00081.safetensors +3 -0
  41. model-00035-of-00081.safetensors +3 -0
  42. model-00036-of-00081.safetensors +3 -0
  43. model-00037-of-00081.safetensors +3 -0
  44. model-00038-of-00081.safetensors +3 -0
  45. model-00039-of-00081.safetensors +3 -0
  46. model-00040-of-00081.safetensors +3 -0
  47. model-00041-of-00081.safetensors +3 -0
  48. model-00042-of-00081.safetensors +3 -0
  49. model-00043-of-00081.safetensors +3 -0
  50. model-00044-of-00081.safetensors +3 -0
.gitattributes ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,224 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ language:
4
+ - en
5
+ - es
6
+ - fr
7
+ - de
8
+ - it
9
+ - pt
10
+ - ru
11
+ - ar
12
+ - hi
13
+ - ko
14
+ - zh
15
+ library_name: transformers
16
+ base_model:
17
+ - arcee-ai/Trinity-Large-Preview
18
+ base_model_relation: quantized
19
+ ---
20
+ <!-- markdownlint-disable first-line-h1 -->
21
+ <!-- markdownlint-disable html -->
22
+ <!-- markdownlint-disable no-duplicate-header -->
23
+
24
+ <div align="center">
25
+ <picture>
26
+ <img
27
+ src="https://cdn-uploads.huggingface.co/production/uploads/6435718aaaef013d1aec3b8b/i-v1KyAMOW_mgVGeic9WJ.png"
28
+ alt="Arcee Trinity Large"
29
+ style="max-width: 100%; height: auto;"
30
+ >
31
+ </picture>
32
+ </div>
33
+ <hr>
34
+
35
+ # Trinity-Large-Preview-FP8
36
+
37
+ ## Introduction
38
+
39
+ Trinity-Large-Preview is a 398B-parameter sparse Mixture-of-Experts (MoE) model with approximately 13B active parameters per token. It is the largest model in Arcee AI's Trinity family, trained on more than 17 trillion tokens and delivering frontier-level performance with strong long-context comprehension.
40
+ Trinity-Large-Preview is a lightly post-trained model based on Trinity-Large-Base.
41
+
42
+ **This repository contains the FP8 quantized weights of Trinity-Large-Preview.**
43
+
44
+ Try it at [chat.arcee.ai](https://chat.arcee.ai/)
45
+
46
+ More details on the training of Trinity Large are available in the [technical report](https://github.com/arcee-ai/trinity-large-tech-report/).
47
+
48
+
49
+ ## Model Variants
50
+
51
+ The Trinity Large family consists of three checkpoints from the same training run:
52
+
53
+ - **[Trinity-Large-Preview](https://huggingface.co/arcee-ai/Trinity-Large-Preview)**: Lightly post-trained, chat-ready model undergoing active RL
54
+ - **[Trinity-Large-TrueBase](https://huggingface.co/arcee-ai/Trinity-Large-TrueBase)**: 10T-token pre-anneal pretraining checkpoint
55
+ - **[Trinity-Large-Base](https://huggingface.co/arcee-ai/Trinity-Large-Base)**: Full 17T-token pretrained foundation model with mid-training anneals
56
+
57
+ ## Architecture
58
+
59
+ Trinity-Large-Preview uses a sparse MoE configuration designed to maximize efficiency while maintaining large-scale capacity.
60
+
61
+ | Hyperparameter | Value |
62
+ |:---|:---:|
63
+ | Total parameters | ~398B |
64
+ | Active parameters per token | ~13B |
65
+ | Experts | 256 (1 shared) |
66
+ | Active experts | 4 |
67
+ | Routing strategy | 4-of-256 (1.56% sparsity) |
68
+ | Dense layers | 6 |
69
+ | Pretraining context length | 8,192 |
70
+ | Context length after extension | 512k |
71
+ | Architecture | Sparse MoE (AfmoeForCausalLM) |
72
+
73
+ ## Benchmarks
74
+
75
+ | Benchmark | Llama 4 Maverick | Trinity-Large Preview |
76
+ |-----------|------------------|----------------------|
77
+ | MMLU | 85.5 | 87.2 |
78
+ | MMLU-Pro | 80.5 | 75.2 |
79
+ | GPQA-Diamond | 69.8 | 63.3 |
80
+ | AIME 2025 | 19.3 | 24.0 |
81
+
82
+ ## Training Configuration
83
+
84
+ ### Pretraining
85
+
86
+ - Training tokens: 17 trillion
87
+ - Data partner: [Datology](https://www.datologyai.com/)
88
+
89
+ <div align="center">
90
+ <picture>
91
+ <img src="https://cdn-uploads.huggingface.co/production/uploads/6435718aaaef013d1aec3b8b/sSVjGNHfrJKmQ6w8I18ek.png" style="background-color:ghostwhite;padding:5px;" width="17%" alt="Powered by Datology">
92
+ </picture>
93
+ </div>
94
+
95
+ ### Posttraining
96
+ - This checkpoint was instruction-tuned on 20B tokens.
97
+
98
+ ### Infrastructure
99
+
100
+ - Hardware: 2,048 NVIDIA B300 GPUs
101
+ - Parallelism: HSDP + Expert Parallelism
102
+ - Compute partner: [Prime Intellect](https://www.primeintellect.ai/)
103
+
104
+
105
+ <div align="center">
106
+ <picture>
107
+ <img src="https://cdn-avatars.huggingface.co/v1/production/uploads/61e020e4a343274bb132e138/H2mcdPRWtl4iKLd-OYYBc.jpeg" style="background-color:ghostwhite;padding:5px;" width="17%" alt="Powered by Prime Intellect">
108
+ </picture>
109
+ </div>
110
+
111
+ ## Usage
112
+
113
+ ### Running our model
114
+
115
+ - [Transformers](https://huggingface.co/arcee-ai/Trinity-Large-Preview#transformers)
116
+ - [VLLM](https://huggingface.co/arcee-ai/Trinity-Large-Preview#vllm)
117
+ - [llama.cpp](https://huggingface.co/arcee-ai/Trinity-Large-Preview#llamacpp)
118
+ - [LM Studio](https://huggingface.co/arcee-ai/Trinity-Large-Preview#lm-studio)
119
+ - [API](https://huggingface.co/arcee-ai/Trinity-Large-Preview#api)
120
+
121
+
122
+ ### Recommended settings
123
+ * temperature:
124
+ * top_k:
125
+ * top_p:
126
+ * min_p:
127
+
128
+ ### Transformers
129
+
130
+ Use the `main` transformers branch or pass `trust_remote_code=True` with a released version.
131
+
132
+ ```python
133
+ from transformers import AutoTokenizer, AutoModelForCausalLM
134
+ import torch
135
+
136
+ model_id = "arcee-ai/Trinity-Large-Preview"
137
+ tokenizer = AutoTokenizer.from_pretrained(model_id)
138
+ model = AutoModelForCausalLM.from_pretrained(
139
+ model_id,
140
+ torch_dtype=torch.bfloat16,
141
+ device_map="auto",
142
+ trust_remote_code=True
143
+ )
144
+
145
+ messages = [
146
+ {"role": "user", "content": "Who are you?"},
147
+ ]
148
+
149
+ input_ids = tokenizer.apply_chat_template(
150
+ messages,
151
+ add_generation_prompt=True,
152
+ return_tensors="pt"
153
+ ).to(model.device)
154
+
155
+ outputs = model.generate(
156
+ input_ids,
157
+ max_new_tokens=256,
158
+ do_sample=True,
159
+ temperature=0.5,
160
+ top_k=50,
161
+ top_p=0.95
162
+ )
163
+
164
+ response = tokenizer.decode(outputs[0], skip_special_tokens=True)
165
+ print(response)
166
+ ```
167
+
168
+ ### VLLM
169
+
170
+ Supported in vLLM release 0.11.1+
171
+
172
+ ```bash
173
+ vllm serve arcee-ai/Trinity-Large-Preview \
174
+ --dtype bfloat16 \
175
+ --enable-auto-tool-choice \
176
+ --tool-call-parser hermes
177
+ ```
178
+
179
+ ### llama.cpp
180
+
181
+ Supported in llama.cpp release b7061+
182
+
183
+ ```bash
184
+ llama-server -hf arcee-ai/Trinity-Large-Preview-GGUF:q4_k_m
185
+ ```
186
+
187
+ ### LM Studio
188
+
189
+ Supported in the latest LM Studio runtime. Search for `arcee-ai/Trinity-Large-Preview-GGUF` in Model Search.
190
+
191
+ ### API
192
+
193
+ Available on OpenRouter:
194
+
195
+ ```bash
196
+ curl -X POST "https://openrouter.ai/v1/chat/completions" \
197
+ -H "Authorization: Bearer $OPENROUTER_API_KEY" \
198
+ -H "Content-Type: application/json" \
199
+ -d '{
200
+ "model": "arcee-ai/trinity-large-preview",
201
+ "messages": [
202
+ {
203
+ "role": "user",
204
+ "content": "What are some fun things to do in New York?"
205
+ }
206
+ ]
207
+ }'
208
+ ```
209
+
210
+
211
+ ## License
212
+
213
+ Trinity-Large-Preview is released under the Apache License, Version 2.0.
214
+
215
+ ## Citation
216
+
217
+ ```bibtex
218
+ @misc{arcee_trinity_large_preview,
219
+ title = {Trinity-Large-Preview},
220
+ author = {{Arcee AI}},
221
+ year = {2026},
222
+ note = {398B sparse MoE model trained on 17T tokens}
223
+ }
224
+ ```
chat_template.jinja ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {{ bos_token }}{%- if tools %}
2
+ {{- '<|im_start|>system\n' }}
3
+ {%- if messages[0].role == 'system' %}
4
+ {{- messages[0].content + '\n\n' }}
5
+ {%- endif %}
6
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
7
+ {%- for tool in tools %}
8
+ {{- "\n" }}
9
+ {{- tool | tojson }}
10
+ {%- endfor %}
11
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
12
+ {%- else %}
13
+ {%- if messages[0].role == 'system' %}
14
+ {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
15
+ {%- endif %}
16
+ {%- endif %}
17
+ {%- for message in messages %}
18
+ {%- if message.content is string %}
19
+ {%- set content = message.content %}
20
+ {%- else %}
21
+ {%- set content = '' %}
22
+ {%- endif %}
23
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
24
+ {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }}
25
+ {%- elif message.role == "assistant" %}
26
+ {{- '<|im_start|>' + message.role + '\n' }}
27
+ {% generation %}
28
+ {{- content}}
29
+ {%- if message.tool_calls %}
30
+ {%- for tool_call in message.tool_calls %}
31
+ {%- if (loop.first and content) or (not loop.first) %}
32
+ {{- '\n' }}
33
+ {%- endif %}
34
+ {%- if tool_call.function %}
35
+ {%- set tool_call = tool_call.function %}
36
+ {%- endif %}
37
+ {{- '<tool_call>\n{"name": "' }}
38
+ {{- tool_call.name }}
39
+ {{- '", "arguments": ' }}
40
+ {%- if tool_call.arguments is string %}
41
+ {{- tool_call.arguments }}
42
+ {%- else %}
43
+ {{- tool_call.arguments | tojson }}
44
+ {%- endif %}
45
+ {{- '}\n</tool_call>' }}
46
+ {%- endfor %}
47
+ {%- endif %}
48
+ {{- '<|im_end|>' }}
49
+ {% endgeneration %}
50
+ {{- '\n' }}
51
+ {%- elif message.role == "tool" %}
52
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
53
+ {{- '<|im_start|>user' }}
54
+ {%- endif %}
55
+ {{- '\n<tool_response>\n' }}
56
+ {{- content }}
57
+ {{- '\n</tool_response>' }}
58
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
59
+ {{- '<|im_end|>\n' }}
60
+ {%- endif %}
61
+ {%- endif %}
62
+ {%- endfor %}
63
+ {%- if add_generation_prompt %}
64
+ {{- '<|im_start|>assistant\n' }}
65
+ {%- endif %}
config.json ADDED
@@ -0,0 +1,511 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "AfmoeForCausalLM"
4
+ ],
5
+ "attention_dropout": 0.0,
6
+ "auto_map": {
7
+ "AutoConfig": "configuration_afmoe.AfmoeConfig",
8
+ "AutoModel": "modeling_afmoe.AfmoeModel",
9
+ "AutoModelForCausalLM": "modeling_afmoe.AfmoeForCausalLM"
10
+ },
11
+ "bos_token_id": 0,
12
+ "dtype": "bfloat16",
13
+ "eos_token_id": 3,
14
+ "global_attn_every_n_layers": 4,
15
+ "head_dim": 128,
16
+ "hidden_act": "silu",
17
+ "hidden_size": 3072,
18
+ "initializer_range": 0.02,
19
+ "intermediate_size": 12288,
20
+ "layer_types": [
21
+ "sliding_attention",
22
+ "sliding_attention",
23
+ "sliding_attention",
24
+ "full_attention",
25
+ "sliding_attention",
26
+ "sliding_attention",
27
+ "sliding_attention",
28
+ "full_attention",
29
+ "sliding_attention",
30
+ "sliding_attention",
31
+ "sliding_attention",
32
+ "full_attention",
33
+ "sliding_attention",
34
+ "sliding_attention",
35
+ "sliding_attention",
36
+ "full_attention",
37
+ "sliding_attention",
38
+ "sliding_attention",
39
+ "sliding_attention",
40
+ "full_attention",
41
+ "sliding_attention",
42
+ "sliding_attention",
43
+ "sliding_attention",
44
+ "full_attention",
45
+ "sliding_attention",
46
+ "sliding_attention",
47
+ "sliding_attention",
48
+ "full_attention",
49
+ "sliding_attention",
50
+ "sliding_attention",
51
+ "sliding_attention",
52
+ "full_attention",
53
+ "sliding_attention",
54
+ "sliding_attention",
55
+ "sliding_attention",
56
+ "full_attention",
57
+ "sliding_attention",
58
+ "sliding_attention",
59
+ "sliding_attention",
60
+ "full_attention",
61
+ "sliding_attention",
62
+ "sliding_attention",
63
+ "sliding_attention",
64
+ "full_attention",
65
+ "sliding_attention",
66
+ "sliding_attention",
67
+ "sliding_attention",
68
+ "full_attention",
69
+ "sliding_attention",
70
+ "sliding_attention",
71
+ "sliding_attention",
72
+ "full_attention",
73
+ "sliding_attention",
74
+ "sliding_attention",
75
+ "sliding_attention",
76
+ "full_attention",
77
+ "sliding_attention",
78
+ "sliding_attention",
79
+ "sliding_attention",
80
+ "full_attention"
81
+ ],
82
+ "load_balance_coeff": 5e-05,
83
+ "max_position_embeddings": 262144,
84
+ "model_type": "afmoe",
85
+ "moe_intermediate_size": 3072,
86
+ "mup_enabled": true,
87
+ "n_group": 1,
88
+ "num_attention_heads": 48,
89
+ "num_dense_layers": 6,
90
+ "num_expert_groups": 1,
91
+ "num_experts": 256,
92
+ "num_experts_per_tok": 4,
93
+ "num_hidden_layers": 60,
94
+ "num_key_value_heads": 8,
95
+ "num_limited_groups": 1,
96
+ "num_shared_experts": 1,
97
+ "pad_token_id": 12,
98
+ "quantization_config": {
99
+ "config_groups": {
100
+ "group_0": {
101
+ "format": "float-quantized",
102
+ "input_activations": {
103
+ "actorder": null,
104
+ "block_structure": null,
105
+ "dynamic": true,
106
+ "group_size": null,
107
+ "num_bits": 8,
108
+ "observer": null,
109
+ "observer_kwargs": {},
110
+ "strategy": "token",
111
+ "symmetric": true,
112
+ "type": "float"
113
+ },
114
+ "output_activations": null,
115
+ "targets": [
116
+ "Linear"
117
+ ],
118
+ "weights": {
119
+ "actorder": null,
120
+ "block_structure": null,
121
+ "dynamic": false,
122
+ "group_size": null,
123
+ "num_bits": 8,
124
+ "observer": "minmax",
125
+ "observer_kwargs": {},
126
+ "strategy": "channel",
127
+ "symmetric": true,
128
+ "type": "float"
129
+ }
130
+ }
131
+ },
132
+ "format": "float-quantized",
133
+ "global_compression_ratio": null,
134
+ "ignore": [
135
+ "model.layers.0.self_attn.q_proj",
136
+ "model.layers.0.self_attn.k_proj",
137
+ "model.layers.0.self_attn.v_proj",
138
+ "model.layers.0.self_attn.o_proj",
139
+ "model.layers.0.self_attn.gate_proj",
140
+ "model.layers.1.self_attn.q_proj",
141
+ "model.layers.1.self_attn.k_proj",
142
+ "model.layers.1.self_attn.v_proj",
143
+ "model.layers.1.self_attn.o_proj",
144
+ "model.layers.1.self_attn.gate_proj",
145
+ "model.layers.2.self_attn.q_proj",
146
+ "model.layers.2.self_attn.k_proj",
147
+ "model.layers.2.self_attn.v_proj",
148
+ "model.layers.2.self_attn.o_proj",
149
+ "model.layers.2.self_attn.gate_proj",
150
+ "model.layers.3.self_attn.q_proj",
151
+ "model.layers.3.self_attn.k_proj",
152
+ "model.layers.3.self_attn.v_proj",
153
+ "model.layers.3.self_attn.o_proj",
154
+ "model.layers.3.self_attn.gate_proj",
155
+ "model.layers.4.self_attn.q_proj",
156
+ "model.layers.4.self_attn.k_proj",
157
+ "model.layers.4.self_attn.v_proj",
158
+ "model.layers.4.self_attn.o_proj",
159
+ "model.layers.4.self_attn.gate_proj",
160
+ "model.layers.5.self_attn.q_proj",
161
+ "model.layers.5.self_attn.k_proj",
162
+ "model.layers.5.self_attn.v_proj",
163
+ "model.layers.5.self_attn.o_proj",
164
+ "model.layers.5.self_attn.gate_proj",
165
+ "model.layers.6.self_attn.q_proj",
166
+ "model.layers.6.self_attn.k_proj",
167
+ "model.layers.6.self_attn.v_proj",
168
+ "model.layers.6.self_attn.o_proj",
169
+ "model.layers.6.self_attn.gate_proj",
170
+ "model.layers.6.mlp.router.gate",
171
+ "model.layers.7.self_attn.q_proj",
172
+ "model.layers.7.self_attn.k_proj",
173
+ "model.layers.7.self_attn.v_proj",
174
+ "model.layers.7.self_attn.o_proj",
175
+ "model.layers.7.self_attn.gate_proj",
176
+ "model.layers.7.mlp.router.gate",
177
+ "model.layers.8.self_attn.q_proj",
178
+ "model.layers.8.self_attn.k_proj",
179
+ "model.layers.8.self_attn.v_proj",
180
+ "model.layers.8.self_attn.o_proj",
181
+ "model.layers.8.self_attn.gate_proj",
182
+ "model.layers.8.mlp.router.gate",
183
+ "model.layers.9.self_attn.q_proj",
184
+ "model.layers.9.self_attn.k_proj",
185
+ "model.layers.9.self_attn.v_proj",
186
+ "model.layers.9.self_attn.o_proj",
187
+ "model.layers.9.self_attn.gate_proj",
188
+ "model.layers.9.mlp.router.gate",
189
+ "model.layers.10.self_attn.q_proj",
190
+ "model.layers.10.self_attn.k_proj",
191
+ "model.layers.10.self_attn.v_proj",
192
+ "model.layers.10.self_attn.o_proj",
193
+ "model.layers.10.self_attn.gate_proj",
194
+ "model.layers.10.mlp.router.gate",
195
+ "model.layers.11.self_attn.q_proj",
196
+ "model.layers.11.self_attn.k_proj",
197
+ "model.layers.11.self_attn.v_proj",
198
+ "model.layers.11.self_attn.o_proj",
199
+ "model.layers.11.self_attn.gate_proj",
200
+ "model.layers.11.mlp.router.gate",
201
+ "model.layers.12.self_attn.q_proj",
202
+ "model.layers.12.self_attn.k_proj",
203
+ "model.layers.12.self_attn.v_proj",
204
+ "model.layers.12.self_attn.o_proj",
205
+ "model.layers.12.self_attn.gate_proj",
206
+ "model.layers.12.mlp.router.gate",
207
+ "model.layers.13.self_attn.q_proj",
208
+ "model.layers.13.self_attn.k_proj",
209
+ "model.layers.13.self_attn.v_proj",
210
+ "model.layers.13.self_attn.o_proj",
211
+ "model.layers.13.self_attn.gate_proj",
212
+ "model.layers.13.mlp.router.gate",
213
+ "model.layers.14.self_attn.q_proj",
214
+ "model.layers.14.self_attn.k_proj",
215
+ "model.layers.14.self_attn.v_proj",
216
+ "model.layers.14.self_attn.o_proj",
217
+ "model.layers.14.self_attn.gate_proj",
218
+ "model.layers.14.mlp.router.gate",
219
+ "model.layers.15.self_attn.q_proj",
220
+ "model.layers.15.self_attn.k_proj",
221
+ "model.layers.15.self_attn.v_proj",
222
+ "model.layers.15.self_attn.o_proj",
223
+ "model.layers.15.self_attn.gate_proj",
224
+ "model.layers.15.mlp.router.gate",
225
+ "model.layers.16.self_attn.q_proj",
226
+ "model.layers.16.self_attn.k_proj",
227
+ "model.layers.16.self_attn.v_proj",
228
+ "model.layers.16.self_attn.o_proj",
229
+ "model.layers.16.self_attn.gate_proj",
230
+ "model.layers.16.mlp.router.gate",
231
+ "model.layers.17.self_attn.q_proj",
232
+ "model.layers.17.self_attn.k_proj",
233
+ "model.layers.17.self_attn.v_proj",
234
+ "model.layers.17.self_attn.o_proj",
235
+ "model.layers.17.self_attn.gate_proj",
236
+ "model.layers.17.mlp.router.gate",
237
+ "model.layers.18.self_attn.q_proj",
238
+ "model.layers.18.self_attn.k_proj",
239
+ "model.layers.18.self_attn.v_proj",
240
+ "model.layers.18.self_attn.o_proj",
241
+ "model.layers.18.self_attn.gate_proj",
242
+ "model.layers.18.mlp.router.gate",
243
+ "model.layers.19.self_attn.q_proj",
244
+ "model.layers.19.self_attn.k_proj",
245
+ "model.layers.19.self_attn.v_proj",
246
+ "model.layers.19.self_attn.o_proj",
247
+ "model.layers.19.self_attn.gate_proj",
248
+ "model.layers.19.mlp.router.gate",
249
+ "model.layers.20.self_attn.q_proj",
250
+ "model.layers.20.self_attn.k_proj",
251
+ "model.layers.20.self_attn.v_proj",
252
+ "model.layers.20.self_attn.o_proj",
253
+ "model.layers.20.self_attn.gate_proj",
254
+ "model.layers.20.mlp.router.gate",
255
+ "model.layers.21.self_attn.q_proj",
256
+ "model.layers.21.self_attn.k_proj",
257
+ "model.layers.21.self_attn.v_proj",
258
+ "model.layers.21.self_attn.o_proj",
259
+ "model.layers.21.self_attn.gate_proj",
260
+ "model.layers.21.mlp.router.gate",
261
+ "model.layers.22.self_attn.q_proj",
262
+ "model.layers.22.self_attn.k_proj",
263
+ "model.layers.22.self_attn.v_proj",
264
+ "model.layers.22.self_attn.o_proj",
265
+ "model.layers.22.self_attn.gate_proj",
266
+ "model.layers.22.mlp.router.gate",
267
+ "model.layers.23.self_attn.q_proj",
268
+ "model.layers.23.self_attn.k_proj",
269
+ "model.layers.23.self_attn.v_proj",
270
+ "model.layers.23.self_attn.o_proj",
271
+ "model.layers.23.self_attn.gate_proj",
272
+ "model.layers.23.mlp.router.gate",
273
+ "model.layers.24.self_attn.q_proj",
274
+ "model.layers.24.self_attn.k_proj",
275
+ "model.layers.24.self_attn.v_proj",
276
+ "model.layers.24.self_attn.o_proj",
277
+ "model.layers.24.self_attn.gate_proj",
278
+ "model.layers.24.mlp.router.gate",
279
+ "model.layers.25.self_attn.q_proj",
280
+ "model.layers.25.self_attn.k_proj",
281
+ "model.layers.25.self_attn.v_proj",
282
+ "model.layers.25.self_attn.o_proj",
283
+ "model.layers.25.self_attn.gate_proj",
284
+ "model.layers.25.mlp.router.gate",
285
+ "model.layers.26.self_attn.q_proj",
286
+ "model.layers.26.self_attn.k_proj",
287
+ "model.layers.26.self_attn.v_proj",
288
+ "model.layers.26.self_attn.o_proj",
289
+ "model.layers.26.self_attn.gate_proj",
290
+ "model.layers.26.mlp.router.gate",
291
+ "model.layers.27.self_attn.q_proj",
292
+ "model.layers.27.self_attn.k_proj",
293
+ "model.layers.27.self_attn.v_proj",
294
+ "model.layers.27.self_attn.o_proj",
295
+ "model.layers.27.self_attn.gate_proj",
296
+ "model.layers.27.mlp.router.gate",
297
+ "model.layers.28.self_attn.q_proj",
298
+ "model.layers.28.self_attn.k_proj",
299
+ "model.layers.28.self_attn.v_proj",
300
+ "model.layers.28.self_attn.o_proj",
301
+ "model.layers.28.self_attn.gate_proj",
302
+ "model.layers.28.mlp.router.gate",
303
+ "model.layers.29.self_attn.q_proj",
304
+ "model.layers.29.self_attn.k_proj",
305
+ "model.layers.29.self_attn.v_proj",
306
+ "model.layers.29.self_attn.o_proj",
307
+ "model.layers.29.self_attn.gate_proj",
308
+ "model.layers.29.mlp.router.gate",
309
+ "model.layers.30.self_attn.q_proj",
310
+ "model.layers.30.self_attn.k_proj",
311
+ "model.layers.30.self_attn.v_proj",
312
+ "model.layers.30.self_attn.o_proj",
313
+ "model.layers.30.self_attn.gate_proj",
314
+ "model.layers.30.mlp.router.gate",
315
+ "model.layers.31.self_attn.q_proj",
316
+ "model.layers.31.self_attn.k_proj",
317
+ "model.layers.31.self_attn.v_proj",
318
+ "model.layers.31.self_attn.o_proj",
319
+ "model.layers.31.self_attn.gate_proj",
320
+ "model.layers.31.mlp.router.gate",
321
+ "model.layers.32.self_attn.q_proj",
322
+ "model.layers.32.self_attn.k_proj",
323
+ "model.layers.32.self_attn.v_proj",
324
+ "model.layers.32.self_attn.o_proj",
325
+ "model.layers.32.self_attn.gate_proj",
326
+ "model.layers.32.mlp.router.gate",
327
+ "model.layers.33.self_attn.q_proj",
328
+ "model.layers.33.self_attn.k_proj",
329
+ "model.layers.33.self_attn.v_proj",
330
+ "model.layers.33.self_attn.o_proj",
331
+ "model.layers.33.self_attn.gate_proj",
332
+ "model.layers.33.mlp.router.gate",
333
+ "model.layers.34.self_attn.q_proj",
334
+ "model.layers.34.self_attn.k_proj",
335
+ "model.layers.34.self_attn.v_proj",
336
+ "model.layers.34.self_attn.o_proj",
337
+ "model.layers.34.self_attn.gate_proj",
338
+ "model.layers.34.mlp.router.gate",
339
+ "model.layers.35.self_attn.q_proj",
340
+ "model.layers.35.self_attn.k_proj",
341
+ "model.layers.35.self_attn.v_proj",
342
+ "model.layers.35.self_attn.o_proj",
343
+ "model.layers.35.self_attn.gate_proj",
344
+ "model.layers.35.mlp.router.gate",
345
+ "model.layers.36.self_attn.q_proj",
346
+ "model.layers.36.self_attn.k_proj",
347
+ "model.layers.36.self_attn.v_proj",
348
+ "model.layers.36.self_attn.o_proj",
349
+ "model.layers.36.self_attn.gate_proj",
350
+ "model.layers.36.mlp.router.gate",
351
+ "model.layers.37.self_attn.q_proj",
352
+ "model.layers.37.self_attn.k_proj",
353
+ "model.layers.37.self_attn.v_proj",
354
+ "model.layers.37.self_attn.o_proj",
355
+ "model.layers.37.self_attn.gate_proj",
356
+ "model.layers.37.mlp.router.gate",
357
+ "model.layers.38.self_attn.q_proj",
358
+ "model.layers.38.self_attn.k_proj",
359
+ "model.layers.38.self_attn.v_proj",
360
+ "model.layers.38.self_attn.o_proj",
361
+ "model.layers.38.self_attn.gate_proj",
362
+ "model.layers.38.mlp.router.gate",
363
+ "model.layers.39.self_attn.q_proj",
364
+ "model.layers.39.self_attn.k_proj",
365
+ "model.layers.39.self_attn.v_proj",
366
+ "model.layers.39.self_attn.o_proj",
367
+ "model.layers.39.self_attn.gate_proj",
368
+ "model.layers.39.mlp.router.gate",
369
+ "model.layers.40.self_attn.q_proj",
370
+ "model.layers.40.self_attn.k_proj",
371
+ "model.layers.40.self_attn.v_proj",
372
+ "model.layers.40.self_attn.o_proj",
373
+ "model.layers.40.self_attn.gate_proj",
374
+ "model.layers.40.mlp.router.gate",
375
+ "model.layers.41.self_attn.q_proj",
376
+ "model.layers.41.self_attn.k_proj",
377
+ "model.layers.41.self_attn.v_proj",
378
+ "model.layers.41.self_attn.o_proj",
379
+ "model.layers.41.self_attn.gate_proj",
380
+ "model.layers.41.mlp.router.gate",
381
+ "model.layers.42.self_attn.q_proj",
382
+ "model.layers.42.self_attn.k_proj",
383
+ "model.layers.42.self_attn.v_proj",
384
+ "model.layers.42.self_attn.o_proj",
385
+ "model.layers.42.self_attn.gate_proj",
386
+ "model.layers.42.mlp.router.gate",
387
+ "model.layers.43.self_attn.q_proj",
388
+ "model.layers.43.self_attn.k_proj",
389
+ "model.layers.43.self_attn.v_proj",
390
+ "model.layers.43.self_attn.o_proj",
391
+ "model.layers.43.self_attn.gate_proj",
392
+ "model.layers.43.mlp.router.gate",
393
+ "model.layers.44.self_attn.q_proj",
394
+ "model.layers.44.self_attn.k_proj",
395
+ "model.layers.44.self_attn.v_proj",
396
+ "model.layers.44.self_attn.o_proj",
397
+ "model.layers.44.self_attn.gate_proj",
398
+ "model.layers.44.mlp.router.gate",
399
+ "model.layers.45.self_attn.q_proj",
400
+ "model.layers.45.self_attn.k_proj",
401
+ "model.layers.45.self_attn.v_proj",
402
+ "model.layers.45.self_attn.o_proj",
403
+ "model.layers.45.self_attn.gate_proj",
404
+ "model.layers.45.mlp.router.gate",
405
+ "model.layers.46.self_attn.q_proj",
406
+ "model.layers.46.self_attn.k_proj",
407
+ "model.layers.46.self_attn.v_proj",
408
+ "model.layers.46.self_attn.o_proj",
409
+ "model.layers.46.self_attn.gate_proj",
410
+ "model.layers.46.mlp.router.gate",
411
+ "model.layers.47.self_attn.q_proj",
412
+ "model.layers.47.self_attn.k_proj",
413
+ "model.layers.47.self_attn.v_proj",
414
+ "model.layers.47.self_attn.o_proj",
415
+ "model.layers.47.self_attn.gate_proj",
416
+ "model.layers.47.mlp.router.gate",
417
+ "model.layers.48.self_attn.q_proj",
418
+ "model.layers.48.self_attn.k_proj",
419
+ "model.layers.48.self_attn.v_proj",
420
+ "model.layers.48.self_attn.o_proj",
421
+ "model.layers.48.self_attn.gate_proj",
422
+ "model.layers.48.mlp.router.gate",
423
+ "model.layers.49.self_attn.q_proj",
424
+ "model.layers.49.self_attn.k_proj",
425
+ "model.layers.49.self_attn.v_proj",
426
+ "model.layers.49.self_attn.o_proj",
427
+ "model.layers.49.self_attn.gate_proj",
428
+ "model.layers.49.mlp.router.gate",
429
+ "model.layers.50.self_attn.q_proj",
430
+ "model.layers.50.self_attn.k_proj",
431
+ "model.layers.50.self_attn.v_proj",
432
+ "model.layers.50.self_attn.o_proj",
433
+ "model.layers.50.self_attn.gate_proj",
434
+ "model.layers.50.mlp.router.gate",
435
+ "model.layers.51.self_attn.q_proj",
436
+ "model.layers.51.self_attn.k_proj",
437
+ "model.layers.51.self_attn.v_proj",
438
+ "model.layers.51.self_attn.o_proj",
439
+ "model.layers.51.self_attn.gate_proj",
440
+ "model.layers.51.mlp.router.gate",
441
+ "model.layers.52.self_attn.q_proj",
442
+ "model.layers.52.self_attn.k_proj",
443
+ "model.layers.52.self_attn.v_proj",
444
+ "model.layers.52.self_attn.o_proj",
445
+ "model.layers.52.self_attn.gate_proj",
446
+ "model.layers.52.mlp.router.gate",
447
+ "model.layers.53.self_attn.q_proj",
448
+ "model.layers.53.self_attn.k_proj",
449
+ "model.layers.53.self_attn.v_proj",
450
+ "model.layers.53.self_attn.o_proj",
451
+ "model.layers.53.self_attn.gate_proj",
452
+ "model.layers.53.mlp.router.gate",
453
+ "model.layers.54.self_attn.q_proj",
454
+ "model.layers.54.self_attn.k_proj",
455
+ "model.layers.54.self_attn.v_proj",
456
+ "model.layers.54.self_attn.o_proj",
457
+ "model.layers.54.self_attn.gate_proj",
458
+ "model.layers.54.mlp.router.gate",
459
+ "model.layers.55.self_attn.q_proj",
460
+ "model.layers.55.self_attn.k_proj",
461
+ "model.layers.55.self_attn.v_proj",
462
+ "model.layers.55.self_attn.o_proj",
463
+ "model.layers.55.self_attn.gate_proj",
464
+ "model.layers.55.mlp.router.gate",
465
+ "model.layers.56.self_attn.q_proj",
466
+ "model.layers.56.self_attn.k_proj",
467
+ "model.layers.56.self_attn.v_proj",
468
+ "model.layers.56.self_attn.o_proj",
469
+ "model.layers.56.self_attn.gate_proj",
470
+ "model.layers.56.mlp.router.gate",
471
+ "model.layers.57.self_attn.q_proj",
472
+ "model.layers.57.self_attn.k_proj",
473
+ "model.layers.57.self_attn.v_proj",
474
+ "model.layers.57.self_attn.o_proj",
475
+ "model.layers.57.self_attn.gate_proj",
476
+ "model.layers.57.mlp.router.gate",
477
+ "model.layers.58.self_attn.q_proj",
478
+ "model.layers.58.self_attn.k_proj",
479
+ "model.layers.58.self_attn.v_proj",
480
+ "model.layers.58.self_attn.o_proj",
481
+ "model.layers.58.self_attn.gate_proj",
482
+ "model.layers.58.mlp.router.gate",
483
+ "model.layers.59.self_attn.q_proj",
484
+ "model.layers.59.self_attn.k_proj",
485
+ "model.layers.59.self_attn.v_proj",
486
+ "model.layers.59.self_attn.o_proj",
487
+ "model.layers.59.self_attn.gate_proj",
488
+ "model.layers.59.mlp.router.gate",
489
+ "lm_head"
490
+ ],
491
+ "kv_cache_scheme": null,
492
+ "quant_method": "compressed-tensors",
493
+ "quantization_status": "compressed",
494
+ "sparsity_config": {},
495
+ "transform_config": {},
496
+ "version": "0.13.1.a20260116"
497
+ },
498
+ "rms_norm_eps": 1e-05,
499
+ "rope_scaling": null,
500
+ "rope_theta": 10000,
501
+ "route_norm": true,
502
+ "route_scale": 2.448,
503
+ "score_func": "sigmoid",
504
+ "sliding_window": 4096,
505
+ "tie_word_embeddings": false,
506
+ "topk_group": 1,
507
+ "transformers_version": "4.57.3",
508
+ "use_cache": true,
509
+ "use_grouped_mm": true,
510
+ "vocab_size": 200192
511
+ }
configuration_afmoe.py ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ from transformers.configuration_utils import PretrainedConfig
16
+ from transformers.modeling_rope_utils import rope_config_validation
17
+ from transformers.configuration_utils import layer_type_validation
18
+ from transformers.utils import logging
19
+
20
+ logger = logging.get_logger(__name__)
21
+
22
+ class AfmoeConfig(PretrainedConfig):
23
+ """
24
+ n_group (`int`, *optional*, defaults to 1):
25
+ Number of groups for routed experts.
26
+ topk_group (`int`, *optional*, defaults to 1):
27
+ Number of selected groups for each token(for each token, ensuring the selected experts is only within `topk_group` groups).
28
+ """
29
+ model_type = "afmoe"
30
+ base_model_pp_plan = {
31
+ "embed_tokens": (["input_ids"], ["inputs_embeds"]),
32
+ "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
33
+ "norm": (["hidden_states"], ["hidden_states"]),
34
+ }
35
+
36
+ def __init__(
37
+ self,
38
+ num_hidden_layers: int = 32,
39
+ vocab_size: int = 200192,
40
+ hidden_size: int = 2048,
41
+ intermediate_size: int = 6144,
42
+ moe_intermediate_size=1408,
43
+ num_dense_layers=1,
44
+ num_attention_heads=16,
45
+ num_key_value_heads=None,
46
+ head_dim=128,
47
+ hidden_act="silu",
48
+ max_position_embeddings=16384,
49
+ initializer_range=0.02,
50
+ rms_norm_eps=1e-5,
51
+ use_cache=True,
52
+ tie_word_embeddings=False,
53
+ rope_theta=10000.0,
54
+ rope_scaling=None,
55
+ num_experts=64,
56
+ num_experts_per_tok=6,
57
+ num_shared_experts=2,
58
+ num_expert_groups=1,
59
+ num_limited_groups=1,
60
+ score_func="sigmoid",
61
+ route_norm=True,
62
+ route_scale=1.0,
63
+ global_attn_every_n_layers=4,
64
+ sliding_window=1024,
65
+ mup_enabled=False,
66
+ layer_types=None,
67
+ attention_dropout: float = 0.0,
68
+ n_group: int = 1,
69
+ topk_group: int = 1,
70
+ **kwargs,
71
+ ):
72
+ self.vocab_size = vocab_size
73
+ self.max_position_embeddings = max_position_embeddings
74
+ self.hidden_size = hidden_size
75
+ self.intermediate_size = intermediate_size
76
+ self.num_hidden_layers = num_hidden_layers
77
+ self.num_dense_layers = num_dense_layers
78
+ self.num_attention_heads = num_attention_heads
79
+ self.head_dim = head_dim
80
+ self.hidden_act = hidden_act
81
+ self.initializer_range = initializer_range
82
+ self.rms_norm_eps = rms_norm_eps
83
+ self.use_cache = use_cache
84
+ self.rope_theta = rope_theta
85
+ self.rope_scaling = rope_scaling
86
+
87
+
88
+ # MoE specific
89
+ self.moe_intermediate_size = moe_intermediate_size
90
+ self.num_experts_per_tok = num_experts_per_tok
91
+ self.n_group = n_group
92
+ self.topk_group = topk_group
93
+ self.num_experts = num_experts
94
+ self.num_shared_experts = num_shared_experts
95
+ self.num_expert_groups = num_expert_groups
96
+ self.num_limited_groups = num_limited_groups
97
+ self.score_func = score_func
98
+ self.route_norm = route_norm
99
+ self.route_scale = route_scale
100
+
101
+
102
+ # Attention specific
103
+ self.attention_dropout = attention_dropout
104
+ self.global_attn_every_n_layers = global_attn_every_n_layers
105
+ self.sliding_window = sliding_window
106
+ self.layer_types = layer_types
107
+ if self.layer_types is None:
108
+ self.layer_types = [
109
+ "sliding_attention" if bool((i + 1) % global_attn_every_n_layers) else "full_attention" for i in range(self.num_hidden_layers)
110
+ ]
111
+ layer_type_validation(self.layer_types)
112
+
113
+ # muP specific
114
+ self.mup_enabled = mup_enabled
115
+
116
+ if num_key_value_heads is None:
117
+ num_key_value_heads = num_attention_heads
118
+
119
+ self.num_key_value_heads = num_key_value_heads
120
+
121
+
122
+ # Validate rope configs
123
+ if self.rope_scaling is not None and "type" in self.rope_scaling:
124
+ self.rope_scaling["rope_type"] = self.rope_scaling["type"]
125
+ rope_config_validation(self)
126
+
127
+ super().__init__(
128
+ tie_word_embeddings=tie_word_embeddings,
129
+ **kwargs,
130
+ )
131
+
132
+
133
+ __all__ = ["AfmoeConfig"]
generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 0,
4
+ "eos_token_id": 3,
5
+ "pad_token_id": 12,
6
+ "transformers_version": "4.57.3"
7
+ }
model-00001-of-00081.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8dbbf27913667dd5daa8f3ab2d6caf94cc06a85f68b62aaed6e4b594ba5d9e87
3
+ size 4992710176
model-00002-of-00081.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8d40b6c07723fdcbde16cf0c078998965adcdf8d7c3e460f3266c72c4e1ddff3
3
+ size 4995652424
model-00003-of-00081.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:67881e1a723b4e32338a1919f82b4468e54615e58aa39dad33c7686053474b47
3
+ size 4990871376
model-00004-of-00081.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6c89b03fb780f45dc4d32f81d91cea59fad950203c5a34e2cb68e4120c3ad4e7
3
+ size 4990871488
model-00005-of-00081.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:09f042de3d82a8efa37228ca8a588f823de94b648e4af2014db6effca9ad86fa
3
+ size 4970435832
model-00006-of-00081.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:96acbf691df5b6c6309785377fe9cd1a98f56128418c3aa6fafd9147f815322d
3
+ size 4997200848
model-00007-of-00081.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b188aad182c1c66ab422a4bf0ce55ea959797645a73f6ec8edac746bacd5391b
3
+ size 4990873000
model-00008-of-00081.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dd324bc2598cef90e0ee2db3be91300af12c8fea9addd83d75c8d9222ca36e0c
3
+ size 4990872984
model-00009-of-00081.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fb52d6d4dd2981214ddcd9fa2682cb88f148c80c1b2aa57910a52b70dd3f7bf1
3
+ size 4995652984
model-00010-of-00081.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d3a848fb367a760cc13b41ed5d52c11b090a8199f6188934cc176e058f0c5877
3
+ size 4990872464
model-00011-of-00081.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:08e4965fda9298555fa0c825eea19715575747ad57f7701f8d6dba8233839a15
3
+ size 4990872944
model-00012-of-00081.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0135513fcb2f1e4a10c85cae23b4e644dc6bb11ab5f4423e7a06e6989051a4c2
3
+ size 4995653056
model-00013-of-00081.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:63a86d89577344eb765450a27dad75005bad3eafb4f07ccd7f56318eaa54b4bb
3
+ size 4990872432
model-00014-of-00081.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:40f347e9fec7fb67e2c2cf737780e28384072eb7f74419e82e46cf6ae98ddc6a
3
+ size 4990872936
model-00015-of-00081.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5624091fe7f150abd2e3e7084e54d1db84b8fa951d2a74f52ee4648784b92808
3
+ size 4995653096
model-00016-of-00081.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2c854b5c666838192a26ba8227575661ec3504bb9379bc365e0709609f027f15
3
+ size 4990872424
model-00017-of-00081.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e96deaafe66fba4ea1d3910d7bab515b9d98b98595a23f995343a9a4de1dc28c
3
+ size 4990872904
model-00018-of-00081.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:35e7fea8257dc420395bc408f9cdb88ab788badeebfb9dec3178a0d5ad875c70
3
+ size 4995653136
model-00019-of-00081.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4df68b1cc5658a73cad254a05bc49c1ee4bf7fbe15b2d51794b5faf3444820ca
3
+ size 4990872424
model-00020-of-00081.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:afde029d6331842ca7eb9c0a2223ad9e390b1a035f875c3705d5516bc1b56f0e
3
+ size 4990872872
model-00021-of-00081.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5083a325e9a6f68769da076016c8e70b3968ee2792a45e9375542dddde736c69
3
+ size 4995653168
model-00022-of-00081.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4c86cff8a0ef1f39c6b9673660b1a4b53463b49f411c6545a74c966813cb9ed3
3
+ size 4990872424
model-00023-of-00081.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:df878f4956784ad1869eef6cd01c9ae297182110bab4a6029fda7552e204caf2
3
+ size 4990872840
model-00024-of-00081.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f9ad486d4fbf0002b4c302bdf73311ddbaef53c89e20cd3edd14d561f3576614
3
+ size 4995653200
model-00025-of-00081.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c96b35be297a8334a61706101de1b58f7fc169710987eb274ea53b5541c67a8f
3
+ size 4990872424
model-00026-of-00081.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:171808ce9c704536726581dd72dc98af7577db297409b916f5f4208b27390fe3
3
+ size 4990872808
model-00027-of-00081.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:70c0ec59c6e718d097caba0b828fd4f0de7f616fb39ab59649c16f165966ecf3
3
+ size 4995653240
model-00028-of-00081.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d688e415e3d8a840d4e047fdd47bce7ad6f65c67f11852095d77baa7959ad5c3
3
+ size 4990872424
model-00029-of-00081.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:72d82006184cd7e045a48c0ea348abf22f52f3abae16e154bc9cfdc332965683
3
+ size 4990872768
model-00030-of-00081.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d894424ddae10413b5e6dadbcbe25f44ab3a674e79a42d3522794daee7ce20b1
3
+ size 4995653272
model-00031-of-00081.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0408c2fb5b843d1c96e63d1497c97cb59bf89732cb5ad4c014806d42621885b2
3
+ size 4990872424
model-00032-of-00081.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0527a783f733553683df36c33c3100eb8539067773797bc2aed9dfa8f0e06bfb
3
+ size 4990872744
model-00033-of-00081.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6b1e190416b58a5a9a5b66d2abc9f8b7b0b177b4b103e9643fcc51522119210a
3
+ size 4995653304
model-00034-of-00081.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:81ffbd498ad533439be3ece295de1fb3a2ca8282df6b22a5abac392e58ca9163
3
+ size 4990872424
model-00035-of-00081.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:451710d6308b07538279db1b73c9fc00ef5b4b69fcc3beb2f88c2cfccf691d3c
3
+ size 4990872704
model-00036-of-00081.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:08115015dd88aeb65ae404b4f3ecbfc664d753dd314993e75246826e999f84f0
3
+ size 4995653336
model-00037-of-00081.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f73ecc3c239deaf43edb4ca56950dcd412869fac73ab0111bd4ae26bb2e8cb51
3
+ size 4990872424
model-00038-of-00081.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0b7efd3ba1c2769b992823db255c1fd98fd32bb7e2f65c142cadc7da97923849
3
+ size 4990872672
model-00039-of-00081.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d63c968f35f38009401c4aa07e10077667be53dd9aef608b590b91ff2486bce7
3
+ size 4995653376
model-00040-of-00081.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f2081924946e17d3e484859e3aef1d042ad30e7f9aa037888573af6d36b22c5d
3
+ size 4990872424
model-00041-of-00081.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:09e61bd832509e0a22b88cfcaa713f6170a83d363201464b483e149e1969a86c
3
+ size 4990872640
model-00042-of-00081.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:201c1564640e56c8847d3a8bd89cd06d6e521dbb8757769177b2494832e308ab
3
+ size 4995653400
model-00043-of-00081.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:39b530176c8a49c149f20902e4a460c1a9322ab6bbc728eacae626711e5b5252
3
+ size 4990872424
model-00044-of-00081.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:62c0794a387f4423576ca9cd7141d72bd89eff1e341765d6f15bcf056de3a5a0
3
+ size 4990872600