Commit ·
6c33626
0
Parent(s):
Super-squash branch 'main' using huggingface_hub
Browse filesThis view is limited to 50 files because it contains too many changes. See raw diff
- .gitattributes +36 -0
- README.md +142 -0
- __init__.py +4 -0
- chat_template.jinja +159 -0
- config.json +515 -0
- configuration_afmoe.py +133 -0
- generation_config.json +10 -0
- model-00001-of-00081.safetensors +3 -0
- model-00002-of-00081.safetensors +3 -0
- model-00003-of-00081.safetensors +3 -0
- model-00004-of-00081.safetensors +3 -0
- model-00005-of-00081.safetensors +3 -0
- model-00006-of-00081.safetensors +3 -0
- model-00007-of-00081.safetensors +3 -0
- model-00008-of-00081.safetensors +3 -0
- model-00009-of-00081.safetensors +3 -0
- model-00010-of-00081.safetensors +3 -0
- model-00011-of-00081.safetensors +3 -0
- model-00012-of-00081.safetensors +3 -0
- model-00013-of-00081.safetensors +3 -0
- model-00014-of-00081.safetensors +3 -0
- model-00015-of-00081.safetensors +3 -0
- model-00016-of-00081.safetensors +3 -0
- model-00017-of-00081.safetensors +3 -0
- model-00018-of-00081.safetensors +3 -0
- model-00019-of-00081.safetensors +3 -0
- model-00020-of-00081.safetensors +3 -0
- model-00021-of-00081.safetensors +3 -0
- model-00022-of-00081.safetensors +3 -0
- model-00023-of-00081.safetensors +3 -0
- model-00024-of-00081.safetensors +3 -0
- model-00025-of-00081.safetensors +3 -0
- model-00026-of-00081.safetensors +3 -0
- model-00027-of-00081.safetensors +3 -0
- model-00028-of-00081.safetensors +3 -0
- model-00029-of-00081.safetensors +3 -0
- model-00030-of-00081.safetensors +3 -0
- model-00031-of-00081.safetensors +3 -0
- model-00032-of-00081.safetensors +3 -0
- model-00033-of-00081.safetensors +3 -0
- model-00034-of-00081.safetensors +3 -0
- model-00035-of-00081.safetensors +3 -0
- model-00036-of-00081.safetensors +3 -0
- model-00037-of-00081.safetensors +3 -0
- model-00038-of-00081.safetensors +3 -0
- model-00039-of-00081.safetensors +3 -0
- model-00040-of-00081.safetensors +3 -0
- model-00041-of-00081.safetensors +3 -0
- model-00042-of-00081.safetensors +3 -0
- model-00043-of-00081.safetensors +3 -0
.gitattributes
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
README.md
ADDED
|
@@ -0,0 +1,142 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
license: apache-2.0
|
| 3 |
+
language:
|
| 4 |
+
- en
|
| 5 |
+
- es
|
| 6 |
+
- fr
|
| 7 |
+
- de
|
| 8 |
+
- it
|
| 9 |
+
- pt
|
| 10 |
+
- ru
|
| 11 |
+
- ar
|
| 12 |
+
- hi
|
| 13 |
+
- ko
|
| 14 |
+
- zh
|
| 15 |
+
library_name: transformers
|
| 16 |
+
base_model:
|
| 17 |
+
- arcee-ai/Trinity-Large-Thinking
|
| 18 |
+
base_model_relation: quantized
|
| 19 |
+
tags:
|
| 20 |
+
- reasoning
|
| 21 |
+
- agentic
|
| 22 |
+
- tool-calling
|
| 23 |
+
- thinking
|
| 24 |
+
---
|
| 25 |
+
<!-- markdownlint-disable first-line-h1 -->
|
| 26 |
+
<!-- markdownlint-disable html -->
|
| 27 |
+
<!-- markdownlint-disable no-duplicate-header -->
|
| 28 |
+
|
| 29 |
+
<div align="center">
|
| 30 |
+
<picture>
|
| 31 |
+
<img
|
| 32 |
+
src="https://cdn-uploads.huggingface.co/production/uploads/6435718aaaef013d1aec3b8b/i-v1KyAMOW_mgVGeic9WJ.png"
|
| 33 |
+
alt="Arcee Trinity Large Thinking"
|
| 34 |
+
style="max-width: 100%; height: auto;"
|
| 35 |
+
>
|
| 36 |
+
</picture>
|
| 37 |
+
</div>
|
| 38 |
+
<hr>
|
| 39 |
+
|
| 40 |
+
# Trinity-Large-Thinking-FP8-Block
|
| 41 |
+
|
| 42 |
+
## Introduction
|
| 43 |
+
|
| 44 |
+
Trinity-Large-Thinking is a reasoning-optimized variant of Arcee AI's Trinity-Large family — a 398B-parameter sparse Mixture-of-Experts (MoE) model with approximately 13B active parameters per token, post-trained with extended chain-of-thought reasoning and agentic RL.
|
| 45 |
+
|
| 46 |
+
**This repository contains the FP8 block-quantized weights of Trinity-Large-Thinking (FP8 weights and activations with per-block scaling).**
|
| 47 |
+
|
| 48 |
+
For full model details, benchmarks, and usage guidance, see the main [Trinity-Large-Thinking](https://huggingface.co/arcee-ai/Trinity-Large-Thinking) model card.
|
| 49 |
+
|
| 50 |
+
## Quantization Details
|
| 51 |
+
|
| 52 |
+
- **Scheme:** `FP8 Block` (FP8 weights and activations, per-block scaling with E8M0 scale format)
|
| 53 |
+
- **Format:** `compressed-tensors`
|
| 54 |
+
- **Intended use:** High-throughput FP8 deployment with near-lossless quality, optimized for NVIDIA Hopper/Blackwell GPUs
|
| 55 |
+
- **Supported backends:** [DeepGEMM](https://github.com/deepseek-ai/DeepGEMM), vLLM CUTLASS, Triton
|
| 56 |
+
|
| 57 |
+
## Usage
|
| 58 |
+
|
| 59 |
+
### Inference tested on
|
| 60 |
+
|
| 61 |
+
- 8x NVIDIA H100 80GB (tensor parallel = 8)
|
| 62 |
+
- vLLM 0.18.0+
|
| 63 |
+
|
| 64 |
+
### vLLM
|
| 65 |
+
|
| 66 |
+
Supported in vLLM 0.18.0+ with DeepGEMM FP8 MoE acceleration.
|
| 67 |
+
|
| 68 |
+
```bash
|
| 69 |
+
pip install "vllm>=0.18.0"
|
| 70 |
+
```
|
| 71 |
+
|
| 72 |
+
Serving with DeepGEMM enabled (recommended):
|
| 73 |
+
|
| 74 |
+
```bash
|
| 75 |
+
VLLM_USE_DEEP_GEMM=1 vllm serve arcee-ai/Trinity-Large-Thinking-FP8-Block \
|
| 76 |
+
--trust-remote-code \
|
| 77 |
+
--tensor-parallel-size 8 \
|
| 78 |
+
--enable-reasoning \
|
| 79 |
+
--reasoning-parser deepseek_r1 \
|
| 80 |
+
--enable-auto-tool-choice \
|
| 81 |
+
--tool-call-parser qwen3_coder
|
| 82 |
+
```
|
| 83 |
+
|
| 84 |
+
Without DeepGEMM (falls back to CUTLASS/Triton):
|
| 85 |
+
|
| 86 |
+
```bash
|
| 87 |
+
vllm serve arcee-ai/Trinity-Large-Thinking-FP8-Block \
|
| 88 |
+
--trust-remote-code \
|
| 89 |
+
--tensor-parallel-size 8 \
|
| 90 |
+
--enable-reasoning \
|
| 91 |
+
--reasoning-parser deepseek_r1 \
|
| 92 |
+
--enable-auto-tool-choice \
|
| 93 |
+
--tool-call-parser qwen3_coder
|
| 94 |
+
```
|
| 95 |
+
|
| 96 |
+
### Transformers
|
| 97 |
+
|
| 98 |
+
```python
|
| 99 |
+
from transformers import AutoTokenizer, AutoModelForCausalLM
|
| 100 |
+
|
| 101 |
+
model_id = "arcee-ai/Trinity-Large-Thinking-FP8-Block"
|
| 102 |
+
tokenizer = AutoTokenizer.from_pretrained(model_id)
|
| 103 |
+
model = AutoModelForCausalLM.from_pretrained(
|
| 104 |
+
model_id,
|
| 105 |
+
device_map="auto",
|
| 106 |
+
trust_remote_code=True
|
| 107 |
+
)
|
| 108 |
+
|
| 109 |
+
messages = [{"role": "user", "content": "Who are you?"}]
|
| 110 |
+
|
| 111 |
+
input_ids = tokenizer.apply_chat_template(
|
| 112 |
+
messages, add_generation_prompt=True, return_tensors="pt"
|
| 113 |
+
).to(model.device)
|
| 114 |
+
|
| 115 |
+
outputs = model.generate(input_ids, max_new_tokens=4096, do_sample=True, temperature=0.6, top_k=50, top_p=0.95)
|
| 116 |
+
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
|
| 117 |
+
```
|
| 118 |
+
|
| 119 |
+
### API
|
| 120 |
+
|
| 121 |
+
Works out of the box on [OpenRouter](https://openrouter.ai/) as `arcee-ai/trinity-large-thinking`.
|
| 122 |
+
|
| 123 |
+
## License
|
| 124 |
+
|
| 125 |
+
Trinity-Large-Thinking-FP8-Block is released under the Apache License, Version 2.0.
|
| 126 |
+
|
| 127 |
+
## Citation
|
| 128 |
+
|
| 129 |
+
If you use this model, please cite:
|
| 130 |
+
|
| 131 |
+
```bibtex
|
| 132 |
+
@misc{singh2026arceetrinity,
|
| 133 |
+
title = {Arcee Trinity Large Technical Report},
|
| 134 |
+
author = {Varun Singh and Lucas Krauss and Sami Jaghouar and Matej Sirovatka and Charles Goddard and Fares Obied and Jack Min Ong and Jannik Straube and Fern and Aria Harley and Conner Stewart and Colin Kealty and Maziyar Panahi and Simon Kirsten and Anushka Deshpande and Anneketh Vij and Arthur Bresnu and Pranav Veldurthi and Raghav Ravishankar and Hardik Bishnoi and DatologyAI Team and Arcee AI Team and Prime Intellect Team and Mark McQuade and Johannes Hagemann and Lucas Atkins},
|
| 135 |
+
year = {2026},
|
| 136 |
+
eprint = {2602.17004},
|
| 137 |
+
archivePrefix= {arXiv},
|
| 138 |
+
primaryClass = {cs.LG},
|
| 139 |
+
doi = {10.48550/arXiv.2602.17004},
|
| 140 |
+
url = {https://arxiv.org/abs/2602.17004}
|
| 141 |
+
}
|
| 142 |
+
```
|
__init__.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .configuration_afmoe import AfmoeConfig
|
| 2 |
+
from .modeling_afmoe import AfmoeForCausalLM
|
| 3 |
+
|
| 4 |
+
__all__ = ["AfmoeConfig", "AfmoeForCausalLM"]
|
chat_template.jinja
ADDED
|
@@ -0,0 +1,159 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<|begin_of_text|>{%- macro render_extra_keys(json_dict, handled_keys) -%}
|
| 2 |
+
{%- if json_dict is mapping %}
|
| 3 |
+
{%- for json_key in json_dict if json_key not in handled_keys %}
|
| 4 |
+
{%- if json_dict[json_key] is mapping or (json_dict[json_key] is sequence and json_dict[json_key] is not string) %}
|
| 5 |
+
{{- '\n<' ~ json_key ~ '>' ~ (json_dict[json_key] | tojson | safe) ~ '</' ~ json_key ~ '>' }}
|
| 6 |
+
{%- else %}
|
| 7 |
+
{{- '\n<' ~ json_key ~ '>' ~ (json_dict[json_key] | string) ~ '</' ~ json_key ~ '>' }}
|
| 8 |
+
{%- endif %}
|
| 9 |
+
{%- endfor %}
|
| 10 |
+
{%- endif %}
|
| 11 |
+
{%- endmacro -%}
|
| 12 |
+
|
| 13 |
+
{%- macro render_tool_call(raw_tool_call) -%}
|
| 14 |
+
{%- if raw_tool_call.function is defined and raw_tool_call.function is mapping %}
|
| 15 |
+
{%- set tool_call = raw_tool_call.function %}
|
| 16 |
+
{%- else %}
|
| 17 |
+
{%- set tool_call = raw_tool_call %}
|
| 18 |
+
{%- endif %}
|
| 19 |
+
{{- '<tool_call>\n<function=' + (tool_call.name | default('') | string) + '>\n' }}
|
| 20 |
+
{%- if tool_call.arguments is defined and tool_call.arguments is mapping %}
|
| 21 |
+
{%- for args_name, args_value in tool_call.arguments.items() %}
|
| 22 |
+
{{- '<parameter=' + (args_name | string) + '>\n' }}
|
| 23 |
+
{%- if args_value is mapping or (args_value is sequence and args_value is not string) %}
|
| 24 |
+
{{- args_value | tojson | safe }}
|
| 25 |
+
{%- else %}
|
| 26 |
+
{{- args_value | string }}
|
| 27 |
+
{%- endif %}
|
| 28 |
+
{{- '\n</parameter>\n' }}
|
| 29 |
+
{%- endfor %}
|
| 30 |
+
{%- endif %}
|
| 31 |
+
{{- '</function>\n</tool_call>' }}
|
| 32 |
+
{%- endmacro -%}
|
| 33 |
+
|
| 34 |
+
{%- set system_message = none %}
|
| 35 |
+
{%- if messages and messages[0]["role"] == "system" %}
|
| 36 |
+
{%- set system_message = messages[0]["content"] %}
|
| 37 |
+
{%- set loop_messages = messages[1:] %}
|
| 38 |
+
{%- else %}
|
| 39 |
+
{%- set loop_messages = messages %}
|
| 40 |
+
{%- endif %}
|
| 41 |
+
|
| 42 |
+
{%- if not tools is defined %}
|
| 43 |
+
{%- set tools = [] %}
|
| 44 |
+
{%- endif %}
|
| 45 |
+
{%- set has_tools = tools is iterable and tools is not string and tools | length > 0 %}
|
| 46 |
+
|
| 47 |
+
{%- if system_message is not none or has_tools %}
|
| 48 |
+
{{- '<|im_start|>system\n' }}
|
| 49 |
+
{%- if system_message is not none %}
|
| 50 |
+
{{- system_message }}
|
| 51 |
+
{%- else %}
|
| 52 |
+
{{- "You are Trinity Large, a helpful assistant developed by Arcee AI, that can interact with a computer to solve tasks." }}
|
| 53 |
+
{%- endif %}
|
| 54 |
+
{%- if has_tools %}
|
| 55 |
+
{{- "\n\n# Tools\n\nYou have access to the following functions:\n\n<tools>" }}
|
| 56 |
+
{%- for tool in tools %}
|
| 57 |
+
{%- if tool.function is defined and tool.function is mapping %}
|
| 58 |
+
{%- set tool = tool.function %}
|
| 59 |
+
{%- endif %}
|
| 60 |
+
{{- '\n<function>\n<name>' ~ (tool.name | default('') | string) ~ '</name>' }}
|
| 61 |
+
{%- if tool.description is defined and tool.description is not none %}
|
| 62 |
+
{{- '\n<description>' ~ (tool.description | string | trim) ~ '</description>' }}
|
| 63 |
+
{%- endif %}
|
| 64 |
+
{{- '\n<parameters>' }}
|
| 65 |
+
{%- if tool.parameters is defined and tool.parameters is mapping and tool.parameters.properties is defined and tool.parameters.properties is mapping %}
|
| 66 |
+
{%- for param_name, param_fields in tool.parameters.properties.items() %}
|
| 67 |
+
{{- '\n<parameter>\n<name>' ~ (param_name | string) ~ '</name>' }}
|
| 68 |
+
{%- if param_fields is mapping and param_fields.type is defined and param_fields.type is not none %}
|
| 69 |
+
{{- '\n<type>' ~ (param_fields.type | string) ~ '</type>' }}
|
| 70 |
+
{%- endif %}
|
| 71 |
+
{%- if param_fields is mapping and param_fields.description is defined and param_fields.description is not none %}
|
| 72 |
+
{{- '\n<description>' ~ (param_fields.description | string | trim) ~ '</description>' }}
|
| 73 |
+
{%- endif %}
|
| 74 |
+
{%- if param_fields is mapping %}
|
| 75 |
+
{%- set handled_keys = ['name', 'type', 'description'] %}
|
| 76 |
+
{{- render_extra_keys(param_fields, handled_keys) }}
|
| 77 |
+
{%- endif %}
|
| 78 |
+
{{- '\n</parameter>' }}
|
| 79 |
+
{%- endfor %}
|
| 80 |
+
{%- endif %}
|
| 81 |
+
{%- if tool.parameters is defined %}
|
| 82 |
+
{%- set handled_keys = ['type', 'properties'] %}
|
| 83 |
+
{{- render_extra_keys(tool.parameters, handled_keys) }}
|
| 84 |
+
{%- endif %}
|
| 85 |
+
{{- '\n</parameters>' }}
|
| 86 |
+
{%- set handled_keys = ['type', 'name', 'description', 'parameters'] %}
|
| 87 |
+
{{- render_extra_keys(tool, handled_keys) }}
|
| 88 |
+
{{- '\n</function>' }}
|
| 89 |
+
{%- endfor %}
|
| 90 |
+
{{- "\n</tools>" }}
|
| 91 |
+
{{- '\n\nIf you choose to call a function ONLY reply in the following format with NO suffix:\n\n<tool_call>\n<function=example_function_name>\n<parameter=example_parameter_1>\nvalue_1\n</parameter>\n<parameter=example_parameter_2>\nThis is the value for the second parameter\nthat can span\nmultiple lines\n</parameter>\n</function>\n</tool_call>\n\n<IMPORTANT>\nReminder:\n- Function calls MUST follow the specified format: an inner <function=...></function> block must be nested within <tool_call></tool_call> XML tags\n- Required parameters MUST be specified\n- You may provide optional reasoning for your function call in natural language BEFORE the function call, but NOT after\n- If there is no function call available, answer the question like normal with your current knowledge and do not tell the user about function calls\n</IMPORTANT>' }}
|
| 92 |
+
{%- endif %}
|
| 93 |
+
{{- '<|im_end|>\n' }}
|
| 94 |
+
{%- endif %}
|
| 95 |
+
|
| 96 |
+
{%- for message in loop_messages %}
|
| 97 |
+
{%- set role = message.role | default('') %}
|
| 98 |
+
{%- if role == "assistant" %}
|
| 99 |
+
{%- set content_str = '' if message.content is none else (message.content | string) %}
|
| 100 |
+
{%- set trimmed_content = content_str | trim %}
|
| 101 |
+
|
| 102 |
+
{%- set has_reasoning_content = message.reasoning_content is defined %}
|
| 103 |
+
{%- set has_reasoning = has_reasoning_content or (message.reasoning is defined) %}
|
| 104 |
+
|
| 105 |
+
{%- if has_reasoning_content %}
|
| 106 |
+
{%- set reasoning_value = message.reasoning_content %}
|
| 107 |
+
{%- elif message.reasoning is defined %}
|
| 108 |
+
{%- set reasoning_value = message.reasoning %}
|
| 109 |
+
{%- else %}
|
| 110 |
+
{%- set reasoning_value = none %}
|
| 111 |
+
{%- endif %}
|
| 112 |
+
|
| 113 |
+
{%- set has_tool_calls = message.tool_calls is defined and message.tool_calls is iterable and message.tool_calls is not string and message.tool_calls | length > 0 %}
|
| 114 |
+
|
| 115 |
+
{{- '<|im_start|>assistant\n' }}
|
| 116 |
+
{%- if has_reasoning %}
|
| 117 |
+
{%- if reasoning_value %}
|
| 118 |
+
{{- '<think>' + (reasoning_value | string | trim) + '</think>' }}
|
| 119 |
+
{%- else %}
|
| 120 |
+
{{- '<think></think>' }}
|
| 121 |
+
{%- endif %}
|
| 122 |
+
{%- if trimmed_content %}
|
| 123 |
+
{{- '\n' + trimmed_content }}
|
| 124 |
+
{%- endif %}
|
| 125 |
+
{%- elif has_tool_calls %}
|
| 126 |
+
{%- if trimmed_content %}
|
| 127 |
+
{{- trimmed_content }}
|
| 128 |
+
{%- endif %}
|
| 129 |
+
{%- else %}
|
| 130 |
+
{{- content_str }}
|
| 131 |
+
{%- endif %}
|
| 132 |
+
|
| 133 |
+
{%- if has_tool_calls %}
|
| 134 |
+
{%- for tool_call in message.tool_calls %}
|
| 135 |
+
{%- set separator = '\n' if ((loop.first and (has_reasoning or trimmed_content)) or (not loop.first)) else '' -%}
|
| 136 |
+
{{- separator + render_tool_call(tool_call) }}
|
| 137 |
+
{%- endfor %}
|
| 138 |
+
{%- endif %}
|
| 139 |
+
{{- '<|im_end|>\n' }}
|
| 140 |
+
{%- elif role == "tool" or role == "observation" or role == "function" %}
|
| 141 |
+
{%- if loop.first or loop.previtem.role not in ["tool", "observation", "function"] %}
|
| 142 |
+
{{- '<|im_start|>user\n' }}
|
| 143 |
+
{%- endif %}
|
| 144 |
+
{{- '<tool_response>\n' }}
|
| 145 |
+
{{- '' if message.content is none else (message.content | string) }}
|
| 146 |
+
{{- '\n</tool_response>\n' }}
|
| 147 |
+
{%- if loop.last or loop.nextitem.role not in ["tool", "observation", "function"] %}
|
| 148 |
+
{{- '<|im_end|>\n' }}
|
| 149 |
+
{%- endif %}
|
| 150 |
+
{%- else %}
|
| 151 |
+
{{- '<|im_start|>' + (role | string) }}
|
| 152 |
+
{{- '\n' + ('' if message.content is none else (message.content | string)) }}
|
| 153 |
+
{{- '<|im_end|>\n' }}
|
| 154 |
+
{%- endif %}
|
| 155 |
+
{%- endfor %}
|
| 156 |
+
|
| 157 |
+
{%- if add_generation_prompt %}
|
| 158 |
+
{{- '<|im_start|>assistant\n<think>' }}
|
| 159 |
+
{%- endif %}
|
config.json
ADDED
|
@@ -0,0 +1,515 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"architectures": [
|
| 3 |
+
"AfmoeForCausalLM"
|
| 4 |
+
],
|
| 5 |
+
"attention_dropout": 0.0,
|
| 6 |
+
"auto_map": {
|
| 7 |
+
"AutoConfig": "configuration_afmoe.AfmoeConfig",
|
| 8 |
+
"AutoModel": "modeling_afmoe.AfmoeModel",
|
| 9 |
+
"AutoModelForCausalLM": "modeling_afmoe.AfmoeForCausalLM"
|
| 10 |
+
},
|
| 11 |
+
"dtype": "bfloat16",
|
| 12 |
+
"global_attn_every_n_layers": 4,
|
| 13 |
+
"head_dim": 128,
|
| 14 |
+
"hidden_act": "silu",
|
| 15 |
+
"hidden_size": 3072,
|
| 16 |
+
"initializer_range": 0.02,
|
| 17 |
+
"intermediate_size": 12288,
|
| 18 |
+
"layer_types": [
|
| 19 |
+
"sliding_attention",
|
| 20 |
+
"sliding_attention",
|
| 21 |
+
"sliding_attention",
|
| 22 |
+
"full_attention",
|
| 23 |
+
"sliding_attention",
|
| 24 |
+
"sliding_attention",
|
| 25 |
+
"sliding_attention",
|
| 26 |
+
"full_attention",
|
| 27 |
+
"sliding_attention",
|
| 28 |
+
"sliding_attention",
|
| 29 |
+
"sliding_attention",
|
| 30 |
+
"full_attention",
|
| 31 |
+
"sliding_attention",
|
| 32 |
+
"sliding_attention",
|
| 33 |
+
"sliding_attention",
|
| 34 |
+
"full_attention",
|
| 35 |
+
"sliding_attention",
|
| 36 |
+
"sliding_attention",
|
| 37 |
+
"sliding_attention",
|
| 38 |
+
"full_attention",
|
| 39 |
+
"sliding_attention",
|
| 40 |
+
"sliding_attention",
|
| 41 |
+
"sliding_attention",
|
| 42 |
+
"full_attention",
|
| 43 |
+
"sliding_attention",
|
| 44 |
+
"sliding_attention",
|
| 45 |
+
"sliding_attention",
|
| 46 |
+
"full_attention",
|
| 47 |
+
"sliding_attention",
|
| 48 |
+
"sliding_attention",
|
| 49 |
+
"sliding_attention",
|
| 50 |
+
"full_attention",
|
| 51 |
+
"sliding_attention",
|
| 52 |
+
"sliding_attention",
|
| 53 |
+
"sliding_attention",
|
| 54 |
+
"full_attention",
|
| 55 |
+
"sliding_attention",
|
| 56 |
+
"sliding_attention",
|
| 57 |
+
"sliding_attention",
|
| 58 |
+
"full_attention",
|
| 59 |
+
"sliding_attention",
|
| 60 |
+
"sliding_attention",
|
| 61 |
+
"sliding_attention",
|
| 62 |
+
"full_attention",
|
| 63 |
+
"sliding_attention",
|
| 64 |
+
"sliding_attention",
|
| 65 |
+
"sliding_attention",
|
| 66 |
+
"full_attention",
|
| 67 |
+
"sliding_attention",
|
| 68 |
+
"sliding_attention",
|
| 69 |
+
"sliding_attention",
|
| 70 |
+
"full_attention",
|
| 71 |
+
"sliding_attention",
|
| 72 |
+
"sliding_attention",
|
| 73 |
+
"sliding_attention",
|
| 74 |
+
"full_attention",
|
| 75 |
+
"sliding_attention",
|
| 76 |
+
"sliding_attention",
|
| 77 |
+
"sliding_attention",
|
| 78 |
+
"full_attention"
|
| 79 |
+
],
|
| 80 |
+
"load_balance_coeff": 5e-05,
|
| 81 |
+
"max_position_embeddings": 262144,
|
| 82 |
+
"model_type": "afmoe",
|
| 83 |
+
"moe_intermediate_size": 3072,
|
| 84 |
+
"mup_enabled": true,
|
| 85 |
+
"n_group": 1,
|
| 86 |
+
"num_attention_heads": 48,
|
| 87 |
+
"num_dense_layers": 6,
|
| 88 |
+
"num_expert_groups": 1,
|
| 89 |
+
"num_experts": 256,
|
| 90 |
+
"num_experts_per_tok": 4,
|
| 91 |
+
"num_hidden_layers": 60,
|
| 92 |
+
"num_key_value_heads": 8,
|
| 93 |
+
"num_limited_groups": 1,
|
| 94 |
+
"num_shared_experts": 1,
|
| 95 |
+
"quantization_config": {
|
| 96 |
+
"config_groups": {
|
| 97 |
+
"group_0": {
|
| 98 |
+
"format": "float-quantized",
|
| 99 |
+
"input_activations": {
|
| 100 |
+
"actorder": null,
|
| 101 |
+
"block_structure": null,
|
| 102 |
+
"dynamic": true,
|
| 103 |
+
"group_size": 128,
|
| 104 |
+
"num_bits": 8,
|
| 105 |
+
"observer": null,
|
| 106 |
+
"observer_kwargs": {},
|
| 107 |
+
"scale_dtype": null,
|
| 108 |
+
"strategy": "group",
|
| 109 |
+
"symmetric": true,
|
| 110 |
+
"type": "float",
|
| 111 |
+
"zp_dtype": null
|
| 112 |
+
},
|
| 113 |
+
"output_activations": null,
|
| 114 |
+
"targets": [
|
| 115 |
+
"Linear"
|
| 116 |
+
],
|
| 117 |
+
"weights": {
|
| 118 |
+
"actorder": null,
|
| 119 |
+
"block_structure": [
|
| 120 |
+
128,
|
| 121 |
+
128
|
| 122 |
+
],
|
| 123 |
+
"dynamic": false,
|
| 124 |
+
"group_size": null,
|
| 125 |
+
"num_bits": 8,
|
| 126 |
+
"observer": "memoryless_minmax",
|
| 127 |
+
"observer_kwargs": {},
|
| 128 |
+
"scale_dtype": null,
|
| 129 |
+
"strategy": "block",
|
| 130 |
+
"symmetric": true,
|
| 131 |
+
"type": "float",
|
| 132 |
+
"zp_dtype": null
|
| 133 |
+
}
|
| 134 |
+
}
|
| 135 |
+
},
|
| 136 |
+
"format": "float-quantized",
|
| 137 |
+
"global_compression_ratio": null,
|
| 138 |
+
"ignore": [
|
| 139 |
+
"model.layers.0.self_attn.q_proj",
|
| 140 |
+
"model.layers.0.self_attn.k_proj",
|
| 141 |
+
"model.layers.0.self_attn.v_proj",
|
| 142 |
+
"model.layers.0.self_attn.o_proj",
|
| 143 |
+
"model.layers.0.self_attn.gate_proj",
|
| 144 |
+
"model.layers.1.self_attn.q_proj",
|
| 145 |
+
"model.layers.1.self_attn.k_proj",
|
| 146 |
+
"model.layers.1.self_attn.v_proj",
|
| 147 |
+
"model.layers.1.self_attn.o_proj",
|
| 148 |
+
"model.layers.1.self_attn.gate_proj",
|
| 149 |
+
"model.layers.2.self_attn.q_proj",
|
| 150 |
+
"model.layers.2.self_attn.k_proj",
|
| 151 |
+
"model.layers.2.self_attn.v_proj",
|
| 152 |
+
"model.layers.2.self_attn.o_proj",
|
| 153 |
+
"model.layers.2.self_attn.gate_proj",
|
| 154 |
+
"model.layers.3.self_attn.q_proj",
|
| 155 |
+
"model.layers.3.self_attn.k_proj",
|
| 156 |
+
"model.layers.3.self_attn.v_proj",
|
| 157 |
+
"model.layers.3.self_attn.o_proj",
|
| 158 |
+
"model.layers.3.self_attn.gate_proj",
|
| 159 |
+
"model.layers.4.self_attn.q_proj",
|
| 160 |
+
"model.layers.4.self_attn.k_proj",
|
| 161 |
+
"model.layers.4.self_attn.v_proj",
|
| 162 |
+
"model.layers.4.self_attn.o_proj",
|
| 163 |
+
"model.layers.4.self_attn.gate_proj",
|
| 164 |
+
"model.layers.5.self_attn.q_proj",
|
| 165 |
+
"model.layers.5.self_attn.k_proj",
|
| 166 |
+
"model.layers.5.self_attn.v_proj",
|
| 167 |
+
"model.layers.5.self_attn.o_proj",
|
| 168 |
+
"model.layers.5.self_attn.gate_proj",
|
| 169 |
+
"model.layers.6.self_attn.q_proj",
|
| 170 |
+
"model.layers.6.self_attn.k_proj",
|
| 171 |
+
"model.layers.6.self_attn.v_proj",
|
| 172 |
+
"model.layers.6.self_attn.o_proj",
|
| 173 |
+
"model.layers.6.self_attn.gate_proj",
|
| 174 |
+
"model.layers.6.mlp.router.gate",
|
| 175 |
+
"model.layers.7.self_attn.q_proj",
|
| 176 |
+
"model.layers.7.self_attn.k_proj",
|
| 177 |
+
"model.layers.7.self_attn.v_proj",
|
| 178 |
+
"model.layers.7.self_attn.o_proj",
|
| 179 |
+
"model.layers.7.self_attn.gate_proj",
|
| 180 |
+
"model.layers.7.mlp.router.gate",
|
| 181 |
+
"model.layers.8.self_attn.q_proj",
|
| 182 |
+
"model.layers.8.self_attn.k_proj",
|
| 183 |
+
"model.layers.8.self_attn.v_proj",
|
| 184 |
+
"model.layers.8.self_attn.o_proj",
|
| 185 |
+
"model.layers.8.self_attn.gate_proj",
|
| 186 |
+
"model.layers.8.mlp.router.gate",
|
| 187 |
+
"model.layers.9.self_attn.q_proj",
|
| 188 |
+
"model.layers.9.self_attn.k_proj",
|
| 189 |
+
"model.layers.9.self_attn.v_proj",
|
| 190 |
+
"model.layers.9.self_attn.o_proj",
|
| 191 |
+
"model.layers.9.self_attn.gate_proj",
|
| 192 |
+
"model.layers.9.mlp.router.gate",
|
| 193 |
+
"model.layers.10.self_attn.q_proj",
|
| 194 |
+
"model.layers.10.self_attn.k_proj",
|
| 195 |
+
"model.layers.10.self_attn.v_proj",
|
| 196 |
+
"model.layers.10.self_attn.o_proj",
|
| 197 |
+
"model.layers.10.self_attn.gate_proj",
|
| 198 |
+
"model.layers.10.mlp.router.gate",
|
| 199 |
+
"model.layers.11.self_attn.q_proj",
|
| 200 |
+
"model.layers.11.self_attn.k_proj",
|
| 201 |
+
"model.layers.11.self_attn.v_proj",
|
| 202 |
+
"model.layers.11.self_attn.o_proj",
|
| 203 |
+
"model.layers.11.self_attn.gate_proj",
|
| 204 |
+
"model.layers.11.mlp.router.gate",
|
| 205 |
+
"model.layers.12.self_attn.q_proj",
|
| 206 |
+
"model.layers.12.self_attn.k_proj",
|
| 207 |
+
"model.layers.12.self_attn.v_proj",
|
| 208 |
+
"model.layers.12.self_attn.o_proj",
|
| 209 |
+
"model.layers.12.self_attn.gate_proj",
|
| 210 |
+
"model.layers.12.mlp.router.gate",
|
| 211 |
+
"model.layers.13.self_attn.q_proj",
|
| 212 |
+
"model.layers.13.self_attn.k_proj",
|
| 213 |
+
"model.layers.13.self_attn.v_proj",
|
| 214 |
+
"model.layers.13.self_attn.o_proj",
|
| 215 |
+
"model.layers.13.self_attn.gate_proj",
|
| 216 |
+
"model.layers.13.mlp.router.gate",
|
| 217 |
+
"model.layers.14.self_attn.q_proj",
|
| 218 |
+
"model.layers.14.self_attn.k_proj",
|
| 219 |
+
"model.layers.14.self_attn.v_proj",
|
| 220 |
+
"model.layers.14.self_attn.o_proj",
|
| 221 |
+
"model.layers.14.self_attn.gate_proj",
|
| 222 |
+
"model.layers.14.mlp.router.gate",
|
| 223 |
+
"model.layers.15.self_attn.q_proj",
|
| 224 |
+
"model.layers.15.self_attn.k_proj",
|
| 225 |
+
"model.layers.15.self_attn.v_proj",
|
| 226 |
+
"model.layers.15.self_attn.o_proj",
|
| 227 |
+
"model.layers.15.self_attn.gate_proj",
|
| 228 |
+
"model.layers.15.mlp.router.gate",
|
| 229 |
+
"model.layers.16.self_attn.q_proj",
|
| 230 |
+
"model.layers.16.self_attn.k_proj",
|
| 231 |
+
"model.layers.16.self_attn.v_proj",
|
| 232 |
+
"model.layers.16.self_attn.o_proj",
|
| 233 |
+
"model.layers.16.self_attn.gate_proj",
|
| 234 |
+
"model.layers.16.mlp.router.gate",
|
| 235 |
+
"model.layers.17.self_attn.q_proj",
|
| 236 |
+
"model.layers.17.self_attn.k_proj",
|
| 237 |
+
"model.layers.17.self_attn.v_proj",
|
| 238 |
+
"model.layers.17.self_attn.o_proj",
|
| 239 |
+
"model.layers.17.self_attn.gate_proj",
|
| 240 |
+
"model.layers.17.mlp.router.gate",
|
| 241 |
+
"model.layers.18.self_attn.q_proj",
|
| 242 |
+
"model.layers.18.self_attn.k_proj",
|
| 243 |
+
"model.layers.18.self_attn.v_proj",
|
| 244 |
+
"model.layers.18.self_attn.o_proj",
|
| 245 |
+
"model.layers.18.self_attn.gate_proj",
|
| 246 |
+
"model.layers.18.mlp.router.gate",
|
| 247 |
+
"model.layers.19.self_attn.q_proj",
|
| 248 |
+
"model.layers.19.self_attn.k_proj",
|
| 249 |
+
"model.layers.19.self_attn.v_proj",
|
| 250 |
+
"model.layers.19.self_attn.o_proj",
|
| 251 |
+
"model.layers.19.self_attn.gate_proj",
|
| 252 |
+
"model.layers.19.mlp.router.gate",
|
| 253 |
+
"model.layers.20.self_attn.q_proj",
|
| 254 |
+
"model.layers.20.self_attn.k_proj",
|
| 255 |
+
"model.layers.20.self_attn.v_proj",
|
| 256 |
+
"model.layers.20.self_attn.o_proj",
|
| 257 |
+
"model.layers.20.self_attn.gate_proj",
|
| 258 |
+
"model.layers.20.mlp.router.gate",
|
| 259 |
+
"model.layers.21.self_attn.q_proj",
|
| 260 |
+
"model.layers.21.self_attn.k_proj",
|
| 261 |
+
"model.layers.21.self_attn.v_proj",
|
| 262 |
+
"model.layers.21.self_attn.o_proj",
|
| 263 |
+
"model.layers.21.self_attn.gate_proj",
|
| 264 |
+
"model.layers.21.mlp.router.gate",
|
| 265 |
+
"model.layers.22.self_attn.q_proj",
|
| 266 |
+
"model.layers.22.self_attn.k_proj",
|
| 267 |
+
"model.layers.22.self_attn.v_proj",
|
| 268 |
+
"model.layers.22.self_attn.o_proj",
|
| 269 |
+
"model.layers.22.self_attn.gate_proj",
|
| 270 |
+
"model.layers.22.mlp.router.gate",
|
| 271 |
+
"model.layers.23.self_attn.q_proj",
|
| 272 |
+
"model.layers.23.self_attn.k_proj",
|
| 273 |
+
"model.layers.23.self_attn.v_proj",
|
| 274 |
+
"model.layers.23.self_attn.o_proj",
|
| 275 |
+
"model.layers.23.self_attn.gate_proj",
|
| 276 |
+
"model.layers.23.mlp.router.gate",
|
| 277 |
+
"model.layers.24.self_attn.q_proj",
|
| 278 |
+
"model.layers.24.self_attn.k_proj",
|
| 279 |
+
"model.layers.24.self_attn.v_proj",
|
| 280 |
+
"model.layers.24.self_attn.o_proj",
|
| 281 |
+
"model.layers.24.self_attn.gate_proj",
|
| 282 |
+
"model.layers.24.mlp.router.gate",
|
| 283 |
+
"model.layers.25.self_attn.q_proj",
|
| 284 |
+
"model.layers.25.self_attn.k_proj",
|
| 285 |
+
"model.layers.25.self_attn.v_proj",
|
| 286 |
+
"model.layers.25.self_attn.o_proj",
|
| 287 |
+
"model.layers.25.self_attn.gate_proj",
|
| 288 |
+
"model.layers.25.mlp.router.gate",
|
| 289 |
+
"model.layers.26.self_attn.q_proj",
|
| 290 |
+
"model.layers.26.self_attn.k_proj",
|
| 291 |
+
"model.layers.26.self_attn.v_proj",
|
| 292 |
+
"model.layers.26.self_attn.o_proj",
|
| 293 |
+
"model.layers.26.self_attn.gate_proj",
|
| 294 |
+
"model.layers.26.mlp.router.gate",
|
| 295 |
+
"model.layers.27.self_attn.q_proj",
|
| 296 |
+
"model.layers.27.self_attn.k_proj",
|
| 297 |
+
"model.layers.27.self_attn.v_proj",
|
| 298 |
+
"model.layers.27.self_attn.o_proj",
|
| 299 |
+
"model.layers.27.self_attn.gate_proj",
|
| 300 |
+
"model.layers.27.mlp.router.gate",
|
| 301 |
+
"model.layers.28.self_attn.q_proj",
|
| 302 |
+
"model.layers.28.self_attn.k_proj",
|
| 303 |
+
"model.layers.28.self_attn.v_proj",
|
| 304 |
+
"model.layers.28.self_attn.o_proj",
|
| 305 |
+
"model.layers.28.self_attn.gate_proj",
|
| 306 |
+
"model.layers.28.mlp.router.gate",
|
| 307 |
+
"model.layers.29.self_attn.q_proj",
|
| 308 |
+
"model.layers.29.self_attn.k_proj",
|
| 309 |
+
"model.layers.29.self_attn.v_proj",
|
| 310 |
+
"model.layers.29.self_attn.o_proj",
|
| 311 |
+
"model.layers.29.self_attn.gate_proj",
|
| 312 |
+
"model.layers.29.mlp.router.gate",
|
| 313 |
+
"model.layers.30.self_attn.q_proj",
|
| 314 |
+
"model.layers.30.self_attn.k_proj",
|
| 315 |
+
"model.layers.30.self_attn.v_proj",
|
| 316 |
+
"model.layers.30.self_attn.o_proj",
|
| 317 |
+
"model.layers.30.self_attn.gate_proj",
|
| 318 |
+
"model.layers.30.mlp.router.gate",
|
| 319 |
+
"model.layers.31.self_attn.q_proj",
|
| 320 |
+
"model.layers.31.self_attn.k_proj",
|
| 321 |
+
"model.layers.31.self_attn.v_proj",
|
| 322 |
+
"model.layers.31.self_attn.o_proj",
|
| 323 |
+
"model.layers.31.self_attn.gate_proj",
|
| 324 |
+
"model.layers.31.mlp.router.gate",
|
| 325 |
+
"model.layers.32.self_attn.q_proj",
|
| 326 |
+
"model.layers.32.self_attn.k_proj",
|
| 327 |
+
"model.layers.32.self_attn.v_proj",
|
| 328 |
+
"model.layers.32.self_attn.o_proj",
|
| 329 |
+
"model.layers.32.self_attn.gate_proj",
|
| 330 |
+
"model.layers.32.mlp.router.gate",
|
| 331 |
+
"model.layers.33.self_attn.q_proj",
|
| 332 |
+
"model.layers.33.self_attn.k_proj",
|
| 333 |
+
"model.layers.33.self_attn.v_proj",
|
| 334 |
+
"model.layers.33.self_attn.o_proj",
|
| 335 |
+
"model.layers.33.self_attn.gate_proj",
|
| 336 |
+
"model.layers.33.mlp.router.gate",
|
| 337 |
+
"model.layers.34.self_attn.q_proj",
|
| 338 |
+
"model.layers.34.self_attn.k_proj",
|
| 339 |
+
"model.layers.34.self_attn.v_proj",
|
| 340 |
+
"model.layers.34.self_attn.o_proj",
|
| 341 |
+
"model.layers.34.self_attn.gate_proj",
|
| 342 |
+
"model.layers.34.mlp.router.gate",
|
| 343 |
+
"model.layers.35.self_attn.q_proj",
|
| 344 |
+
"model.layers.35.self_attn.k_proj",
|
| 345 |
+
"model.layers.35.self_attn.v_proj",
|
| 346 |
+
"model.layers.35.self_attn.o_proj",
|
| 347 |
+
"model.layers.35.self_attn.gate_proj",
|
| 348 |
+
"model.layers.35.mlp.router.gate",
|
| 349 |
+
"model.layers.36.self_attn.q_proj",
|
| 350 |
+
"model.layers.36.self_attn.k_proj",
|
| 351 |
+
"model.layers.36.self_attn.v_proj",
|
| 352 |
+
"model.layers.36.self_attn.o_proj",
|
| 353 |
+
"model.layers.36.self_attn.gate_proj",
|
| 354 |
+
"model.layers.36.mlp.router.gate",
|
| 355 |
+
"model.layers.37.self_attn.q_proj",
|
| 356 |
+
"model.layers.37.self_attn.k_proj",
|
| 357 |
+
"model.layers.37.self_attn.v_proj",
|
| 358 |
+
"model.layers.37.self_attn.o_proj",
|
| 359 |
+
"model.layers.37.self_attn.gate_proj",
|
| 360 |
+
"model.layers.37.mlp.router.gate",
|
| 361 |
+
"model.layers.38.self_attn.q_proj",
|
| 362 |
+
"model.layers.38.self_attn.k_proj",
|
| 363 |
+
"model.layers.38.self_attn.v_proj",
|
| 364 |
+
"model.layers.38.self_attn.o_proj",
|
| 365 |
+
"model.layers.38.self_attn.gate_proj",
|
| 366 |
+
"model.layers.38.mlp.router.gate",
|
| 367 |
+
"model.layers.39.self_attn.q_proj",
|
| 368 |
+
"model.layers.39.self_attn.k_proj",
|
| 369 |
+
"model.layers.39.self_attn.v_proj",
|
| 370 |
+
"model.layers.39.self_attn.o_proj",
|
| 371 |
+
"model.layers.39.self_attn.gate_proj",
|
| 372 |
+
"model.layers.39.mlp.router.gate",
|
| 373 |
+
"model.layers.40.self_attn.q_proj",
|
| 374 |
+
"model.layers.40.self_attn.k_proj",
|
| 375 |
+
"model.layers.40.self_attn.v_proj",
|
| 376 |
+
"model.layers.40.self_attn.o_proj",
|
| 377 |
+
"model.layers.40.self_attn.gate_proj",
|
| 378 |
+
"model.layers.40.mlp.router.gate",
|
| 379 |
+
"model.layers.41.self_attn.q_proj",
|
| 380 |
+
"model.layers.41.self_attn.k_proj",
|
| 381 |
+
"model.layers.41.self_attn.v_proj",
|
| 382 |
+
"model.layers.41.self_attn.o_proj",
|
| 383 |
+
"model.layers.41.self_attn.gate_proj",
|
| 384 |
+
"model.layers.41.mlp.router.gate",
|
| 385 |
+
"model.layers.42.self_attn.q_proj",
|
| 386 |
+
"model.layers.42.self_attn.k_proj",
|
| 387 |
+
"model.layers.42.self_attn.v_proj",
|
| 388 |
+
"model.layers.42.self_attn.o_proj",
|
| 389 |
+
"model.layers.42.self_attn.gate_proj",
|
| 390 |
+
"model.layers.42.mlp.router.gate",
|
| 391 |
+
"model.layers.43.self_attn.q_proj",
|
| 392 |
+
"model.layers.43.self_attn.k_proj",
|
| 393 |
+
"model.layers.43.self_attn.v_proj",
|
| 394 |
+
"model.layers.43.self_attn.o_proj",
|
| 395 |
+
"model.layers.43.self_attn.gate_proj",
|
| 396 |
+
"model.layers.43.mlp.router.gate",
|
| 397 |
+
"model.layers.44.self_attn.q_proj",
|
| 398 |
+
"model.layers.44.self_attn.k_proj",
|
| 399 |
+
"model.layers.44.self_attn.v_proj",
|
| 400 |
+
"model.layers.44.self_attn.o_proj",
|
| 401 |
+
"model.layers.44.self_attn.gate_proj",
|
| 402 |
+
"model.layers.44.mlp.router.gate",
|
| 403 |
+
"model.layers.45.self_attn.q_proj",
|
| 404 |
+
"model.layers.45.self_attn.k_proj",
|
| 405 |
+
"model.layers.45.self_attn.v_proj",
|
| 406 |
+
"model.layers.45.self_attn.o_proj",
|
| 407 |
+
"model.layers.45.self_attn.gate_proj",
|
| 408 |
+
"model.layers.45.mlp.router.gate",
|
| 409 |
+
"model.layers.46.self_attn.q_proj",
|
| 410 |
+
"model.layers.46.self_attn.k_proj",
|
| 411 |
+
"model.layers.46.self_attn.v_proj",
|
| 412 |
+
"model.layers.46.self_attn.o_proj",
|
| 413 |
+
"model.layers.46.self_attn.gate_proj",
|
| 414 |
+
"model.layers.46.mlp.router.gate",
|
| 415 |
+
"model.layers.47.self_attn.q_proj",
|
| 416 |
+
"model.layers.47.self_attn.k_proj",
|
| 417 |
+
"model.layers.47.self_attn.v_proj",
|
| 418 |
+
"model.layers.47.self_attn.o_proj",
|
| 419 |
+
"model.layers.47.self_attn.gate_proj",
|
| 420 |
+
"model.layers.47.mlp.router.gate",
|
| 421 |
+
"model.layers.48.self_attn.q_proj",
|
| 422 |
+
"model.layers.48.self_attn.k_proj",
|
| 423 |
+
"model.layers.48.self_attn.v_proj",
|
| 424 |
+
"model.layers.48.self_attn.o_proj",
|
| 425 |
+
"model.layers.48.self_attn.gate_proj",
|
| 426 |
+
"model.layers.48.mlp.router.gate",
|
| 427 |
+
"model.layers.49.self_attn.q_proj",
|
| 428 |
+
"model.layers.49.self_attn.k_proj",
|
| 429 |
+
"model.layers.49.self_attn.v_proj",
|
| 430 |
+
"model.layers.49.self_attn.o_proj",
|
| 431 |
+
"model.layers.49.self_attn.gate_proj",
|
| 432 |
+
"model.layers.49.mlp.router.gate",
|
| 433 |
+
"model.layers.50.self_attn.q_proj",
|
| 434 |
+
"model.layers.50.self_attn.k_proj",
|
| 435 |
+
"model.layers.50.self_attn.v_proj",
|
| 436 |
+
"model.layers.50.self_attn.o_proj",
|
| 437 |
+
"model.layers.50.self_attn.gate_proj",
|
| 438 |
+
"model.layers.50.mlp.router.gate",
|
| 439 |
+
"model.layers.51.self_attn.q_proj",
|
| 440 |
+
"model.layers.51.self_attn.k_proj",
|
| 441 |
+
"model.layers.51.self_attn.v_proj",
|
| 442 |
+
"model.layers.51.self_attn.o_proj",
|
| 443 |
+
"model.layers.51.self_attn.gate_proj",
|
| 444 |
+
"model.layers.51.mlp.router.gate",
|
| 445 |
+
"model.layers.52.self_attn.q_proj",
|
| 446 |
+
"model.layers.52.self_attn.k_proj",
|
| 447 |
+
"model.layers.52.self_attn.v_proj",
|
| 448 |
+
"model.layers.52.self_attn.o_proj",
|
| 449 |
+
"model.layers.52.self_attn.gate_proj",
|
| 450 |
+
"model.layers.52.mlp.router.gate",
|
| 451 |
+
"model.layers.53.self_attn.q_proj",
|
| 452 |
+
"model.layers.53.self_attn.k_proj",
|
| 453 |
+
"model.layers.53.self_attn.v_proj",
|
| 454 |
+
"model.layers.53.self_attn.o_proj",
|
| 455 |
+
"model.layers.53.self_attn.gate_proj",
|
| 456 |
+
"model.layers.53.mlp.router.gate",
|
| 457 |
+
"model.layers.54.self_attn.q_proj",
|
| 458 |
+
"model.layers.54.self_attn.k_proj",
|
| 459 |
+
"model.layers.54.self_attn.v_proj",
|
| 460 |
+
"model.layers.54.self_attn.o_proj",
|
| 461 |
+
"model.layers.54.self_attn.gate_proj",
|
| 462 |
+
"model.layers.54.mlp.router.gate",
|
| 463 |
+
"model.layers.55.self_attn.q_proj",
|
| 464 |
+
"model.layers.55.self_attn.k_proj",
|
| 465 |
+
"model.layers.55.self_attn.v_proj",
|
| 466 |
+
"model.layers.55.self_attn.o_proj",
|
| 467 |
+
"model.layers.55.self_attn.gate_proj",
|
| 468 |
+
"model.layers.55.mlp.router.gate",
|
| 469 |
+
"model.layers.56.self_attn.q_proj",
|
| 470 |
+
"model.layers.56.self_attn.k_proj",
|
| 471 |
+
"model.layers.56.self_attn.v_proj",
|
| 472 |
+
"model.layers.56.self_attn.o_proj",
|
| 473 |
+
"model.layers.56.self_attn.gate_proj",
|
| 474 |
+
"model.layers.56.mlp.router.gate",
|
| 475 |
+
"model.layers.57.self_attn.q_proj",
|
| 476 |
+
"model.layers.57.self_attn.k_proj",
|
| 477 |
+
"model.layers.57.self_attn.v_proj",
|
| 478 |
+
"model.layers.57.self_attn.o_proj",
|
| 479 |
+
"model.layers.57.self_attn.gate_proj",
|
| 480 |
+
"model.layers.57.mlp.router.gate",
|
| 481 |
+
"model.layers.58.self_attn.q_proj",
|
| 482 |
+
"model.layers.58.self_attn.k_proj",
|
| 483 |
+
"model.layers.58.self_attn.v_proj",
|
| 484 |
+
"model.layers.58.self_attn.o_proj",
|
| 485 |
+
"model.layers.58.self_attn.gate_proj",
|
| 486 |
+
"model.layers.58.mlp.router.gate",
|
| 487 |
+
"model.layers.59.self_attn.q_proj",
|
| 488 |
+
"model.layers.59.self_attn.k_proj",
|
| 489 |
+
"model.layers.59.self_attn.v_proj",
|
| 490 |
+
"model.layers.59.self_attn.o_proj",
|
| 491 |
+
"model.layers.59.self_attn.gate_proj",
|
| 492 |
+
"model.layers.59.mlp.router.gate",
|
| 493 |
+
"lm_head"
|
| 494 |
+
],
|
| 495 |
+
"kv_cache_scheme": null,
|
| 496 |
+
"quant_method": "compressed-tensors",
|
| 497 |
+
"quantization_status": "compressed",
|
| 498 |
+
"sparsity_config": {},
|
| 499 |
+
"transform_config": {},
|
| 500 |
+
"version": "0.14.0"
|
| 501 |
+
},
|
| 502 |
+
"rms_norm_eps": 1e-05,
|
| 503 |
+
"rope_scaling": null,
|
| 504 |
+
"rope_theta": 10000,
|
| 505 |
+
"route_norm": true,
|
| 506 |
+
"route_scale": 2.448,
|
| 507 |
+
"score_func": "sigmoid",
|
| 508 |
+
"sliding_window": 4096,
|
| 509 |
+
"tie_word_embeddings": false,
|
| 510 |
+
"topk_group": 1,
|
| 511 |
+
"transformers_version": "4.57.6",
|
| 512 |
+
"use_cache": true,
|
| 513 |
+
"use_grouped_mm": true,
|
| 514 |
+
"vocab_size": 200192
|
| 515 |
+
}
|
configuration_afmoe.py
ADDED
|
@@ -0,0 +1,133 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# coding=utf-8
|
| 2 |
+
# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 5 |
+
# you may not use this file except in compliance with the License.
|
| 6 |
+
# You may obtain a copy of the License at
|
| 7 |
+
#
|
| 8 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 9 |
+
#
|
| 10 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 11 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 12 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 13 |
+
# See the License for the specific language governing permissions and
|
| 14 |
+
# limitations under the License.
|
| 15 |
+
from transformers.configuration_utils import PretrainedConfig
|
| 16 |
+
from transformers.modeling_rope_utils import rope_config_validation
|
| 17 |
+
from transformers.configuration_utils import layer_type_validation
|
| 18 |
+
from transformers.utils import logging
|
| 19 |
+
|
| 20 |
+
logger = logging.get_logger(__name__)
|
| 21 |
+
|
| 22 |
+
class AfmoeConfig(PretrainedConfig):
    """
    Configuration class for the AFMoE (mixture-of-experts) model.

    Stores the hyperparameters used to instantiate an AFMoE model: vocabulary
    and hidden sizes, dense vs. MoE layer layout, router behavior, and the
    sliding/global attention schedule.

    Args:
        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of decoder layers.
        vocab_size (`int`, *optional*, defaults to 200192):
            Size of the token vocabulary.
        hidden_size (`int`, *optional*, defaults to 2048):
            Dimensionality of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 6144):
            Dimensionality of the dense MLP intermediate layer.
        moe_intermediate_size (`int`, *optional*, defaults to 1408):
            Dimensionality of each expert's intermediate layer.
        num_dense_layers (`int`, *optional*, defaults to 1):
            Number of leading layers that use a dense MLP instead of MoE.
        num_attention_heads (`int`, *optional*, defaults to 16):
            Number of attention heads.
        num_key_value_heads (`int`, *optional*):
            Number of key/value heads for grouped-query attention. Defaults
            to `num_attention_heads` (i.e. standard multi-head attention).
        head_dim (`int`, *optional*, defaults to 128):
            Dimension of each attention head.
        hidden_act (`str`, *optional*, defaults to `"silu"`):
            Activation function used in the MLPs.
        max_position_embeddings (`int`, *optional*, defaults to 16384):
            Maximum sequence length the model may be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            Standard deviation for weight initialization.
        rms_norm_eps (`float`, *optional*, defaults to 1e-5):
            Epsilon used by the RMS normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether the model should return key/value caches.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether input and output embeddings are tied.
        rope_theta (`float`, *optional*, defaults to 10000.0):
            Base period of the rotary position embeddings.
        rope_scaling (`dict`, *optional*):
            RoPE scaling configuration; validated by
            `rope_config_validation`. A legacy `"type"` key is mirrored to
            `"rope_type"` for backward compatibility.
        num_experts (`int`, *optional*, defaults to 64):
            Total number of routed experts per MoE layer.
        num_experts_per_tok (`int`, *optional*, defaults to 6):
            Number of experts activated per token.
        num_shared_experts (`int`, *optional*, defaults to 2):
            Number of always-active shared experts.
        num_expert_groups (`int`, *optional*, defaults to 1):
            Number of expert groups used by the router.
        num_limited_groups (`int`, *optional*, defaults to 1):
            Maximum number of groups a token may route into.
        score_func (`str`, *optional*, defaults to `"sigmoid"`):
            Scoring function used by the router gate.
        route_norm (`bool`, *optional*, defaults to `True`):
            Whether router weights are normalized.
        route_scale (`float`, *optional*, defaults to 1.0):
            Scaling factor applied to router weights.
        global_attn_every_n_layers (`int`, *optional*, defaults to 4):
            Every n-th layer uses full (global) attention; the others use
            sliding-window attention.
        sliding_window (`int`, *optional*, defaults to 1024):
            Window size for sliding-window attention layers.
        mup_enabled (`bool`, *optional*, defaults to `False`):
            Whether muP-style parameterization is enabled.
        layer_types (`list[str]`, *optional*):
            Explicit per-layer attention types (`"sliding_attention"` or
            `"full_attention"`). Derived from
            `global_attn_every_n_layers` when not given.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            Dropout probability applied to attention weights.
        n_group (`int`, *optional*, defaults to 1):
            Number of groups for routed experts.
        topk_group (`int`, *optional*, defaults to 1):
            Number of selected groups for each token (for each token, ensuring
            the selected experts is only within `topk_group` groups).
    """

    model_type = "afmoe"
    base_model_pp_plan = {
        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
        "norm": (["hidden_states"], ["hidden_states"]),
    }

    def __init__(
        self,
        num_hidden_layers: int = 32,
        vocab_size: int = 200192,
        hidden_size: int = 2048,
        intermediate_size: int = 6144,
        moe_intermediate_size=1408,
        num_dense_layers=1,
        num_attention_heads=16,
        num_key_value_heads=None,
        head_dim=128,
        hidden_act="silu",
        max_position_embeddings=16384,
        initializer_range=0.02,
        rms_norm_eps=1e-5,
        use_cache=True,
        tie_word_embeddings=False,
        rope_theta=10000.0,
        rope_scaling=None,
        num_experts=64,
        num_experts_per_tok=6,
        num_shared_experts=2,
        num_expert_groups=1,
        num_limited_groups=1,
        score_func="sigmoid",
        route_norm=True,
        route_scale=1.0,
        global_attn_every_n_layers=4,
        sliding_window=1024,
        mup_enabled=False,
        layer_types=None,
        attention_dropout: float = 0.0,
        n_group: int = 1,
        topk_group: int = 1,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_dense_layers = num_dense_layers
        self.num_attention_heads = num_attention_heads
        self.head_dim = head_dim
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling

        # MoE specific
        self.moe_intermediate_size = moe_intermediate_size
        self.num_experts_per_tok = num_experts_per_tok
        self.n_group = n_group
        self.topk_group = topk_group
        self.num_experts = num_experts
        self.num_shared_experts = num_shared_experts
        self.num_expert_groups = num_expert_groups
        self.num_limited_groups = num_limited_groups
        self.score_func = score_func
        self.route_norm = route_norm
        self.route_scale = route_scale

        # Attention specific
        self.attention_dropout = attention_dropout
        self.global_attn_every_n_layers = global_attn_every_n_layers
        self.sliding_window = sliding_window
        self.layer_types = layer_types
        if self.layer_types is None:
            # Layer i uses full attention when (i + 1) is a multiple of
            # global_attn_every_n_layers; all other layers use sliding-window
            # attention.
            self.layer_types = [
                "sliding_attention" if bool((i + 1) % global_attn_every_n_layers) else "full_attention"
                for i in range(self.num_hidden_layers)
            ]
        layer_type_validation(self.layer_types)

        # muP specific
        self.mup_enabled = mup_enabled

        # GQA default: fall back to full multi-head attention when the
        # number of KV heads is not given.
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads

        # Validate rope configs. The legacy "type" key is copied to
        # "rope_type" for backward compatibility before validation.
        if self.rope_scaling is not None and "type" in self.rope_scaling:
            self.rope_scaling["rope_type"] = self.rope_scaling["type"]
        rope_config_validation(self)

        super().__init__(
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )
|
| 131 |
+
|
| 132 |
+
|
| 133 |
+
__all__ = ["AfmoeConfig"]
|
generation_config.json
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_from_model_config": true,
|
| 3 |
+
"bos_token_id": 0,
|
| 4 |
+
"do_sample": true,
|
| 5 |
+
"eos_token_id": 3,
|
| 6 |
+
"pad_token_id": 12,
|
| 7 |
+
"temperature": 0.8,
|
| 8 |
+
"top_p": 0.8,
|
| 9 |
+
"transformers_version": "4.57.6"
|
| 10 |
+
}
|
model-00001-of-00081.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:17d8032fafe6ea1bbc1a3965138b55f922b6408bc488fd2c27c478e46b5df905
|
| 3 |
+
size 4991297952
|
model-00002-of-00081.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:39ec8ac900fe6451abaa9d2780dd9e743974515cadf00e8f1304b39ab8b77dc0
|
| 3 |
+
size 4993010240
|
model-00003-of-00081.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:fdc9bbc9d2455c73731ccc13921db2c71794e242f2aa39417e009d5b3814a36b
|
| 3 |
+
size 4997737720
|
model-00004-of-00081.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:62c0c5b1b44912f21e2b5425f156077a088ab9216b8fa5dc7b4247d04a02a7d6
|
| 3 |
+
size 4997737816
|
model-00005-of-00081.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4383e929baa767a1e9b8cc792d7cef0f1abc1a0084a9ac6a990e93a6250a0c14
|
| 3 |
+
size 4986706032
|
model-00006-of-00081.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:99121544dece1dd95fbe4fa4905545459ee41b40786aba22dbb636f8e5b3f6d7
|
| 3 |
+
size 4994603392
|
model-00007-of-00081.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5a40a85706d10934eb1675d0e0e4bd88ff08cd1e882d3908232cd3bc45799884
|
| 3 |
+
size 4997739824
|
model-00008-of-00081.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a96a8307c1006fea6de61ed1138fdf16a5b08de7169edd857b171b93fe70cbe2
|
| 3 |
+
size 4997739304
|
model-00009-of-00081.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3bd40717a9ec81cce1932357e865d129283dc32ddb9ac1e6b44dd8bb14a41f94
|
| 3 |
+
size 4993010824
|
model-00010-of-00081.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:aaba8ad2a72a27261b35ef0a9e50e5814e2398c964c6d97d0d03dc2d2ac4ba40
|
| 3 |
+
size 4997738792
|
model-00011-of-00081.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8b68dd29426c4fc0ebfc87a531d914d095ae819627bd7ede838399973db45722
|
| 3 |
+
size 4997739272
|
model-00012-of-00081.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8a9feb4614755537a72b6322eed3c97a000ae1b93b087aeda7ab9b5727f11854
|
| 3 |
+
size 4993010896
|
model-00013-of-00081.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9ede719658d9234a7cb0d658db8012147b69a23b4bdf0a63ff1b63d4c3f814d4
|
| 3 |
+
size 4997738760
|
model-00014-of-00081.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:cdb3ad46440df9b3e463d33039025a166a8627536c12eeb7f740a23444dd46df
|
| 3 |
+
size 4997739264
|
model-00015-of-00081.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:75fad5dc16ce440814451503324a4133ac95a8622f9f31780c74bedbaf2d39c0
|
| 3 |
+
size 4993010936
|
model-00016-of-00081.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8c6bf34ab60420a9f66bdce6095e27a76ff416e0eb6af9e715c160a5f42027b0
|
| 3 |
+
size 4997738760
|
model-00017-of-00081.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:162fea404b03d47710535c13fcd1d20ad3d58aa84479b0da6f65b65a763022a0
|
| 3 |
+
size 4997739224
|
model-00018-of-00081.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b02f27369b041f7d0b12a2c0234bdb6e5b4c098308c9f19939f17b38ebfe1db5
|
| 3 |
+
size 4993010976
|
model-00019-of-00081.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:07a35f6ca6504457e972d90dd22902c6a0096767e3bdf37ccd018ab6cb73b563
|
| 3 |
+
size 4997738760
|
model-00020-of-00081.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:cd803f7dd68083b8b3a565d54caa5189cd789456c2944d4c7ce1d5b15bc42c12
|
| 3 |
+
size 4997739184
|
model-00021-of-00081.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6a5178453852c3d2bbab8133727481e7b19bd1af69ed7a475892c250ae0679ea
|
| 3 |
+
size 4993011008
|
model-00022-of-00081.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:cadae97c99fd231eacc79239370348b217d57025d0fe097a9b2db3f805b3abdb
|
| 3 |
+
size 4997738760
|
model-00023-of-00081.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:39f5fbdd7fedfae928386b256af0fc836170a594842426e52694c9078ecc6017
|
| 3 |
+
size 4997739144
|
model-00024-of-00081.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:aaf5184aa4e40926044e5c557f1e2f626094a02f5e8ff2a78c8b910b47520b1f
|
| 3 |
+
size 4993011048
|
model-00025-of-00081.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e581269653588c4164376bdc1333752b03e76aadb75dfaf98d11df6e4be1818d
|
| 3 |
+
size 4997738760
|
model-00026-of-00081.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:fd20a36515acfa8db4377377839ba990a10151d4f5add0678889dce2ea04a423
|
| 3 |
+
size 4997739112
|
model-00027-of-00081.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c80e9cf5599b8c2bcee50bf55063e87fd6f6119c0375d7ec9119ed63e766d231
|
| 3 |
+
size 4993011088
|
model-00028-of-00081.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e0c44491a88db4dba47b7484d856e1f6c747e574c71698ca0ac20e9a54ea0a2d
|
| 3 |
+
size 4997738760
|
model-00029-of-00081.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2bef43f5d4d5dd21adf28358117d771ed045b8823377ae0e980955ca3bc9859e
|
| 3 |
+
size 4997739072
|
model-00030-of-00081.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9cfb0626fa9f84397c4a407f42c11d6b87288e992cd96afde05aa2d7908a938a
|
| 3 |
+
size 4993011120
|
model-00031-of-00081.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:30935b28748c0063919074cefe1ded91bcd3d56b399418831cbde18ce96a11af
|
| 3 |
+
size 4997738760
|
model-00032-of-00081.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d465e3f331560851ace35d861977e8feb7119261f2a920d172dc6006d6ba7e27
|
| 3 |
+
size 4997739032
|
model-00033-of-00081.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7673aad9fdceb1adc0a78c4cf237e51bb743287fa9e863dbe7a7e4051510f88f
|
| 3 |
+
size 4993011160
|
model-00034-of-00081.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:47824d2fc6f13f2f031da0b62154fb2db120f77317a02059172b1069c3109844
|
| 3 |
+
size 4997738768
|
model-00035-of-00081.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7cadabe2c5219bbeb220fb0f792a5e8883c747087935179effb19d9d1f3375c3
|
| 3 |
+
size 4997738992
|
model-00036-of-00081.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:013dcb25788a5cea80af5fbf571baae030f54ca0b12d4fa20af45d7f96d9b18c
|
| 3 |
+
size 4993011200
|
model-00037-of-00081.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d50335896c003d70ed9e5b1d02a6631b49921ac2ef9976a31824b8867eaf2cfa
|
| 3 |
+
size 4997738768
|
model-00038-of-00081.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8d6acdce3dc22c7b71928378e0c0777e378282f0dc012d215127ee433de13740
|
| 3 |
+
size 4997738960
|
model-00039-of-00081.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b6da2e35531d607dd42f45ad79b3b85ada4e9b61f04a48c613e59ff8324ce9a1
|
| 3 |
+
size 4993011232
|
model-00040-of-00081.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:acf7f08c463a6f6a52105671316fcece7b0fdde757ec4b3ea94bee9e03d5161f
|
| 3 |
+
size 4997738768
|
model-00041-of-00081.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e7e7ce29a81ce5fd7fe6e802a9358c9ab9f5eb158bf1467d5446be39c69ac83b
|
| 3 |
+
size 4997738920
|
model-00042-of-00081.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a55b030a9c3f082fb853e8917ce33aa41de2f1b1521089eb768851693aa4acb2
|
| 3 |
+
size 4993011280
|
model-00043-of-00081.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b46484b4cd664c8da178e790b2cfbfbb5b82a5d42e38add0007c205d7a9dc3d8
|
| 3 |
+
size 4997738768
|