TeleAI-AI-Flow committed on
Commit 161e8e7 · verified · 1 Parent(s): 75ee892

Upload folder using huggingface_hub

.gitattributes CHANGED
@@ -1,35 +1,54 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bin.* filter=lfs diff=lfs merge=lfs -text
5
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.model filter=lfs diff=lfs merge=lfs -text
12
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
13
+ *.onnx filter=lfs diff=lfs merge=lfs -text
14
+ *.ot filter=lfs diff=lfs merge=lfs -text
15
+ *.parquet filter=lfs diff=lfs merge=lfs -text
16
+ *.pb filter=lfs diff=lfs merge=lfs -text
17
+ *.pt filter=lfs diff=lfs merge=lfs -text
18
+ *.pth filter=lfs diff=lfs merge=lfs -text
19
+ *.rar filter=lfs diff=lfs merge=lfs -text
20
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
21
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
22
+ *.tflite filter=lfs diff=lfs merge=lfs -text
23
+ *.tgz filter=lfs diff=lfs merge=lfs -text
24
+ *.xz filter=lfs diff=lfs merge=lfs -text
25
+ *.zip filter=lfs diff=lfs merge=lfs -text
26
+ *.zstandard filter=lfs diff=lfs merge=lfs -text
27
+ *.tfevents* filter=lfs diff=lfs merge=lfs -text
28
+ *.db* filter=lfs diff=lfs merge=lfs -text
29
+ *.ark* filter=lfs diff=lfs merge=lfs -text
30
+ **/*ckpt*data* filter=lfs diff=lfs merge=lfs -text
31
+ **/*ckpt*.meta filter=lfs diff=lfs merge=lfs -text
32
+ **/*ckpt*.index filter=lfs diff=lfs merge=lfs -text
33
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
34
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
35
+ *.gguf* filter=lfs diff=lfs merge=lfs -text
36
+ *.ggml filter=lfs diff=lfs merge=lfs -text
37
+ *.llamafile* filter=lfs diff=lfs merge=lfs -text
38
+ *.pt2 filter=lfs diff=lfs merge=lfs -text
39
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
40
+ *.npy filter=lfs diff=lfs merge=lfs -text
41
+ *.npz filter=lfs diff=lfs merge=lfs -text
42
+ *.pickle filter=lfs diff=lfs merge=lfs -text
43
+ *.pkl filter=lfs diff=lfs merge=lfs -text
44
+ *.tar filter=lfs diff=lfs merge=lfs -text
45
+ *.wasm filter=lfs diff=lfs merge=lfs -text
46
+ *.zst filter=lfs diff=lfs merge=lfs -text
47
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
48
+ model-00001-of-00005.safetensors filter=lfs diff=lfs merge=lfs -text
49
+ model-00002-of-00005.safetensors filter=lfs diff=lfs merge=lfs -text
50
+ model-00003-of-00005.safetensors filter=lfs diff=lfs merge=lfs -text
51
+ model-00004-of-00005.safetensors filter=lfs diff=lfs merge=lfs -text
52
+ model-00005-of-00005.safetensors filter=lfs diff=lfs merge=lfs -text
53
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
+ assets/AI-Flow-Ruyi-logo.png filter=lfs diff=lfs merge=lfs -text
54
+ assets/ai-flow.png filter=lfs diff=lfs merge=lfs -text
.msc ADDED
Binary file (1.77 kB).
 
.mv ADDED
@@ -0,0 +1 @@
1
+ Revision:master,CreatedAt:1753426552
README.md CHANGED
@@ -1,3 +1,295 @@
1
- ---
2
- license: apache-2.0
3
- ---
1
+ ---
2
+ frameworks:
3
+ - Pytorch
4
+ license: apache-2.0
5
+ tasks:
6
+ - text-generation
7
+ ---
8
+
9
+ # AI-Flow-Ruyi (如意大模型)
10
+
11
+ <p align="center">
12
+ <img src="assets/AI-Flow-Ruyi-logo.png" width="500" />
13
+ </p>
14
+
15
+ <p align="center">
16
+ <a href="README.md">中文</a> &nbsp | &nbsp <a href="README_en.md">English</a>
17
+ <br>
18
+ 🐱 <a href="https://github.com/TeleAI-AI-Flow/AI-Flow-Ruyi">GitHub</a> &nbsp&nbsp | &nbsp&nbsp 🤗 <a href="https://huggingface.co/TeleAI-AI-Flow/AI-Flow-Ruyi-7B-Preview0704">Hugging Face</a>&nbsp&nbsp | &nbsp&nbsp🤖 <a href="https://www.modelscope.cn/models/TeleAI-AI-Flow/AI-Flow-Ruyi-7B-Preview0704/">ModelScope</a>&nbsp&nbsp | &nbsp&nbsp 📑&nbsp <a href="https://www.arxiv.org/abs/2506.12479">Paper</a>
19
+ </p>
20
+
21
+ #### Long long ago...
22
+ > In the Dragon Palace lay a treasured magic staff that could grow and shrink at will, changing without end. One day the idle Dragon King sighed at it: "With such powers, how good it would be if you could also help our dragon clan with other matters." Before he had finished, the staff replied: "I have an idea. If this power of transformation were used to help people solve their problems..." No sooner said than done: the staff transformed itself into a mighty "Ruyi" large model that can stretch or shrink its capability according to how hard a question is. Overjoyed, the Dragon King exclaimed: "Is this not exactly the wish-fulfilling 'Ruyi' treasure that relieves people's troubles?" He therefore named it "Ruyi" and sent it to the human world to help people.
23
+
24
+ ## News
25
+
26
+ * 🎉🎉[2025/7/25]: Official release of Ruyi-7B (AI-Flow-Ruyi-7B)!
27
+ * 🎉🎉[2025/7/14]: AI Flow was covered by the well-known Chinese tech media outlet [机器之心 (Synced)](https://mp.weixin.qq.com/s/fiyb3LyJOd5mr9xzAsDZ4A)!
28
+ * 🎉🎉[2025/7/4]: AI Flow was featured in a short "On the Radar" note by global analyst firm [Omdia](https://omdia.tech.informa.com/om137892/on-the-radar-teleai-brings-intelligence-to-the-network-edge-through-ai-flow), listed as a generative-AI application to watch.
29
+ * 🎉🎉[2025/7/4]: Ruyi-7B Preview (AI-Flow-Ruyi-7B-Preview) released!
30
+
31
+ ## Introduction
32
+
33
+ **AI-Flow-Ruyi (如意大模型)** is a **familial model** developed by the AI Flow team at the Institute of Artificial Intelligence (TeleAI), China Telecom, for the next-generation device-edge-cloud model serving architecture. Its core idea is that the large and small models share the same familial parameters: using an early-exit mechanism, the model answers each query with a branch whose parameter scale matches the question's complexity. Each branch can run independently, while the shared parameters enable information sharing and seamless switching between branches; combined with distributed device-edge-cloud deployment, the model family collaborates during inference, greatly improving distributed inference efficiency.
34
+
35
+ ![](assets/ai-flow.png)
36
+ ![](assets/ruyi_model.png)
37
+
38
+ ## Ruyi-7B (AI-Flow-Ruyi-7B)
39
+
40
+ To give the community hands-on experience with a freely scalable "familial model", we open-source Ruyi-7B (AI-Flow-Ruyi-7B) to demonstrate our commitment to putting the technology into practice. Ruyi-7B was released on July 25. Its largest branch has 7B parameters and can spawn early-exit branches with equivalent parameter counts of 3B, 4B, 5B, and 6B:
41
+ * The 3B and 4B branches focus on simple dialogue scenarios; their strengths are fast responses and low resource requirements;
42
+ * The 5B and 6B branches target everyday general-purpose tasks, balancing capability against response speed;
43
+ * The 7B branch is meant for complex problems and shows more well-rounded capabilities across dimensions, at the cost of somewhat slower responses and higher resource demands.
44
+
45
+ |Position No.|Early-Exit Layer|Equivalent Model Size|Branch Designation|Target Scenario|
46
+ |:-:|:-:|:-:|:-:|:-:|
47
+ |1|Layer 11|3B|AI-Flow-Ruyi-7B-E3B|Simple dialogue|
48
+ |2|Layer 15|4B|AI-Flow-Ruyi-7B-E4B|Simple dialogue|
49
+ |3|Layer 19|5B|AI-Flow-Ruyi-7B-E5B|Daily tasks|
50
+ |4|Layer 23|6B|AI-Flow-Ruyi-7B-E6B|Daily tasks|
51
+ |5|Layer 27|7B|AI-Flow-Ruyi-7B-E7B|Complex problems|
52
+
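+ The branch can be chosen per request. As a minimal sketch (the routing rule and thresholds below are purely illustrative and are not part of the released code; only `set_global_val` and the exit-point values above come from this repo), a caller might map request complexity to the exit points in the table:
+
+ ```py
+ from ruyi.global_var import set_global_val
+
+ # Exit points from the table above: rough equivalent size -> decoder layer.
+ EXIT_POINTS = {"3B": 11, "4B": 15, "5B": 19, "6B": 23, "7B": 27}
+
+ def choose_exit_point(prompt: str) -> int:
+     """Toy heuristic: longer or code/reasoning-like prompts get a deeper branch.
+     A real deployment would use a learned or rule-based router instead."""
+     if len(prompt) < 64:
+         return EXIT_POINTS["3B"]   # short chit-chat
+     if any(k in prompt for k in ("def ", "class ", "prove", "solve")):
+         return EXIT_POINTS["7B"]   # code / reasoning-heavy
+     return EXIT_POINTS["5B"]       # everyday tasks
+
+ set_global_val("early_exit_point", choose_exit_point("Hello there!"))
+ ```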
53
+ ### Training process
54
+
55
+ Before training, we initialized the 7B main branch from the Qwen team's pre-trained [Qwen2.5-7B](https://arxiv.org/abs/2412.15115) model (itself pre-trained on 18 trillion high-quality tokens); for the early-exit branches, the decoder layers were initialized with the parameters of the layer following each early-exit position.
56
+
57
+ After initialization, we applied **multi-branch joint pre-training**, continuing pre-training on roughly 400 billion tokens from proprietary high-quality datasets to obtain the Ruyi-7B base model (AI-Flow-Ruyi-7B-Base).
58
+
59
+ We then performed **joint instruction-following fine-tuning** across all branches on about 700 thousand high-quality instruction samples, yielding Ruyi-7B.
60
+
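+ For intuition only, the branch initialization described above can be pictured as copying the decoder layer that follows each early-exit position into the branch's dedicated exit layer. The attribute names below follow the weight map shipped with this checkpoint (`model.layers`, `model.eelayers`), and the indexing is one plausible reading of the description, not an official training script:
+
+ ```py
+ import torch
+ from transformers import AutoModelForCausalLM
+
+ m = AutoModelForCausalLM.from_pretrained(
+     "models/AI-Flow-Ruyi-7B-0725", trust_remote_code=True, torch_dtype=torch.bfloat16
+ ).model
+
+ early_exit_points = [11, 15, 19, 23, 27]
+ # The shipped weight map lists four dedicated eelayers, so zip pairs them
+ # with the first four exit points; each is seeded from the following main layer.
+ for exit_layer, pos in zip(m.eelayers, early_exit_points):
+     exit_layer.load_state_dict(m.layers[pos + 1].state_dict())
+ ```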
61
+ ### Performance evaluation
62
+
63
+ We evaluate on multiple datasets in a 0-shot setting, using [OpenCompass](https://github.com/open-compass/opencompass) with its official configuration files.
64
+
65
+ <details>
66
+ <summary>General task evaluation</summary>
67
+
68
+ |Model|MMLU|MMLU-Pro|CMMLU|BBH|ARC-c|HellaSwag|Winogrande|Mean|
69
+ |:-:|:-:|:-:|:-:|:-:|:-:|:-:|:-:|:-:|
70
+ |Qwen3-8B(think)|74.78|66.02|76.33|60.68|63.39|66.11|56.25|66.22|
71
+ |Llama3.1-8B-Instruct|53.16|45.36|51.65|72.47|83.73|71.37|58.54|62.33|
72
+ |Qwen2.5-7B-Instruct|70.88|56.33|75.71|51.51|86.44|81.13|68.30|70.04|
73
+ |AI-Flow-Ruyi-7B-E7B-0725<b>(ours)</b>|64.78|56.39|76.17|81.37|82.71|76.69|63.22|71.62|
74
+
75
+ </details>
76
+
77
+ <details>
78
+ <summary>Code task evaluation</summary>
79
+
80
+ |Model|HumanEval|MBPP|LiveCodeBench|Mean|
81
+ |:-:|:-:|:-:|:-:|:-:|
82
+ |Qwen3-8B(think)|84.76|78.60|63.10|75.49|
83
+ |Qwen2.5-7B-Instruct|63.41|68.48|8.15|46.68|
84
+ |Llama3.1-8B-Instruct|84.15|70.82|34.55|63.17|
85
+ |AI-Flow-Ruyi-7B-E7B-0725<b>(ours)</b>|76.83|77.04|28.44|60.77|
86
+
87
+ </details>
88
+
89
+ <details>
90
+ <summary>STEM task evaluation</summary>
91
+
92
+ |Model|GPQA|Math|GSM-8K|Mean|
93
+ |:-:|:-:|:-:|:-:|:-:|
94
+ |Qwen3-8B(think)|38.38|83.84|93.03|71.75|
95
+ |Qwen2.5-7B-Instruct|25.25|49.22|85.82|53.43|
96
+ |Llama3.1-8B-Instruct|35.35|73.66|88.48|65.83|
97
+ |AI-Flow-Ruyi-7B-E7B-0725<b>(ours)</b>|30.30|72.18|91.36|64.61|
98
+
99
+ </details>
100
+
101
+
102
+ Meanwhile, the performance of each early-exit branch increases monotonically with its equivalent parameter count.
103
+
104
+ |Model|MMLU|MMLU-Pro|CMMLU|BBH|ARC-c|HellaSwag|Winogrande|Mean|
105
+ |:-:|:-:|:-:|:-:|:-:|:-:|:-:|:-:|:-:|
106
+ |AI-Flow-Ruyi-7B-E3B-0725<b>(ours)</b>|34.67|17.49|43.99|31.63|47.12|31.20|49.59|36.53|
107
+ |AI-Flow-Ruyi-7B-E4B-0725<b>(ours)</b>|52.63|30.10|45.04|50.94|77.63|61.63|51.99|52.85|
108
+ |AI-Flow-Ruyi-7B-E5B-0725<b>(ours)</b>|61.09|48.54|66.64|75.41|82.03|74.91|61.46|67.15|
109
+ |AI-Flow-Ruyi-7B-E6B-0725<b>(ours)</b>|63.96|53.98|74.95|79.33|81.36|76.64|62.96|70.45|
110
+ |AI-Flow-Ruyi-7B-E7B-0725<b>(ours)</b>|64.78|56.39|76.17|81.37|82.71|76.69|63.22|71.62|
111
+
112
+
113
+ <details>
114
+ <summary>[History] Ruyi-7B Preview (AI-Flow-Ruyi-7B-Preview)</summary>
115
+
116
+ ## Ruyi-7B Preview (AI-Flow-Ruyi-7B-Preview)
117
+
118
+ To give the community hands-on experience with a freely scalable "familial model", we open-sourced Ruyi-7B Preview (AI-Flow-Ruyi-7B-Preview) to demonstrate our commitment to putting the technology into practice. Ruyi-7B Preview was released on July 4. Its largest branch has 7B parameters and can spawn early-exit branches with equivalent parameter counts of 3B, 4B, 5B, and 6B:
119
+ * The 3B and 4B branches focus on simple dialogue scenarios; their strengths are fast responses and low resource requirements;
120
+ * The 5B and 6B branches target everyday general-purpose tasks, balancing capability against response speed;
121
+ * The 7B branch is meant for complex problems and shows more well-rounded capabilities across dimensions, at the cost of somewhat slower responses and higher resource demands.
122
+
123
+ |Position No.|Early-Exit Layer|Equivalent Model Size|Branch Designation|Target Scenario|
124
+ |:-:|:-:|:-:|:-:|:-:|
125
+ |1|Layer 11|3B|AI-Flow-Ruyi-7B-E3B|Simple dialogue|
126
+ |2|Layer 15|4B|AI-Flow-Ruyi-7B-E4B|Simple dialogue|
127
+ |3|Layer 19|5B|AI-Flow-Ruyi-7B-E5B|Daily tasks|
128
+ |4|Layer 23|6B|AI-Flow-Ruyi-7B-E6B|Daily tasks|
129
+ |5|Layer 27|7B|AI-Flow-Ruyi-7B-E7B|Complex problems|
130
+
131
+ ### Training process
132
+
133
+ Before training, we initialized the 7B main branch from the Qwen team's pre-trained [Qwen2.5-7B](https://arxiv.org/abs/2412.15115) model (itself pre-trained on 18 trillion high-quality tokens); for the early-exit branches, the decoder layers were initialized with the parameters of the layer following each early-exit position.
134
+
135
+ After initialization, we applied **multi-branch joint pre-training**, continuing pre-training on roughly 400 billion tokens from proprietary high-quality datasets to obtain the Ruyi-7B base model (AI-Flow-Ruyi-7B-Base).
136
+
137
+ We then performed **joint instruction-following fine-tuning** across all branches on about 1.2 million high-quality instruction samples, yielding Ruyi-7B Preview.
138
+
139
+ ### Performance evaluation
140
+
141
+ We evaluate on multiple datasets in a 0-shot setting, using [OpenCompass](https://github.com/open-compass/opencompass) with its official configuration files. The results show that the 7B main branch is roughly on par with Qwen2.5-7B-Instruct on general-purpose tasks.
142
+
143
+ <details>
144
+ <summary>General task evaluation</summary>
145
+
146
+ |Model|MMLU|MMLU-Pro|CMMLU|ARC-c|BBH|Mean|
147
+ |:-:|:-:|:-:|:-:|:-:|:-:|:-:|
148
+ |Qwen3-8B(think)|74.78|66.02|76.33|63.39|60.68|68.24|
149
+ |Qwen2.5-7B-Instruct|70.88|56.33|75.71|86.44|51.51|68.17|
150
+ |Llama-3.1-8B-Instruct|53.16|45.36|51.65|83.73|72.47|61.27|
151
+ |AI-Flow-Ruyi-7B-E7B<b>(ours)</b>|87.19|59.78|48.14|69.83|74.47|67.88|
152
+
153
+ </details>
154
+
155
+ <details>
156
+ <summary>Code task evaluation</summary>
157
+
158
+ |Model|MBPP|HumanEval|LiveCodeBench|Mean|
159
+ |:-:|:-:|:-:|:-:|:-:|
160
+ |Qwen3-8B(think)|78.60|84.76|63.10|75.49|
161
+ |Qwen2.5-7B-Instruct|70.82|84.15|34.55|63.17|
162
+ |Llama3.1-8B-Instruct|68.48|63.41|8.15|46.68|
163
+ |AI-Flow-Ruyi-7B-E7B<b>(ours)</b>|66.93|64.63|30.01|53.86|
164
+
165
+ </details>
166
+
167
+ <details>
168
+ <summary>STEM task evaluation</summary>
169
+
170
+ |Model|Math|GPQA|GSM-8K|Mean|
171
+ |:-:|:-:|:-:|:-:|:-:|
172
+ |Qwen3-8B(think)|83.84|38.38|93.03|71.75|
173
+ |Qwen2.5-7B-Instruct|73.66|35.35|88.48|65.83|
174
+ |Llama3.1-8B-Instruct|49.22|25.25|85.82|53.43|
175
+ |AI-Flow-Ruyi-7B-E7B<b>(ours)</b>|44.94|24.75|81.65|50.45|
176
+
177
+ </details>
178
+
179
+
180
+ Meanwhile, the performance of each early-exit branch increases monotonically with its equivalent parameter count.
181
+
182
+ |Model|MMLU|MMLU-Pro|CMMLU|ARC-c|BBH|Mean|
183
+ |:-:|:-:|:-:|:-:|:-:|:-:|:-:|
184
+ |AI-Flow-Ruyi-7B-E3B<b>(ours)</b>|66.93|44.70|19.80|40.00|32.29|40.74|
185
+ |AI-Flow-Ruyi-7B-E4B<b>(ours)</b>|78.86|48.60|26.51|58.98|41.98|50.99|
186
+ |AI-Flow-Ruyi-7B-E5B<b>(ours)</b>|75.34|49.13|33.91|65.76|64.48|57.72|
187
+ |AI-Flow-Ruyi-7B-E6B<b>(ours)</b>|84.58|53.06|33.94|73.22|47.33|58.43|
188
+ |AI-Flow-Ruyi-7B-E7B<b>(ours)</b>|87.19|59.78|48.14|69.83|74.47|67.88|
189
+
190
+ </details>
191
+
192
+ ## Usage
193
+
194
+ Step 1. Create and activate a virtual environment
195
+
196
+ ```sh
197
+ conda create -n ruyi python=3.12
198
+ conda activate ruyi
199
+ ```
200
+
201
+ Step 2. Clone this repository locally
202
+
203
+ ```sh
204
+ git clone https://github.com/TeleAI-AI-Flow/AI-Flow-Ruyi.git
205
+ cd AI-Flow-Ruyi
206
+ ```
207
+
208
+ Step 3. Install from source (note: compiling flash_attn from source is slow; we recommend downloading a prebuilt wheel from the [official repository](https://github.com/Dao-AILab/flash-attention/releases/tag/v2.7.4.post1) and installing it manually)
209
+
210
+ ```sh
211
+ pip install -e .
212
+ ```
213
+
214
+ Step 4. Download the model weights
215
+
216
+ ```sh
217
+ git clone https://www.modelscope.cn/TeleAI-AI-Flow/AI-Flow-Ruyi-7B-0725.git models/AI-Flow-Ruyi-7B-0725
218
+ ```
219
+
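+ Alternatively, the same snapshot can likely be fetched through ModelScope's Python API; this is an assumption based on the standard `snapshot_download` helper, with the repo id taken from the git URL above:
+
+ ```py
+ from modelscope import snapshot_download
+
+ # Downloads (or reuses a cached copy of) the released weights and returns the local path.
+ path = snapshot_download("TeleAI-AI-Flow/AI-Flow-Ruyi-7B-0725")
+ print(path)  # point model_path in the demo below at this directory
+ ```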
220
+ Step 5. Run the demo
221
+
222
+ ```sh
223
+ python demo.py
224
+ ```
225
+
226
+ <details>
227
+ <summary>View demo code</summary>
228
+
229
+ ```py
230
+ import torch
231
+ from ruyi.global_var import set_global_val
232
+ from transformers import GenerationConfig
233
+ from transformers import AutoModelForCausalLM, AutoTokenizer
234
+
235
+
236
+ model_path = f"models/AI-Flow-Ruyi-7B-0725"
237
+ tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
238
+ model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True, attn_implementation='flash_attention_2', torch_dtype=torch.bfloat16).to('cuda')
239
+
240
+
241
+ generation_config = GenerationConfig(
242
+ do_sample=True,
243
+ top_k=30,
244
+ top_p=0.95,
245
+ temperature=0.6,
246
+ repetition_penalty=1.2,
247
+ no_repeat_ngram_size=3,
248
+ max_new_tokens=8192
249
+ )
250
+
251
+ # Input text
252
+ messages = [
253
+ {"role": "user", "content": "Introduce yourself."},
254
+ ]
255
+
256
+ # Apply the chat template
257
+ prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
258
+ inputs = tokenizer(prompt, return_tensors="pt")
259
+
260
+ # Model generation
261
+ with torch.no_grad():
262
+ # Set the early exit point
263
+ # - 11: first early-exit point, equivalent to about 3B
264
+ # - 15: second early-exit point, equivalent to about 4B
265
+ # - 19: third early-exit point, equivalent to about 5B
266
+ # - 23: fourth early-exit point, equivalent to about 6B
267
+ # - 27: fifth early-exit point, equivalent to about 7B
268
+ set_global_val("early_exit_point", 11)
269
+
270
+ output = model.generate(
271
+ inputs["input_ids"].to('cuda'),
272
+ generation_config=generation_config
273
+ )
274
+
275
+ # Decode and print the result
276
+ generated_text = tokenizer.decode(output[0], skip_special_tokens=False)
277
+ print(generated_text)
278
+ ```
279
+
280
+ </details>
281
+
282
+ ## Citation
283
+
284
+ ```bibtex
285
+ @misc{an2025aiflowperspectivesscenarios,
286
+ title={AI Flow: Perspectives, Scenarios, and Approaches},
287
+ author={Hongjun An and Wenhan Hu and Sida Huang and Siqi Huang and Ruanjun Li and Yuanzhi Liang and Jiawei Shao and Yiliang Song and Zihan Wang and Cheng Yuan and Chi Zhang and Hongyuan Zhang and Wenhao Zhuang and Xuelong Li},
288
+ year={2025},
289
+ eprint={2506.12479},
290
+ archivePrefix={arXiv},
291
+ primaryClass={cs.AI},
292
+ url={https://arxiv.org/abs/2506.12479},
293
+ }
294
+ ```
295
+
README_en.md ADDED
@@ -0,0 +1,294 @@
1
+ ---
2
+ frameworks:
3
+ - Pytorch
4
+ license: Apache License 2.0
5
+ tasks:
6
+ - text-generation
7
+ ---
8
+
9
+ # AI-Flow-Ruyi (如意大模型)
10
+
11
+ <p align="center">
12
+ <img src="assets/AI-Flow-Ruyi-logo.png" width="500" />
13
+ </p>
14
+
15
+ <p align="center">
16
+ <a href="README.md">中文</a> &nbsp | &nbsp <a href="README_en.md">English</a>
17
+ <br>
18
+ 🐱 <a href="https://github.com/TeleAI-AI-Flow/AI-Flow-Ruyi">GitHub</a> &nbsp&nbsp | &nbsp&nbsp 🤗 <a href="https://huggingface.co/TeleAI-AI-Flow/AI-Flow-Ruyi-7B-Preview0704">Hugging Face</a>&nbsp&nbsp | &nbsp&nbsp🤖 <a href="https://www.modelscope.cn/models/TeleAI-AI-Flow/AI-Flow-Ruyi-7B-Preview0704/">ModelScope</a>&nbsp&nbsp | &nbsp&nbsp 📑&nbsp <a href="https://www.arxiv.org/abs/2506.12479">Paper</a>
19
+ </p>
20
+
21
+ ## News
22
+
23
+ * 🎉🎉[2025/7/25]:AI-Flow-Ruyi-7B released!
24
+ * 🎉🎉[2025/7/4]:TeleAI’s AI Flow is now on the radar of global analyst firm [Omdia](https://omdia.tech.informa.com/om137892/on-the-radar-teleai-brings-intelligence-to-the-network-edge-through-ai-flow) as a generative-AI solution to watch.
25
+ * 🎉🎉[2025/7/4]:AI-Flow-Ruyi-7B-Preview released!
26
+
27
+ ## Introduction
28
+
29
+ **AI-Flow-Ruyi** is a **Familial Model** developed by the AI Flow team of the Institute of Artificial Intelligence (TeleAI), China Telecom. Designed for next-generation "Device-Edge-Cloud" model service architectures, its core innovation lies in **shared familial parameters** across large and small models. Leveraging an **early-exit mechanism**, the system dynamically routes queries to branch models of appropriate parameter sizes based on problem complexity. These branches operate independently while enabling **information sharing** and **seamless transitions** through their shared features. Combined with distributed Device-Edge-Cloud deployment, this facilitates **collaborative inference** within the model family, significantly enhancing distributed reasoning efficiency.
30
+
31
+ ![](assets/ai-flow.png)
32
+ ![](assets/ruyi_model.png)
33
+
34
+ ## AI-Flow-Ruyi-7B
35
+
36
+ To give the community a hands-on experience with a truly elastic “family of models,” we are open-sourcing the Ruyi-7B (AI-Flow-Ruyi-7B), released on 25 July. Its largest branch contains 7 billion parameters and can spawn early-exit sub-networks with effective parameter counts of 3 B, 4 B, 5 B, and 6 B:
37
+
38
+ Key branch specializations:
39
+ * **3B/4B branches**: Optimized for simple dialogue scenarios, delivering **faster response times** with **minimal resource consumption**
40
+ * **5B/6B branches**: Targeting daily general-purpose tasks, **striking a balance** between capability and responsiveness
41
+ * **7B branch**: Designed for complex problem-solving, **exhibiting more well-rounded capabilities** across multiple dimensions – though with **moderately slower inference speeds** and **higher resource demands**
42
+
43
+ |Position No.|Early-Exit Layer|Equivalent Model Size|Branch Designation|Target Scenario|
44
+ |:-:|:-:|:-:|:-:|:-:|
45
+ |1|Layer 11|3B|AI-Flow-Ruyi-7B-E3B|Simple dialogue|
46
+ |2|Layer 15|4B|AI-Flow-Ruyi-7B-E4B|Simple dialogue|
47
+ |3|Layer 19|5B|AI-Flow-Ruyi-7B-E5B|Daily tasks|
48
+ |4|Layer 23|6B|AI-Flow-Ruyi-7B-E6B|Daily tasks|
49
+ |5|Layer 27|7B|AI-Flow-Ruyi-7B-E7B|Complex problems|
50
+
51
+ ### Training process
52
+
53
+ Prior to training initiation, we initialized parameters for the 7B main branch using Qwen team's pre-trained [Qwen2.5-7B](https://arxiv.org/abs/2412.15115) (pre-trained on 18 trillion high-quality tokens). For early-exit branches, decoder layers were initialized with parameters from the subsequent layer of their respective early-exit positions.
54
+
55
+ Following initialization, we conducted **multi-branch joint pre-training** with approximately 400 billion tokens on proprietary high-quality datasets, resulting in the AI-Flow-Ruyi-7B-Base foundation model.
56
+
57
+ Subsequently, we performed **multi-branch joint instruction-following fine-tuning** across all branches using ~0.7 million high-quality instruction samples, yielding the AI-Flow-Ruyi-7B.
58
+
59
+ ### Performance evaluation
60
+
61
+ We evaluate on multiple datasets in a 0-shot setting, using [OpenCompass](https://github.com/open-compass/opencompass) with its official configuration files.
62
+
63
+ <details>
64
+ <summary>General task evaluation</summary>
65
+
66
+ |Model|MMLU|MMLU-Pro|CMMLU|BBH|ARC-c|HellaSwag|Winogrande|Mean|
67
+ |:-:|:-:|:-:|:-:|:-:|:-:|:-:|:-:|:-:|
68
+ |Qwen3-8B(think)|74.78|66.02|76.33|60.68|63.39|66.11|56.25|66.22|
69
+ |Llama3.1-8B-Instruct|53.16|45.36|51.65|72.47|83.73|71.37|58.54|62.33|
70
+ |Qwen2.5-7B-Instruct|70.88|56.33|75.71|51.51|86.44|81.13|68.30|70.04|
71
+ |AI-Flow-Ruyi-7B-E7B-0725<b>(ours)</b>|64.78|56.39|76.17|81.37|82.71|76.69|63.22|71.62|
72
+
73
+ </details>
74
+
75
+ <details>
76
+ <summary>Code task evaluation</summary>
77
+
78
+ |Model|HumanEval|MBPP|LiveCodeBench|Mean|
79
+ |:-:|:-:|:-:|:-:|:-:|
80
+ |Qwen3-8B(think)|84.76|78.60|63.10|75.49|
81
+ |Qwen2.5-7B-Instruct|63.41|68.48|8.15|46.68|
82
+ |Llama3.1-8B-Instruct|84.15|70.82|34.55|63.17|
83
+ |AI-Flow-Ruyi-7B-E7B-0725<b>(ours)</b>|76.83|77.04|28.44|60.77|
84
+
85
+ </details>
86
+
87
+ <details>
88
+ <summary>STEM task evaluation</summary>
89
+
90
+ |Model|GPQA|Math|GSM-8K|Mean|
91
+ |:-:|:-:|:-:|:-:|:-:|
92
+ |Qwen3-8B(think)|38.38|83.84|93.03|71.75|
93
+ |Qwen2.5-7B-Instruct|25.25|49.22|85.82|53.43|
94
+ |Llama3.1-8B-Instruct|35.35|73.66|88.48|65.83|
95
+ |AI-Flow-Ruyi-7B-E7B-0725<b>(ours)</b>|30.30|72.18|91.36|64.61|
96
+
97
+ </details>
98
+
99
+
100
+ Meanwhile, the performance of each early-exit branch increases monotonically with its equivalent parameter count.
101
+
102
+ |Model|MMLU|MMLU-Pro|CMMLU|BBH|ARC-c|HellaSwag|Winogrande|Mean|
103
+ |:-:|:-:|:-:|:-:|:-:|:-:|:-:|:-:|:-:|
104
+ |AI-Flow-Ruyi-7B-E3B-0725<b>(ours)</b>|34.67|17.49|43.99|31.63|47.12|31.20|49.59|36.53|
105
+ |AI-Flow-Ruyi-7B-E4B-0725<b>(ours)</b>|52.63|30.10|45.04|50.94|77.63|61.63|51.99|52.85|
106
+ |AI-Flow-Ruyi-7B-E5B-0725<b>(ours)</b>|61.09|48.54|66.64|75.41|82.03|74.91|61.46|67.15|
107
+ |AI-Flow-Ruyi-7B-E6B-0725<b>(ours)</b>|63.96|53.98|74.95|79.33|81.36|76.64|62.96|70.45|
108
+ |AI-Flow-Ruyi-7B-E7B-0725<b>(ours)</b>|64.78|56.39|76.17|81.37|82.71|76.69|63.22|71.62|
109
+
110
+
111
+ <details>
112
+ <summary>[History] AI-Flow-Ruyi-7B-Preview</summary>
113
+
114
+ ## AI-Flow-Ruyi-7B-Preview
115
+
116
+ To give the community a hands-on experience with a truly elastic “family of models,” we are open-sourcing the Ruyi-7B Preview (AI-Flow-Ruyi-7B-Preview), released on 4 July. Its largest branch contains 7 billion parameters and can spawn early-exit sub-networks with effective parameter counts of 3 B, 4 B, 5 B, and 6 B:
117
+
118
+ Key branch specializations:
119
+ * **3B/4B branches**: Optimized for simple dialogue scenarios, delivering **faster response times** with **minimal resource consumption**
120
+ * **5B/6B branches**: Targeting daily general-purpose tasks, **striking a balance** between capability and responsiveness
121
+ * **7B branch**: Designed for complex problem-solving, **exhibiting more well-rounded capabilities** across multiple dimensions – though with **moderately slower inference speeds** and **higher resource demands**
122
+
123
+ |Position No.|Early-Exit Layer|Equivalent Model Size|Branch Designation|Target Scenario|
124
+ |:-:|:-:|:-:|:-:|:-:|
125
+ |1|Layer 11|3B|AI-Flow-Ruyi-7B-E3B|Simple dialogue|
126
+ |2|Layer 15|4B|AI-Flow-Ruyi-7B-E4B|Simple dialogue|
127
+ |3|Layer 19|5B|AI-Flow-Ruyi-7B-E5B|Daily tasks|
128
+ |4|Layer 23|6B|AI-Flow-Ruyi-7B-E6B|Daily tasks|
129
+ |5|Layer 27|7B|AI-Flow-Ruyi-7B-E7B|Complex problems|
130
+
131
+ ### Training process
132
+
133
+ Prior to training initiation, we initialized parameters for the 7B main branch using Qwen team's pre-trained [Qwen2.5-7B](https://arxiv.org/abs/2412.15115) (pre-trained on 18 trillion high-quality tokens). For early-exit branches, decoder layers were initialized with parameters from the subsequent layer of their respective early-exit positions.
134
+
135
+ Following initialization, we conducted **multi-branch joint pre-training** with approximately 400 billion tokens on proprietary high-quality datasets, resulting in the AI-Flow-Ruyi-7B-Base foundation model.
136
+
137
+ Subsequently, we performed **multi-branch joint instruction-following fine-tuning** across all branches using ~1.2 million high-quality instruction samples, yielding the AI-Flow-Ruyi-7B-Preview.
138
+
139
+ ### Performance evaluation
140
+
141
+ We evaluate on multiple datasets in a 0-shot setting, using [OpenCompass](https://github.com/open-compass/opencompass) with its official configuration files. The results show that the 7B main branch is roughly on par with Qwen2.5-7B-Instruct on general-purpose tasks.
142
+
143
+ <details>
144
+ <summary>General task evaluation</summary>
145
+
146
+ |Model|MMLU|MMLU-Pro|CMMLU|ARC-c|BBH|Mean|
147
+ |:-:|:-:|:-:|:-:|:-:|:-:|:-:|
148
+ |Qwen3-8B(think)|74.78|66.02|76.33|63.39|60.68|68.24|
149
+ |Qwen2.5-7B-Instruct|70.88|56.33|75.71|86.44|51.51|68.17|
150
+ |Llama-3.1-8B-Instruct|53.16|45.36|51.65|83.73|72.47|61.27|
151
+ |AI-Flow-Ruyi-7B-E7B<b>(ours)</b>|87.19|59.78|48.14|69.83|74.47|67.88|
152
+
153
+ </details>
154
+
155
+ <details>
156
+ <summary>Code task evaluation</summary>
157
+
158
+ |Model|MBPP|HumanEval|LiveCodeBench|Mean|
159
+ |:-:|:-:|:-:|:-:|:-:|
160
+ |Qwen3-8B(think)|78.60|84.76|63.10|75.49|
161
+ |Qwen2.5-7B-Instruct|70.82|84.15|34.55|63.17|
162
+ |Llama3.1-8B-Instruct|68.48|63.41|8.15|46.68|
163
+ |AI-Flow-Ruyi-7B-E7B<b>(ours)</b>|66.93|64.63|30.01|53.86|
164
+
165
+ </details>
166
+
167
+ <details>
168
+ <summary>STEM task evaluation</summary>
169
+
170
+ |Model|Math|GPQA|GSM-8K|Mean|
171
+ |:-:|:-:|:-:|:-:|:-:|
172
+ |Qwen3-8B(think)|83.84|38.38|93.03|71.75|
173
+ |Qwen2.5-7B-Instruct|73.66|35.35|88.48|65.83|
174
+ |Llama3.1-8B-Instruct|49.22|25.25|85.82|53.43|
175
+ |AI-Flow-Ruyi-7B-E7B<b>(ours)</b>|44.94|24.75|81.65|50.45|
176
+
177
+ </details>
178
+
179
+
180
+ Meanwhile, the performance of each early-exit branch increases monotonically with its equivalent parameter count.
181
+
182
+ |Model|MMLU|MMLU-Pro|CMMLU|ARC-c|BBH|Mean|
183
+ |:-:|:-:|:-:|:-:|:-:|:-:|:-:|
184
+ |AI-Flow-Ruyi-7B-E3B<b>(ours)</b>|66.93|44.70|19.80|40.00|32.29|40.74|
185
+ |AI-Flow-Ruyi-7B-E4B<b>(ours)</b>|78.86|48.60|26.51|58.98|41.98|50.99|
186
+ |AI-Flow-Ruyi-7B-E5B<b>(ours)</b>|75.34|49.13|33.91|65.76|64.48|57.72|
187
+ |AI-Flow-Ruyi-7B-E6B<b>(ours)</b>|84.58|53.06|33.94|73.22|47.33|58.43|
188
+ |AI-Flow-Ruyi-7B-E7B<b>(ours)</b>|87.19|59.78|48.14|69.83|74.47|67.88|
189
+
190
+ </details>
191
+
192
+ ## Usage
193
+
194
+ Step 1. Create and activate a virtual environment
195
+
196
+ ```sh
197
+ conda create -n ruyi python=3.12
198
+ conda activate ruyi
199
+ ```
200
+
201
+ Step 2. Clone this repository locally
202
+
203
+ ```sh
204
+ git clone https://github.com/TeleAI-AI-Flow/AI-Flow-Ruyi.git
205
+ cd AI-Flow-Ruyi
206
+ ```
207
+
208
+ Step 3. Install from source (note: compiling flash_attn from source is slow; we recommend downloading a prebuilt wheel from the [official repository](https://github.com/Dao-AILab/flash-attention/releases/tag/v2.7.4.post1) and installing it manually)
209
+
210
+ ```sh
211
+ pip install -e .
212
+ ```
213
+
214
+ Step 4. Download model weights
215
+
216
+ ```sh
217
+ git clone https://www.modelscope.cn/TeleAI-AI-Flow/AI-Flow-Ruyi-7B-0725.git models/AI-Flow-Ruyi-7B-0725
218
+ ```
219
+
220
+ Step 5. Run the demo
221
+
222
+ ```sh
223
+ python demo.py
224
+ ```
225
+
226
+ <details>
227
+ <summary>View demo code</summary>
228
+
229
+ ```py
230
+ import torch
231
+ from ruyi.global_var import set_global_val
232
+ from transformers import GenerationConfig
233
+ from transformers import AutoModelForCausalLM, AutoTokenizer
234
+
235
+
236
+ model_path = f"models/AI-Flow-Ruyi-7B-0725"
237
+ tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
238
+ model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True, attn_implementation='flash_attention_2', torch_dtype=torch.bfloat16).to('cuda')
239
+
240
+
241
+ generation_config = GenerationConfig(
242
+ do_sample=True,
243
+ top_k=30,
244
+ top_p=0.95,
245
+ temperature=0.6,
246
+ repetition_penalty=1.2,
247
+ no_repeat_ngram_size=3,
248
+ max_new_tokens=8192
249
+ )
250
+
251
+ # input text
252
+ messages = [
253
+ {"role": "user", "content": "Introduce yourself."},
254
+ ]
255
+
256
+ # Apply chat_template template
257
+ prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
258
+ inputs = tokenizer(prompt, return_tensors="pt")
259
+
260
+ # Model Generation
261
+ with torch.no_grad():
262
+ # Setting the early exit point
263
+ # - 11: First early exit point corresponding to about 3B.
264
+ # - 15: second early exit point, corresponding to approximately 4B.
265
+ # - 19: third early exit point, corresponding to about 5B.
266
+ # - 23: fourth early exit point, corresponding to approximately 6B.
267
+ # - 27: fifth early exit point, corresponding to about 7B.
268
+ set_global_val("early_exit_point", 11)
269
+
270
+ output = model.generate(
271
+ inputs["input_ids"].to('cuda'),
272
+ generation_config=generation_config
273
+ )
274
+
275
+ # Decode and print results
276
+ generated_text = tokenizer.decode(output[0], skip_special_tokens=False)
277
+ print(generated_text)
278
+ ```
279
+
280
+ </details>
281
+
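+ To compare the familial branches side by side, the demo above can be extended into a small sweep over all five exit points. This is only a sketch and reuses `model`, `tokenizer`, `generation_config`, and `set_global_val` exactly as defined in the demo:
+
+ ```py
+ # Generate with every early-exit branch on the same prompt (continuation of the demo above).
+ prompt = tokenizer.apply_chat_template(
+     [{"role": "user", "content": "Introduce yourself."}],
+     tokenize=False, add_generation_prompt=True,
+ )
+ input_ids = tokenizer(prompt, return_tensors="pt")["input_ids"].to("cuda")
+
+ for exit_point in (11, 15, 19, 23, 27):
+     set_global_val("early_exit_point", exit_point)
+     with torch.no_grad():
+         out = model.generate(input_ids, generation_config=generation_config)
+     print(f"--- exit point {exit_point} ---")
+     print(tokenizer.decode(out[0][input_ids.shape[1]:], skip_special_tokens=True))
+ ```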
282
+ ## Citation
283
+
284
+ ```bibtex
285
+ @misc{an2025aiflowperspectivesscenarios,
286
+ title={AI Flow: Perspectives, Scenarios, and Approaches},
287
+ author={Hongjun An and Wenhan Hu and Sida Huang and Siqi Huang and Ruanjun Li and Yuanzhi Liang and Jiawei Shao and Yiliang Song and Zihan Wang and Cheng Yuan and Chi Zhang and Hongyuan Zhang and Wenhao Zhuang and Xuelong Li},
288
+ year={2025},
289
+ eprint={2506.12479},
290
+ archivePrefix={arXiv},
291
+ primaryClass={cs.AI},
292
+ url={https://arxiv.org/abs/2506.12479},
293
+ }
294
+ ```
added_tokens.json ADDED
@@ -0,0 +1,24 @@
1
+ {
2
+ "</tool_call>": 151658,
3
+ "<tool_call>": 151657,
4
+ "<|box_end|>": 151649,
5
+ "<|box_start|>": 151648,
6
+ "<|endoftext|>": 151643,
7
+ "<|file_sep|>": 151664,
8
+ "<|fim_middle|>": 151660,
9
+ "<|fim_pad|>": 151662,
10
+ "<|fim_prefix|>": 151659,
11
+ "<|fim_suffix|>": 151661,
12
+ "<|im_end|>": 151645,
13
+ "<|im_start|>": 151644,
14
+ "<|image_pad|>": 151655,
15
+ "<|object_ref_end|>": 151647,
16
+ "<|object_ref_start|>": 151646,
17
+ "<|quad_end|>": 151651,
18
+ "<|quad_start|>": 151650,
19
+ "<|repo_name|>": 151663,
20
+ "<|video_pad|>": 151656,
21
+ "<|vision_end|>": 151653,
22
+ "<|vision_pad|>": 151654,
23
+ "<|vision_start|>": 151652
24
+ }
assets/AI-Flow-Ruyi-logo.png ADDED

Git LFS Details

  • SHA256: be263e5ed0f2147f0e7e44f06570616ecb7b16c6d72b9a944d0f7f7a3280503f
  • Pointer size: 131 Bytes
  • Size of remote file: 314 kB
assets/ai-flow.png ADDED

Git LFS Details

  • SHA256: 10e2779ae99bc1c5430a8453462c2263ccf15e27a01e7df58a22c67309ce3fed
  • Pointer size: 131 Bytes
  • Size of remote file: 178 kB
assets/logo.png ADDED
assets/ruyi_model.png ADDED
config.json ADDED
@@ -0,0 +1,43 @@
1
+ {
2
+ "architectures": [
3
+ "RuyiQwen2ForCausalLM"
4
+ ],
5
+ "attention_dropout": 0.0,
6
+ "auto_map": {
7
+ "AutoConfig": "configuration_ruyi_qwen2.RuyiQwen2Config",
8
+ "AutoModel": "modeling_ruyi_qwen2.RuyiQwen2Model",
9
+ "AutoModelForCausalLM": "modeling_ruyi_qwen2.RuyiQwen2ForCausalLM"
10
+ },
11
+ "bos_token_id": 151643,
12
+ "default_early_exit_point": 29,
13
+ "early_exit_points": [
14
+ 11,
15
+ 15,
16
+ 19,
17
+ 23,
18
+ 27
19
+ ],
20
+ "eos_token_id": 151643,
21
+ "hidden_act": "silu",
22
+ "hidden_size": 3584,
23
+ "initializer_range": 0.02,
24
+ "intermediate_size": 18944,
25
+ "max_position_embeddings": 131072,
26
+ "max_window_layers": 28,
27
+ "model_type": "ruyi_qwen2",
28
+ "num_attention_heads": 28,
29
+ "num_hidden_layers": 28,
30
+ "num_key_value_heads": 4,
31
+ "rms_norm_eps": 1e-06,
32
+ "rope_scaling": null,
33
+ "rope_theta": 1000000.0,
34
+ "shared_heads": false,
35
+ "sliding_window": 131072,
36
+ "tie_word_embeddings": false,
37
+ "torch_dtype": "bfloat16",
38
+ "transformers_version": "4.51.3",
39
+ "use_cache": true,
40
+ "use_mrope": false,
41
+ "use_sliding_window": false,
42
+ "vocab_size": 152064
43
+ }
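The early-exit metadata above can be checked programmatically once the repo is downloaded; a minimal sketch, assuming the local path from the README's usage section:

```py
from transformers import AutoConfig

# trust_remote_code pulls in configuration_ruyi_qwen2.py from the repo.
cfg = AutoConfig.from_pretrained("models/AI-Flow-Ruyi-7B-0725", trust_remote_code=True)
print(cfg.model_type)                # ruyi_qwen2
print(cfg.early_exit_points)         # [11, 15, 19, 23, 27]
print(cfg.default_early_exit_point)  # 29, as set above
```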
configuration.json ADDED
@@ -0,0 +1 @@
1
+ {"framework":"Pytorch","task":"text-generation"}
configuration_ruyi_qwen2.py ADDED
@@ -0,0 +1,119 @@
1
+ #!/usr/bin/env python
2
+ # Ref: https://github.com/huggingface/transformers/blob/v4.51.3/src/transformers/models/qwen2/configuration_qwen2.py
3
+ # Copyright (c) Institute of Artificial Intelligence (TeleAI), China Telecom, 2025. All Rights Reserved.
4
+ """RuyiQwen2 model configuration"""
5
+
6
+ import os
7
+ import shutil
8
+
9
+ from transformers.configuration_utils import PretrainedConfig
10
+ from transformers.modeling_rope_utils import rope_config_validation
11
+ from transformers.utils import logging
12
+
13
+
14
+ logger = logging.get_logger(__name__)
15
+
16
+
17
+ class RuyiQwen2Config(PretrainedConfig):
18
+
19
+ model_type = "ruyi_qwen2"
20
+ keys_to_ignore_at_inference = ["past_key_values"]
21
+
22
+ # Default tensor parallel plan for base model `RuyiQwen2`
23
+ base_model_tp_plan = {
24
+ "layers.*.self_attn.q_proj": "colwise",
25
+ "layers.*.self_attn.k_proj": "colwise",
26
+ "layers.*.self_attn.v_proj": "colwise",
27
+ "layers.*.self_attn.o_proj": "rowwise",
28
+ "layers.*.mlp.gate_proj": "colwise",
29
+ "layers.*.mlp.up_proj": "colwise",
30
+ "layers.*.mlp.down_proj": "rowwise",
31
+ "eelayers.*.self_attn.q_proj": "colwise",
32
+ "eelayers.*.self_attn.k_proj": "colwise",
33
+ "eelayers.*.self_attn.v_proj": "colwise",
34
+ "eelayers.*.self_attn.o_proj": "rowwise",
35
+ "eelayers.*.mlp.gate_proj": "colwise",
36
+ "eelayers.*.mlp.up_proj": "colwise",
37
+ "eelayers.*.mlp.down_proj": "rowwise"
38
+ }
39
+ base_model_pp_plan = {
40
+ "embed_tokens": (["input_ids"], ["inputs_embeds"]),
41
+ "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
42
+ "eelayers": (["hidden_states", "attention_mask"], ["hidden_states"]),
43
+ "norm": (["hidden_states"], ["hidden_states"]),
44
+ }
45
+
46
+ def __init__(
47
+ self,
48
+ vocab_size=151936,
49
+ hidden_size=4096,
50
+ intermediate_size=22016,
51
+ num_hidden_layers=32,
52
+ num_attention_heads=32,
53
+ num_key_value_heads=32,
54
+ hidden_act="silu",
55
+ max_position_embeddings=32768,
56
+ initializer_range=0.02,
57
+ rms_norm_eps=1e-6,
58
+ use_cache=True,
59
+ tie_word_embeddings=False,
60
+ rope_theta=10000.0,
61
+ rope_scaling=None,
62
+ use_sliding_window=False,
63
+ sliding_window=4096,
64
+ max_window_layers=28,
65
+ attention_dropout=0.0,
66
+
67
+ shared_heads=False,
68
+ default_early_exit_point=-1, # [0, num_hidden_layers-1], -1 = num_hidden_layers - 1
69
+ early_exit_points=list(range(1, 32, 2)),
70
+ **kwargs,
71
+ ):
72
+ self.vocab_size = vocab_size
73
+ self.max_position_embeddings = max_position_embeddings
74
+ self.hidden_size = hidden_size
75
+ self.intermediate_size = intermediate_size
76
+ self.num_hidden_layers = num_hidden_layers
77
+ self.num_attention_heads = num_attention_heads
78
+ self.use_sliding_window = use_sliding_window
79
+ self.sliding_window = sliding_window # we check `use_sliding_window` in the modeling code
80
+ self.max_window_layers = max_window_layers
81
+
82
+ # for backward compatibility
83
+ if num_key_value_heads is None:
84
+ num_key_value_heads = num_attention_heads
85
+
86
+ self.num_key_value_heads = num_key_value_heads
87
+ self.hidden_act = hidden_act
88
+ self.initializer_range = initializer_range
89
+ self.rms_norm_eps = rms_norm_eps
90
+ self.use_cache = use_cache
91
+ self.rope_theta = rope_theta
92
+ self.rope_scaling = rope_scaling
93
+ self.attention_dropout = attention_dropout
94
+ # Validate the correctness of rotary position embeddings parameters
95
+ # BC: if there is a 'type' field, move it to 'rope_type'.
96
+ if self.rope_scaling is not None and "type" in self.rope_scaling:
97
+ self.rope_scaling["rope_type"] = self.rope_scaling["type"]
98
+ rope_config_validation(self)
99
+
100
+ self.shared_heads = shared_heads
101
+ self.default_early_exit_point = default_early_exit_point
102
+ self.early_exit_points = early_exit_points
103
+ self.auto_map = {
104
+ "AutoConfig": "configuration_ruyi_qwen2.RuyiQwen2Config",
105
+ "AutoModel": "modeling_ruyi_qwen2.RuyiQwen2Model",
106
+ "AutoModelForCausalLM": "modeling_ruyi_qwen2.RuyiQwen2ForCausalLM"
107
+ }
108
+
109
+ super().__init__(
110
+ tie_word_embeddings=tie_word_embeddings,
111
+ **kwargs,
112
+ )
113
+
114
+ def save_pretrained(self, save_directory, **kwargs):
115
+ super().save_pretrained(save_directory, **kwargs)
116
+ shutil.copyfile(
117
+ os.path.abspath(__file__),
118
+ os.path.join(save_directory, "configuration_ruyi_qwen2.py")
119
+ )
generation_config.json ADDED
@@ -0,0 +1,6 @@
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 151643,
4
+ "eos_token_id": 151643,
5
+ "transformers_version": "4.51.3"
6
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model-00001-of-00005.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f291d6752ea2d02d22d079e6d418f15981ab41ab03793295c94701307a55b201
3
+ size 4877660776
model-00002-of-00005.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4f3895ef7a5cff5133a1718322b65afec36f295e4c33d1714c9f205b1817b432
3
+ size 4932751008
model-00003-of-00005.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:930933e0a2e620b43396519a76252c82efd2b973df5a41b4f33481e290af15a7
3
+ size 4991495896
model-00004-of-00005.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e8234b0394a3d20b8a059452190346398c3703e50cafc51cd20e80bc3079b9f0
3
+ size 4473850832
model-00005-of-00005.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5a21aed606e27819560aae39827a08ac637b40a29e84d5d2bbea38465cf67309
3
+ size 2179989736
model.safetensors.index.json ADDED
@@ -0,0 +1,402 @@
1
+ {
2
+ "metadata": {
3
+ "total_size": 21455703040
4
+ },
5
+ "weight_map": {
6
+ "lm_head.0.weight": "model-00004-of-00005.safetensors",
7
+ "lm_head.1.weight": "model-00004-of-00005.safetensors",
8
+ "lm_head.2.weight": "model-00004-of-00005.safetensors",
9
+ "lm_head.3.weight": "model-00005-of-00005.safetensors",
10
+ "lm_head.4.weight": "model-00005-of-00005.safetensors",
11
+ "model.eelayers.0.input_layernorm.weight": "model-00003-of-00005.safetensors",
12
+ "model.eelayers.0.mlp.down_proj.weight": "model-00003-of-00005.safetensors",
13
+ "model.eelayers.0.mlp.gate_proj.weight": "model-00003-of-00005.safetensors",
14
+ "model.eelayers.0.mlp.up_proj.weight": "model-00003-of-00005.safetensors",
15
+ "model.eelayers.0.post_attention_layernorm.weight": "model-00003-of-00005.safetensors",
16
+ "model.eelayers.0.self_attn.k_proj.bias": "model-00003-of-00005.safetensors",
17
+ "model.eelayers.0.self_attn.k_proj.weight": "model-00003-of-00005.safetensors",
18
+ "model.eelayers.0.self_attn.o_proj.weight": "model-00003-of-00005.safetensors",
19
+ "model.eelayers.0.self_attn.q_proj.bias": "model-00003-of-00005.safetensors",
20
+ "model.eelayers.0.self_attn.q_proj.weight": "model-00003-of-00005.safetensors",
21
+ "model.eelayers.0.self_attn.v_proj.bias": "model-00003-of-00005.safetensors",
22
+ "model.eelayers.0.self_attn.v_proj.weight": "model-00003-of-00005.safetensors",
23
+ "model.eelayers.1.input_layernorm.weight": "model-00004-of-00005.safetensors",
24
+ "model.eelayers.1.mlp.down_proj.weight": "model-00004-of-00005.safetensors",
25
+ "model.eelayers.1.mlp.gate_proj.weight": "model-00003-of-00005.safetensors",
26
+ "model.eelayers.1.mlp.up_proj.weight": "model-00004-of-00005.safetensors",
27
+ "model.eelayers.1.post_attention_layernorm.weight": "model-00004-of-00005.safetensors",
28
+ "model.eelayers.1.self_attn.k_proj.bias": "model-00003-of-00005.safetensors",
29
+ "model.eelayers.1.self_attn.k_proj.weight": "model-00003-of-00005.safetensors",
30
+ "model.eelayers.1.self_attn.o_proj.weight": "model-00003-of-00005.safetensors",
31
+ "model.eelayers.1.self_attn.q_proj.bias": "model-00003-of-00005.safetensors",
32
+ "model.eelayers.1.self_attn.q_proj.weight": "model-00003-of-00005.safetensors",
33
+ "model.eelayers.1.self_attn.v_proj.bias": "model-00003-of-00005.safetensors",
34
+ "model.eelayers.1.self_attn.v_proj.weight": "model-00003-of-00005.safetensors",
35
+ "model.eelayers.2.input_layernorm.weight": "model-00004-of-00005.safetensors",
36
+ "model.eelayers.2.mlp.down_proj.weight": "model-00004-of-00005.safetensors",
37
+ "model.eelayers.2.mlp.gate_proj.weight": "model-00004-of-00005.safetensors",
38
+ "model.eelayers.2.mlp.up_proj.weight": "model-00004-of-00005.safetensors",
39
+ "model.eelayers.2.post_attention_layernorm.weight": "model-00004-of-00005.safetensors",
40
+ "model.eelayers.2.self_attn.k_proj.bias": "model-00004-of-00005.safetensors",
41
+ "model.eelayers.2.self_attn.k_proj.weight": "model-00004-of-00005.safetensors",
42
+ "model.eelayers.2.self_attn.o_proj.weight": "model-00004-of-00005.safetensors",
43
+ "model.eelayers.2.self_attn.q_proj.bias": "model-00004-of-00005.safetensors",
44
+ "model.eelayers.2.self_attn.q_proj.weight": "model-00004-of-00005.safetensors",
45
+ "model.eelayers.2.self_attn.v_proj.bias": "model-00004-of-00005.safetensors",
46
+ "model.eelayers.2.self_attn.v_proj.weight": "model-00004-of-00005.safetensors",
47
+ "model.eelayers.3.input_layernorm.weight": "model-00004-of-00005.safetensors",
48
+ "model.eelayers.3.mlp.down_proj.weight": "model-00004-of-00005.safetensors",
49
+ "model.eelayers.3.mlp.gate_proj.weight": "model-00004-of-00005.safetensors",
50
+ "model.eelayers.3.mlp.up_proj.weight": "model-00004-of-00005.safetensors",
51
+ "model.eelayers.3.post_attention_layernorm.weight": "model-00004-of-00005.safetensors",
52
+ "model.eelayers.3.self_attn.k_proj.bias": "model-00004-of-00005.safetensors",
53
+ "model.eelayers.3.self_attn.k_proj.weight": "model-00004-of-00005.safetensors",
54
+ "model.eelayers.3.self_attn.o_proj.weight": "model-00004-of-00005.safetensors",
55
+ "model.eelayers.3.self_attn.q_proj.bias": "model-00004-of-00005.safetensors",
56
+ "model.eelayers.3.self_attn.q_proj.weight": "model-00004-of-00005.safetensors",
57
+ "model.eelayers.3.self_attn.v_proj.bias": "model-00004-of-00005.safetensors",
58
+ "model.eelayers.3.self_attn.v_proj.weight": "model-00004-of-00005.safetensors",
59
+ "model.embed_tokens.weight": "model-00001-of-00005.safetensors",
60
+ "model.layers.0.input_layernorm.weight": "model-00001-of-00005.safetensors",
61
+ "model.layers.0.mlp.down_proj.weight": "model-00001-of-00005.safetensors",
62
+ "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00005.safetensors",
63
+ "model.layers.0.mlp.up_proj.weight": "model-00001-of-00005.safetensors",
64
+ "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00005.safetensors",
65
+ "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00005.safetensors",
66
+ "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00005.safetensors",
67
+ "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00005.safetensors",
68
+ "model.layers.0.self_attn.q_proj.bias": "model-00001-of-00005.safetensors",
69
+ "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00005.safetensors",
70
+ "model.layers.0.self_attn.v_proj.bias": "model-00001-of-00005.safetensors",
71
+ "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00005.safetensors",
72
+ "model.layers.1.input_layernorm.weight": "model-00001-of-00005.safetensors",
73
+ "model.layers.1.mlp.down_proj.weight": "model-00001-of-00005.safetensors",
74
+ "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00005.safetensors",
75
+ "model.layers.1.mlp.up_proj.weight": "model-00001-of-00005.safetensors",
76
+ "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00005.safetensors",
77
+ "model.layers.1.self_attn.k_proj.bias": "model-00001-of-00005.safetensors",
78
+ "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00005.safetensors",
79
+ "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00005.safetensors",
80
+ "model.layers.1.self_attn.q_proj.bias": "model-00001-of-00005.safetensors",
81
+ "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00005.safetensors",
82
+ "model.layers.1.self_attn.v_proj.bias": "model-00001-of-00005.safetensors",
83
+ "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00005.safetensors",
84
+ "model.layers.10.input_layernorm.weight": "model-00002-of-00005.safetensors",
85
+ "model.layers.10.mlp.down_proj.weight": "model-00002-of-00005.safetensors",
86
+ "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00005.safetensors",
87
+ "model.layers.10.mlp.up_proj.weight": "model-00002-of-00005.safetensors",
88
+ "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00005.safetensors",
89
+ "model.layers.10.self_attn.k_proj.bias": "model-00002-of-00005.safetensors",
90
+ "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00005.safetensors",
91
+ "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00005.safetensors",
92
+ "model.layers.10.self_attn.q_proj.bias": "model-00002-of-00005.safetensors",
93
+ "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00005.safetensors",
94
+ "model.layers.10.self_attn.v_proj.bias": "model-00002-of-00005.safetensors",
95
+ "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00005.safetensors",
96
+ "model.layers.11.input_layernorm.weight": "model-00002-of-00005.safetensors",
97
+ "model.layers.11.mlp.down_proj.weight": "model-00002-of-00005.safetensors",
98
+ "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00005.safetensors",
99
+ "model.layers.11.mlp.up_proj.weight": "model-00002-of-00005.safetensors",
100
+ "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00005.safetensors",
101
+ "model.layers.11.self_attn.k_proj.bias": "model-00002-of-00005.safetensors",
102
+ "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00005.safetensors",
103
+ "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00005.safetensors",
104
+ "model.layers.11.self_attn.q_proj.bias": "model-00002-of-00005.safetensors",
105
+ "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00005.safetensors",
106
+ "model.layers.11.self_attn.v_proj.bias": "model-00002-of-00005.safetensors",
107
+ "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00005.safetensors",
108
+ "model.layers.12.input_layernorm.weight": "model-00002-of-00005.safetensors",
109
+ "model.layers.12.mlp.down_proj.weight": "model-00002-of-00005.safetensors",
110
+ "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00005.safetensors",
111
+ "model.layers.12.mlp.up_proj.weight": "model-00002-of-00005.safetensors",
112
+ "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00005.safetensors",
113
+ "model.layers.12.self_attn.k_proj.bias": "model-00002-of-00005.safetensors",
114
+ "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00005.safetensors",
115
+ "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00005.safetensors",
116
+ "model.layers.12.self_attn.q_proj.bias": "model-00002-of-00005.safetensors",
117
+ "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00005.safetensors",
118
+ "model.layers.12.self_attn.v_proj.bias": "model-00002-of-00005.safetensors",
119
+ "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00005.safetensors",
120
+ "model.layers.13.input_layernorm.weight": "model-00002-of-00005.safetensors",
121
+ "model.layers.13.mlp.down_proj.weight": "model-00002-of-00005.safetensors",
122
+ "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00005.safetensors",
123
+ "model.layers.13.mlp.up_proj.weight": "model-00002-of-00005.safetensors",
124
+ "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00005.safetensors",
125
+ "model.layers.13.self_attn.k_proj.bias": "model-00002-of-00005.safetensors",
126
+ "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00005.safetensors",
127
+ "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00005.safetensors",
128
+ "model.layers.13.self_attn.q_proj.bias": "model-00002-of-00005.safetensors",
129
+ "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00005.safetensors",
130
+ "model.layers.13.self_attn.v_proj.bias": "model-00002-of-00005.safetensors",
131
+ "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00005.safetensors",
132
+ "model.layers.14.input_layernorm.weight": "model-00002-of-00005.safetensors",
133
+ "model.layers.14.mlp.down_proj.weight": "model-00002-of-00005.safetensors",
134
+ "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00005.safetensors",
135
+ "model.layers.14.mlp.up_proj.weight": "model-00002-of-00005.safetensors",
136
+ "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00005.safetensors",
137
+ "model.layers.14.self_attn.k_proj.bias": "model-00002-of-00005.safetensors",
138
+ "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00005.safetensors",
139
+ "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00005.safetensors",
140
+ "model.layers.14.self_attn.q_proj.bias": "model-00002-of-00005.safetensors",
141
+ "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00005.safetensors",
142
+ "model.layers.14.self_attn.v_proj.bias": "model-00002-of-00005.safetensors",
143
+ "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00005.safetensors",
144
+ "model.layers.15.input_layernorm.weight": "model-00002-of-00005.safetensors",
145
+ "model.layers.15.mlp.down_proj.weight": "model-00002-of-00005.safetensors",
146
+ "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00005.safetensors",
147
+ "model.layers.15.mlp.up_proj.weight": "model-00002-of-00005.safetensors",
148
+ "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00005.safetensors",
149
+ "model.layers.15.self_attn.k_proj.bias": "model-00002-of-00005.safetensors",
150
+ "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00005.safetensors",
151
+ "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00005.safetensors",
152
+ "model.layers.15.self_attn.q_proj.bias": "model-00002-of-00005.safetensors",
153
+ "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00005.safetensors",
154
+ "model.layers.15.self_attn.v_proj.bias": "model-00002-of-00005.safetensors",
155
+ "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00005.safetensors",
156
+ "model.layers.16.input_layernorm.weight": "model-00002-of-00005.safetensors",
157
+ "model.layers.16.mlp.down_proj.weight": "model-00002-of-00005.safetensors",
158
+ "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00005.safetensors",
159
+ "model.layers.16.mlp.up_proj.weight": "model-00002-of-00005.safetensors",
160
+ "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00005.safetensors",
161
+ "model.layers.16.self_attn.k_proj.bias": "model-00002-of-00005.safetensors",
162
+ "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00005.safetensors",
163
+ "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00005.safetensors",
164
+ "model.layers.16.self_attn.q_proj.bias": "model-00002-of-00005.safetensors",
165
+ "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00005.safetensors",
166
+ "model.layers.16.self_attn.v_proj.bias": "model-00002-of-00005.safetensors",
167
+ "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00005.safetensors",
168
+ "model.layers.17.input_layernorm.weight": "model-00002-of-00005.safetensors",
169
+ "model.layers.17.mlp.down_proj.weight": "model-00002-of-00005.safetensors",
170
+ "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00005.safetensors",
171
+ "model.layers.17.mlp.up_proj.weight": "model-00002-of-00005.safetensors",
172
+ "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00005.safetensors",
173
+ "model.layers.17.self_attn.k_proj.bias": "model-00002-of-00005.safetensors",
174
+ "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00005.safetensors",
175
+ "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00005.safetensors",
176
+ "model.layers.17.self_attn.q_proj.bias": "model-00002-of-00005.safetensors",
177
+ "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00005.safetensors",
178
+ "model.layers.17.self_attn.v_proj.bias": "model-00002-of-00005.safetensors",
179
+ "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00005.safetensors",
180
+ "model.layers.18.input_layernorm.weight": "model-00003-of-00005.safetensors",
181
+ "model.layers.18.mlp.down_proj.weight": "model-00003-of-00005.safetensors",
182
+ "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00005.safetensors",
183
+ "model.layers.18.mlp.up_proj.weight": "model-00002-of-00005.safetensors",
184
+ "model.layers.18.post_attention_layernorm.weight": "model-00003-of-00005.safetensors",
185
+ "model.layers.18.self_attn.k_proj.bias": "model-00002-of-00005.safetensors",
186
+ "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00005.safetensors",
187
+ "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00005.safetensors",
188
+ "model.layers.18.self_attn.q_proj.bias": "model-00002-of-00005.safetensors",
189
+ "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00005.safetensors",
190
+ "model.layers.18.self_attn.v_proj.bias": "model-00002-of-00005.safetensors",
191
+ "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00005.safetensors",
192
+ "model.layers.19.input_layernorm.weight": "model-00003-of-00005.safetensors",
193
+ "model.layers.19.mlp.down_proj.weight": "model-00003-of-00005.safetensors",
194
+ "model.layers.19.mlp.gate_proj.weight": "model-00003-of-00005.safetensors",
195
+ "model.layers.19.mlp.up_proj.weight": "model-00003-of-00005.safetensors",
196
+ "model.layers.19.post_attention_layernorm.weight": "model-00003-of-00005.safetensors",
197
+ "model.layers.19.self_attn.k_proj.bias": "model-00003-of-00005.safetensors",
198
+ "model.layers.19.self_attn.k_proj.weight": "model-00003-of-00005.safetensors",
199
+ "model.layers.19.self_attn.o_proj.weight": "model-00003-of-00005.safetensors",
200
+ "model.layers.19.self_attn.q_proj.bias": "model-00003-of-00005.safetensors",
201
+ "model.layers.19.self_attn.q_proj.weight": "model-00003-of-00005.safetensors",
202
+ "model.layers.19.self_attn.v_proj.bias": "model-00003-of-00005.safetensors",
203
+ "model.layers.19.self_attn.v_proj.weight": "model-00003-of-00005.safetensors",
204
+ "model.layers.2.input_layernorm.weight": "model-00001-of-00005.safetensors",
205
+ "model.layers.2.mlp.down_proj.weight": "model-00001-of-00005.safetensors",
206
+ "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00005.safetensors",
207
+ "model.layers.2.mlp.up_proj.weight": "model-00001-of-00005.safetensors",
208
+ "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00005.safetensors",
209
+ "model.layers.2.self_attn.k_proj.bias": "model-00001-of-00005.safetensors",
210
+ "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00005.safetensors",
211
+ "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00005.safetensors",
212
+ "model.layers.2.self_attn.q_proj.bias": "model-00001-of-00005.safetensors",
213
+ "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00005.safetensors",
214
+ "model.layers.2.self_attn.v_proj.bias": "model-00001-of-00005.safetensors",
215
+ "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00005.safetensors",
216
+ "model.layers.20.input_layernorm.weight": "model-00003-of-00005.safetensors",
217
+ "model.layers.20.mlp.down_proj.weight": "model-00003-of-00005.safetensors",
218
+ "model.layers.20.mlp.gate_proj.weight": "model-00003-of-00005.safetensors",
219
+ "model.layers.20.mlp.up_proj.weight": "model-00003-of-00005.safetensors",
220
+ "model.layers.20.post_attention_layernorm.weight": "model-00003-of-00005.safetensors",
221
+ "model.layers.20.self_attn.k_proj.bias": "model-00003-of-00005.safetensors",
222
+ "model.layers.20.self_attn.k_proj.weight": "model-00003-of-00005.safetensors",
223
+ "model.layers.20.self_attn.o_proj.weight": "model-00003-of-00005.safetensors",
224
+ "model.layers.20.self_attn.q_proj.bias": "model-00003-of-00005.safetensors",
225
+ "model.layers.20.self_attn.q_proj.weight": "model-00003-of-00005.safetensors",
226
+ "model.layers.20.self_attn.v_proj.bias": "model-00003-of-00005.safetensors",
227
+ "model.layers.20.self_attn.v_proj.weight": "model-00003-of-00005.safetensors",
228
+ "model.layers.21.input_layernorm.weight": "model-00003-of-00005.safetensors",
229
+ "model.layers.21.mlp.down_proj.weight": "model-00003-of-00005.safetensors",
230
+ "model.layers.21.mlp.gate_proj.weight": "model-00003-of-00005.safetensors",
231
+ "model.layers.21.mlp.up_proj.weight": "model-00003-of-00005.safetensors",
232
+ "model.layers.21.post_attention_layernorm.weight": "model-00003-of-00005.safetensors",
233
+ "model.layers.21.self_attn.k_proj.bias": "model-00003-of-00005.safetensors",
234
+ "model.layers.21.self_attn.k_proj.weight": "model-00003-of-00005.safetensors",
235
+ "model.layers.21.self_attn.o_proj.weight": "model-00003-of-00005.safetensors",
236
+ "model.layers.21.self_attn.q_proj.bias": "model-00003-of-00005.safetensors",
237
+ "model.layers.21.self_attn.q_proj.weight": "model-00003-of-00005.safetensors",
238
+ "model.layers.21.self_attn.v_proj.bias": "model-00003-of-00005.safetensors",
239
+ "model.layers.21.self_attn.v_proj.weight": "model-00003-of-00005.safetensors",
240
+ "model.layers.22.input_layernorm.weight": "model-00003-of-00005.safetensors",
241
+ "model.layers.22.mlp.down_proj.weight": "model-00003-of-00005.safetensors",
242
+ "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00005.safetensors",
243
+ "model.layers.22.mlp.up_proj.weight": "model-00003-of-00005.safetensors",
244
+ "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00005.safetensors",
245
+ "model.layers.22.self_attn.k_proj.bias": "model-00003-of-00005.safetensors",
246
+ "model.layers.22.self_attn.k_proj.weight": "model-00003-of-00005.safetensors",
247
+ "model.layers.22.self_attn.o_proj.weight": "model-00003-of-00005.safetensors",
248
+ "model.layers.22.self_attn.q_proj.bias": "model-00003-of-00005.safetensors",
249
+ "model.layers.22.self_attn.q_proj.weight": "model-00003-of-00005.safetensors",
250
+ "model.layers.22.self_attn.v_proj.bias": "model-00003-of-00005.safetensors",
251
+ "model.layers.22.self_attn.v_proj.weight": "model-00003-of-00005.safetensors",
252
+ "model.layers.23.input_layernorm.weight": "model-00003-of-00005.safetensors",
253
+ "model.layers.23.mlp.down_proj.weight": "model-00003-of-00005.safetensors",
254
+ "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00005.safetensors",
255
+ "model.layers.23.mlp.up_proj.weight": "model-00003-of-00005.safetensors",
256
+ "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00005.safetensors",
257
+ "model.layers.23.self_attn.k_proj.bias": "model-00003-of-00005.safetensors",
258
+ "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00005.safetensors",
259
+ "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00005.safetensors",
260
+ "model.layers.23.self_attn.q_proj.bias": "model-00003-of-00005.safetensors",
261
+ "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00005.safetensors",
262
+ "model.layers.23.self_attn.v_proj.bias": "model-00003-of-00005.safetensors",
263
+ "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00005.safetensors",
264
+ "model.layers.24.input_layernorm.weight": "model-00003-of-00005.safetensors",
265
+ "model.layers.24.mlp.down_proj.weight": "model-00003-of-00005.safetensors",
266
+ "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00005.safetensors",
267
+ "model.layers.24.mlp.up_proj.weight": "model-00003-of-00005.safetensors",
268
+ "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00005.safetensors",
269
+ "model.layers.24.self_attn.k_proj.bias": "model-00003-of-00005.safetensors",
270
+ "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00005.safetensors",
271
+ "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00005.safetensors",
272
+ "model.layers.24.self_attn.q_proj.bias": "model-00003-of-00005.safetensors",
273
+ "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00005.safetensors",
274
+ "model.layers.24.self_attn.v_proj.bias": "model-00003-of-00005.safetensors",
275
+ "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00005.safetensors",
276
+ "model.layers.25.input_layernorm.weight": "model-00003-of-00005.safetensors",
277
+ "model.layers.25.mlp.down_proj.weight": "model-00003-of-00005.safetensors",
278
+ "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00005.safetensors",
279
+ "model.layers.25.mlp.up_proj.weight": "model-00003-of-00005.safetensors",
280
+ "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00005.safetensors",
281
+ "model.layers.25.self_attn.k_proj.bias": "model-00003-of-00005.safetensors",
282
+ "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00005.safetensors",
283
+ "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00005.safetensors",
284
+ "model.layers.25.self_attn.q_proj.bias": "model-00003-of-00005.safetensors",
285
+ "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00005.safetensors",
286
+ "model.layers.25.self_attn.v_proj.bias": "model-00003-of-00005.safetensors",
287
+ "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00005.safetensors",
288
+ "model.layers.26.input_layernorm.weight": "model-00003-of-00005.safetensors",
289
+ "model.layers.26.mlp.down_proj.weight": "model-00003-of-00005.safetensors",
290
+ "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00005.safetensors",
291
+ "model.layers.26.mlp.up_proj.weight": "model-00003-of-00005.safetensors",
292
+ "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00005.safetensors",
293
+ "model.layers.26.self_attn.k_proj.bias": "model-00003-of-00005.safetensors",
294
+ "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00005.safetensors",
295
+ "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00005.safetensors",
296
+ "model.layers.26.self_attn.q_proj.bias": "model-00003-of-00005.safetensors",
297
+ "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00005.safetensors",
298
+ "model.layers.26.self_attn.v_proj.bias": "model-00003-of-00005.safetensors",
299
+ "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00005.safetensors",
300
+ "model.layers.27.input_layernorm.weight": "model-00003-of-00005.safetensors",
301
+ "model.layers.27.mlp.down_proj.weight": "model-00003-of-00005.safetensors",
302
+ "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00005.safetensors",
303
+ "model.layers.27.mlp.up_proj.weight": "model-00003-of-00005.safetensors",
304
+ "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00005.safetensors",
305
+ "model.layers.27.self_attn.k_proj.bias": "model-00003-of-00005.safetensors",
306
+ "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00005.safetensors",
307
+ "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00005.safetensors",
308
+ "model.layers.27.self_attn.q_proj.bias": "model-00003-of-00005.safetensors",
309
+ "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00005.safetensors",
310
+ "model.layers.27.self_attn.v_proj.bias": "model-00003-of-00005.safetensors",
311
+ "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00005.safetensors",
312
+ "model.layers.3.input_layernorm.weight": "model-00001-of-00005.safetensors",
313
+ "model.layers.3.mlp.down_proj.weight": "model-00001-of-00005.safetensors",
314
+ "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00005.safetensors",
315
+ "model.layers.3.mlp.up_proj.weight": "model-00001-of-00005.safetensors",
316
+ "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00005.safetensors",
317
+ "model.layers.3.self_attn.k_proj.bias": "model-00001-of-00005.safetensors",
318
+ "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00005.safetensors",
319
+ "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00005.safetensors",
320
+ "model.layers.3.self_attn.q_proj.bias": "model-00001-of-00005.safetensors",
321
+ "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00005.safetensors",
322
+ "model.layers.3.self_attn.v_proj.bias": "model-00001-of-00005.safetensors",
323
+ "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00005.safetensors",
324
+ "model.layers.4.input_layernorm.weight": "model-00001-of-00005.safetensors",
325
+ "model.layers.4.mlp.down_proj.weight": "model-00001-of-00005.safetensors",
326
+ "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00005.safetensors",
327
+ "model.layers.4.mlp.up_proj.weight": "model-00001-of-00005.safetensors",
328
+ "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00005.safetensors",
329
+ "model.layers.4.self_attn.k_proj.bias": "model-00001-of-00005.safetensors",
330
+ "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00005.safetensors",
331
+ "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00005.safetensors",
332
+ "model.layers.4.self_attn.q_proj.bias": "model-00001-of-00005.safetensors",
333
+ "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00005.safetensors",
334
+ "model.layers.4.self_attn.v_proj.bias": "model-00001-of-00005.safetensors",
335
+ "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00005.safetensors",
336
+ "model.layers.5.input_layernorm.weight": "model-00001-of-00005.safetensors",
337
+ "model.layers.5.mlp.down_proj.weight": "model-00001-of-00005.safetensors",
338
+ "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00005.safetensors",
339
+ "model.layers.5.mlp.up_proj.weight": "model-00001-of-00005.safetensors",
340
+ "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00005.safetensors",
341
+ "model.layers.5.self_attn.k_proj.bias": "model-00001-of-00005.safetensors",
342
+ "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00005.safetensors",
343
+ "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00005.safetensors",
344
+ "model.layers.5.self_attn.q_proj.bias": "model-00001-of-00005.safetensors",
345
+ "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00005.safetensors",
346
+ "model.layers.5.self_attn.v_proj.bias": "model-00001-of-00005.safetensors",
347
+ "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00005.safetensors",
348
+ "model.layers.6.input_layernorm.weight": "model-00001-of-00005.safetensors",
349
+ "model.layers.6.mlp.down_proj.weight": "model-00001-of-00005.safetensors",
350
+ "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00005.safetensors",
351
+ "model.layers.6.mlp.up_proj.weight": "model-00001-of-00005.safetensors",
352
+ "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00005.safetensors",
353
+ "model.layers.6.self_attn.k_proj.bias": "model-00001-of-00005.safetensors",
354
+ "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00005.safetensors",
355
+ "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00005.safetensors",
356
+ "model.layers.6.self_attn.q_proj.bias": "model-00001-of-00005.safetensors",
357
+ "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00005.safetensors",
358
+ "model.layers.6.self_attn.v_proj.bias": "model-00001-of-00005.safetensors",
359
+ "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00005.safetensors",
360
+ "model.layers.7.input_layernorm.weight": "model-00001-of-00005.safetensors",
361
+ "model.layers.7.mlp.down_proj.weight": "model-00001-of-00005.safetensors",
362
+ "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00005.safetensors",
363
+ "model.layers.7.mlp.up_proj.weight": "model-00001-of-00005.safetensors",
364
+ "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00005.safetensors",
365
+ "model.layers.7.self_attn.k_proj.bias": "model-00001-of-00005.safetensors",
366
+ "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00005.safetensors",
367
+ "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00005.safetensors",
368
+ "model.layers.7.self_attn.q_proj.bias": "model-00001-of-00005.safetensors",
369
+ "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00005.safetensors",
370
+ "model.layers.7.self_attn.v_proj.bias": "model-00001-of-00005.safetensors",
371
+ "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00005.safetensors",
372
+ "model.layers.8.input_layernorm.weight": "model-00002-of-00005.safetensors",
373
+ "model.layers.8.mlp.down_proj.weight": "model-00002-of-00005.safetensors",
374
+ "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00005.safetensors",
375
+ "model.layers.8.mlp.up_proj.weight": "model-00002-of-00005.safetensors",
376
+ "model.layers.8.post_attention_layernorm.weight": "model-00002-of-00005.safetensors",
377
+ "model.layers.8.self_attn.k_proj.bias": "model-00001-of-00005.safetensors",
378
+ "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00005.safetensors",
379
+ "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00005.safetensors",
380
+ "model.layers.8.self_attn.q_proj.bias": "model-00001-of-00005.safetensors",
381
+ "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00005.safetensors",
382
+ "model.layers.8.self_attn.v_proj.bias": "model-00001-of-00005.safetensors",
383
+ "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00005.safetensors",
384
+ "model.layers.9.input_layernorm.weight": "model-00002-of-00005.safetensors",
385
+ "model.layers.9.mlp.down_proj.weight": "model-00002-of-00005.safetensors",
386
+ "model.layers.9.mlp.gate_proj.weight": "model-00002-of-00005.safetensors",
387
+ "model.layers.9.mlp.up_proj.weight": "model-00002-of-00005.safetensors",
388
+ "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00005.safetensors",
389
+ "model.layers.9.self_attn.k_proj.bias": "model-00002-of-00005.safetensors",
390
+ "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00005.safetensors",
391
+ "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00005.safetensors",
392
+ "model.layers.9.self_attn.q_proj.bias": "model-00002-of-00005.safetensors",
393
+ "model.layers.9.self_attn.q_proj.weight": "model-00002-of-00005.safetensors",
394
+ "model.layers.9.self_attn.v_proj.bias": "model-00002-of-00005.safetensors",
395
+ "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00005.safetensors",
396
+ "model.norms.0.weight": "model-00004-of-00005.safetensors",
397
+ "model.norms.1.weight": "model-00004-of-00005.safetensors",
398
+ "model.norms.2.weight": "model-00004-of-00005.safetensors",
399
+ "model.norms.3.weight": "model-00004-of-00005.safetensors",
400
+ "model.norms.4.weight": "model-00004-of-00005.safetensors"
401
+ }
402
+ }
modeling_ruyi_qwen2.py ADDED
@@ -0,0 +1,782 @@
1
+ #!/usr/bin/env python
2
+ # Ref: https://github.com/huggingface/transformers/blob/v4.51.3/src/transformers/models/qwen2/modeling_qwen2.py
3
+ # Copyright (c) Institute of Artificial Intelligence (TeleAI), China Telecom, 2025. All Rights Reserved.
4
+ """RuyiQwen2 model"""
5
+
6
+ import os
7
+ import shutil
8
+
9
+ from functools import partial
10
+ from typing import Callable, Optional, Tuple, Union
11
+ from itertools import chain
12
+
13
+ import torch
14
+ from torch import nn
15
+
16
+ from transformers.activations import ACT2FN
17
+ from transformers.cache_utils import Cache, DynamicCache, SlidingWindowCache, StaticCache
18
+ from transformers.generation import GenerationMixin
19
+ from transformers.integrations import use_kernel_forward_from_hub
20
+ from transformers.modeling_attn_mask_utils import AttentionMaskConverter
21
+ from transformers.modeling_flash_attention_utils import FlashAttentionKwargs
22
+ from transformers.modeling_outputs import (
23
+ BaseModelOutputWithPast,
24
+ CausalLMOutputWithPast,
25
+ )
26
+ from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
27
+ from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
28
+ from transformers.processing_utils import Unpack
29
+ from transformers.utils import (
30
+ LossKwargs,
31
+ can_return_tuple,
32
+ is_torch_flex_attn_available,
33
+ logging,
34
+ )
35
+ from .configuration_ruyi_qwen2 import RuyiQwen2Config
36
+
37
+ from ruyi.global_var import set_global_val, get_global_val
38
+
39
+
40
+ if is_torch_flex_attn_available():
41
+ from torch.nn.attention.flex_attention import BlockMask
42
+ from transformers.integrations.flex_attention import make_flex_block_causal_mask
43
+
44
+
45
+ logger = logging.get_logger(__name__)
46
+
47
+
48
+ class RuyiQwen2MLP(nn.Module):
49
+ def __init__(self, config):
50
+ super().__init__()
51
+ self.config = config
52
+ self.hidden_size = config.hidden_size
53
+ self.intermediate_size = config.intermediate_size
54
+ self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
55
+ self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
56
+ self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
57
+ self.act_fn = ACT2FN[config.hidden_act]
58
+
59
+ def forward(self, x):
60
+ down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
61
+ return down_proj
62
+
63
+
64
+ def rotate_half(x):
65
+ """Rotates half the hidden dims of the input."""
66
+ x1 = x[..., : x.shape[-1] // 2]
67
+ x2 = x[..., x.shape[-1] // 2 :]
68
+ return torch.cat((-x2, x1), dim=-1)
69
+
70
+
71
+ def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
72
+ """Applies Rotary Position Embedding to the query and key tensors.
73
+
74
+ Args:
75
+ q (`torch.Tensor`): The query tensor.
76
+ k (`torch.Tensor`): The key tensor.
77
+ cos (`torch.Tensor`): The cosine part of the rotary embedding.
78
+ sin (`torch.Tensor`): The sine part of the rotary embedding.
79
+ position_ids (`torch.Tensor`, *optional*):
80
+ Deprecated and unused.
81
+ unsqueeze_dim (`int`, *optional*, defaults to 1):
82
+ The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
83
+ sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
84
+ that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
85
+ k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
86
+ cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
87
+ the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
88
+ Returns:
89
+ `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
90
+ """
91
+ cos = cos.unsqueeze(unsqueeze_dim)
92
+ sin = sin.unsqueeze(unsqueeze_dim)
93
+ q_embed = (q * cos) + (rotate_half(q) * sin)
94
+ k_embed = (k * cos) + (rotate_half(k) * sin)
95
+ return q_embed, k_embed
96
+
97
+
98
+ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
99
+ """
100
+ This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
101
+ num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
102
+ """
103
+ batch, num_key_value_heads, slen, head_dim = hidden_states.shape
104
+ if n_rep == 1:
105
+ return hidden_states
106
+ hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
107
+ return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
108
+
109
+
110
+ def eager_attention_forward(
111
+ module: nn.Module,
112
+ query: torch.Tensor,
113
+ key: torch.Tensor,
114
+ value: torch.Tensor,
115
+ attention_mask: Optional[torch.Tensor],
116
+ scaling: float,
117
+ dropout: float = 0.0,
118
+ **kwargs,
119
+ ):
120
+ key_states = repeat_kv(key, module.num_key_value_groups)
121
+ value_states = repeat_kv(value, module.num_key_value_groups)
122
+
123
+ attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling
124
+ if attention_mask is not None:
125
+ causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
126
+ attn_weights = attn_weights + causal_mask
127
+
128
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
129
+ attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
130
+ attn_output = torch.matmul(attn_weights, value_states)
131
+ attn_output = attn_output.transpose(1, 2).contiguous()
132
+
133
+ return attn_output, attn_weights
134
+
135
+
136
+ class RuyiQwen2Attention(nn.Module):
137
+ """Multi-headed attention from 'Attention Is All You Need' paper"""
138
+
139
+ def __init__(self, config: RuyiQwen2Config, layer_idx: int):
140
+ super().__init__()
141
+ self.config = config
142
+ self.layer_idx = layer_idx
143
+ self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
144
+ self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
145
+ self.scaling = self.head_dim**-0.5
146
+ self.attention_dropout = config.attention_dropout
147
+ self.is_causal = True
148
+ self.q_proj = nn.Linear(config.hidden_size, config.num_attention_heads * self.head_dim, bias=True)
149
+ self.k_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * self.head_dim, bias=True)
150
+ self.v_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * self.head_dim, bias=True)
151
+ self.o_proj = nn.Linear(config.num_attention_heads * self.head_dim, config.hidden_size, bias=False)
152
+
153
+ def forward(
154
+ self,
155
+ hidden_states: torch.Tensor,
156
+ position_embeddings: Tuple[torch.Tensor, torch.Tensor],
157
+ attention_mask: Optional[torch.Tensor],
158
+ past_key_value: Optional[Cache] = None,
159
+ cache_position: Optional[torch.LongTensor] = None,
160
+ **kwargs: Unpack[FlashAttentionKwargs],
161
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
162
+ input_shape = hidden_states.shape[:-1]
163
+ hidden_shape = (*input_shape, -1, self.head_dim)
164
+
165
+ query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)
166
+ key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2)
167
+ value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)
168
+
169
+ cos, sin = position_embeddings
170
+ query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
171
+
172
+ if past_key_value is not None:
173
+ # sin and cos are specific to RoPE models; cache_position needed for the static cache
174
+ cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
175
+ key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
176
+
177
+ sliding_window = None
178
+ if (
179
+ self.config.use_sliding_window
180
+ and getattr(self.config, "sliding_window", None) is not None
181
+ and self.layer_idx >= self.config.max_window_layers
182
+ ):
183
+ sliding_window = self.config.sliding_window
184
+
185
+ attention_interface: Callable = eager_attention_forward
186
+ if self.config._attn_implementation != "eager":
187
+ if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
188
+ logger.warning_once(
189
+ "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to "
190
+ 'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
191
+ )
192
+ else:
193
+ attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
194
+
195
+ attn_output, attn_weights = attention_interface(
196
+ self,
197
+ query_states,
198
+ key_states,
199
+ value_states,
200
+ attention_mask,
201
+ dropout=0.0 if not self.training else self.attention_dropout,
202
+ scaling=self.scaling,
203
+ sliding_window=sliding_window, # main diff with Llama
204
+ **kwargs,
205
+ )
206
+
207
+ attn_output = attn_output.reshape(*input_shape, -1).contiguous()
208
+ attn_output = self.o_proj(attn_output)
209
+ return attn_output, attn_weights
210
+
211
+
212
+ @use_kernel_forward_from_hub("RMSNorm")
213
+ class RuyiQwen2RMSNorm(nn.Module):
214
+ def __init__(self, hidden_size, eps=1e-6):
215
+ """
216
+ RuyiQwen2RMSNorm is equivalent to T5LayerNorm
217
+ """
218
+ super().__init__()
219
+ self.weight = nn.Parameter(torch.ones(hidden_size))
220
+ self.variance_epsilon = eps
221
+
222
+ def forward(self, hidden_states):
223
+ input_dtype = hidden_states.dtype
224
+ hidden_states = hidden_states.to(torch.float32)
225
+ variance = hidden_states.pow(2).mean(-1, keepdim=True)
226
+ hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
227
+ return self.weight * hidden_states.to(input_dtype)
228
+
229
+ def extra_repr(self):
230
+ return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
231
+
232
+
233
+ class RuyiQwen2DecoderLayer(nn.Module):
234
+ def __init__(self, config: RuyiQwen2Config, layer_idx: int):
235
+ super().__init__()
236
+ self.layer_idx = layer_idx
237
+ self.hidden_size = config.hidden_size
238
+ self.self_attn = RuyiQwen2Attention(config=config, layer_idx=layer_idx)
239
+ self.mlp = RuyiQwen2MLP(config)
240
+ self.input_layernorm = RuyiQwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
241
+ self.post_attention_layernorm = RuyiQwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
242
+ if config.sliding_window and config._attn_implementation != "flash_attention_2":
243
+ logger.warning_once(
244
+ f"Sliding Window Attention is enabled but not implemented for `{config._attn_implementation}`; "
245
+ "unexpected results may be encountered."
246
+ )
247
+
248
+ def forward(
249
+ self,
250
+ hidden_states: torch.Tensor,
251
+ attention_mask: Optional[torch.Tensor] = None,
252
+ position_ids: Optional[torch.LongTensor] = None,
253
+ past_key_value: Optional[Cache] = None,
254
+ output_attentions: Optional[bool] = False,
255
+ use_cache: Optional[bool] = False,
256
+ cache_position: Optional[torch.LongTensor] = None,
257
+ position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC
258
+ **kwargs: Unpack[FlashAttentionKwargs],
259
+ ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
260
+ residual = hidden_states
261
+ hidden_states = self.input_layernorm(hidden_states)
262
+
263
+ # Self Attention
264
+ hidden_states, self_attn_weights = self.self_attn(
265
+ hidden_states=hidden_states,
266
+ attention_mask=attention_mask,
267
+ position_ids=position_ids,
268
+ past_key_value=past_key_value,
269
+ output_attentions=output_attentions,
270
+ use_cache=use_cache,
271
+ cache_position=cache_position,
272
+ position_embeddings=position_embeddings,
273
+ **kwargs,
274
+ )
275
+ hidden_states = residual + hidden_states
276
+
277
+ # Fully Connected
278
+ residual = hidden_states
279
+ hidden_states = self.post_attention_layernorm(hidden_states)
280
+ hidden_states = self.mlp(hidden_states)
281
+ hidden_states = residual + hidden_states
282
+
283
+ outputs = (hidden_states,)
284
+ if output_attentions:
285
+ outputs += (self_attn_weights,)
286
+
287
+ return outputs
288
+
289
+
290
+ class RuyiQwen2PreTrainedModel(PreTrainedModel):
291
+ config_class = RuyiQwen2Config
292
+ base_model_prefix = "model"
293
+ supports_gradient_checkpointing = True
294
+ _no_split_modules = ["RuyiQwen2DecoderLayer"]
295
+ _skip_keys_device_placement = ["past_key_values"]
296
+ _supports_flash_attn_2 = True
297
+ _supports_sdpa = True
298
+ _supports_flex_attn = True
299
+ _supports_cache_class = True
300
+ _supports_quantized_cache = True
301
+ _supports_static_cache = True
302
+ _supports_attention_backend = True
303
+
304
+ def _init_weights(self, module):
305
+ std = self.config.initializer_range
306
+ if isinstance(module, nn.Linear):
307
+ module.weight.data.normal_(mean=0.0, std=std)
308
+ if module.bias is not None:
309
+ module.bias.data.zero_()
310
+ elif isinstance(module, nn.Embedding):
311
+ module.weight.data.normal_(mean=0.0, std=std)
312
+ if module.padding_idx is not None:
313
+ module.weight.data[module.padding_idx].zero_()
314
+ elif isinstance(module, RuyiQwen2RMSNorm):
315
+ module.weight.data.fill_(1.0)
316
+
317
+
318
+ class RuyiQwen2RotaryEmbedding(nn.Module):
319
+ def __init__(self, config: RuyiQwen2Config, device=None):
320
+ super().__init__()
321
+ # BC: "rope_type" was originally "type"
322
+ if hasattr(config, "rope_scaling") and config.rope_scaling is not None:
323
+ self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
324
+ else:
325
+ self.rope_type = "default"
326
+ self.max_seq_len_cached = config.max_position_embeddings
327
+ self.original_max_seq_len = config.max_position_embeddings
328
+
329
+ self.config = config
330
+ self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
331
+
332
+ inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
333
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
334
+ self.original_inv_freq = self.inv_freq
335
+
336
+ @torch.no_grad()
337
+ @dynamic_rope_update # power user: used with advanced RoPE types (e.g. dynamic rope)
338
+ def forward(self, x, position_ids):
339
+ inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
340
+ position_ids_expanded = position_ids[:, None, :].float()
341
+
342
+ device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
343
+ with torch.autocast(device_type=device_type, enabled=False): # Force float32
344
+ freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
345
+ emb = torch.cat((freqs, freqs), dim=-1)
346
+ cos = emb.cos() * self.attention_scaling
347
+ sin = emb.sin() * self.attention_scaling
348
+
349
+ return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
350
+
351
+
352
+ class RuyiQwen2Model(RuyiQwen2PreTrainedModel):
353
+ """
354
+ Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`RuyiQwen2DecoderLayer`]
355
+
356
+ Args:
357
+ config: RuyiQwen2Config
358
+ """
359
+
360
+ def __init__(self, config: RuyiQwen2Config):
361
+ super().__init__(config)
362
+ self.padding_idx = config.pad_token_id
363
+ self.vocab_size = config.vocab_size
364
+
365
+ self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
366
+ self.layers = nn.ModuleList(
367
+ [RuyiQwen2DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
368
+ )
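+        # Early-exit branches: every configured exit point except the last gets its own dedicated
+        # decoder layer (`eelayers`), and each exit point gets its own final RMSNorm (`norms`).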
369
+ self.eelayers = nn.ModuleList(
370
+ [RuyiQwen2DecoderLayer(config, layer_idx) for layer_idx in config.early_exit_points[:-1]]
371
+ )
372
+ self.norms = nn.ModuleList(
373
+ [RuyiQwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) for _ in config.early_exit_points]
374
+ )
375
+ self.rotary_emb = RuyiQwen2RotaryEmbedding(config=config)
376
+ self.gradient_checkpointing = False
377
+
378
+ if config.default_early_exit_point not in config.early_exit_points:
379
+ config.default_early_exit_point = config.early_exit_points[-1]
380
+ set_global_val("early_exit_point", config.default_early_exit_point)
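+        # The active exit point is stored in a process-wide global (see ruyi.global_var), so it can
+        # be switched at inference time without rebuilding the model.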
381
+
382
+ # Initialize weights and apply final processing
383
+ self.post_init()
384
+
385
+ def get_input_embeddings(self):
386
+ return self.embed_tokens
387
+
388
+ def set_input_embeddings(self, value):
389
+ self.embed_tokens = value
390
+
391
+ def save_pretrained(self, save_directory, **kwargs):
392
+ super().save_pretrained(save_directory, **kwargs)
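+        # Also copy this source file next to the saved weights so the checkpoint stays loadable as
+        # custom (remote) code.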
393
+ shutil.copyfile(
394
+ os.path.abspath(__file__),
395
+ os.path.join(save_directory, "modeling_ruyi_qwen2.py")
396
+ )
397
+
398
+ @can_return_tuple
399
+ def forward(
400
+ self,
401
+ input_ids: Optional[torch.LongTensor] = None,
402
+ attention_mask: Optional[torch.Tensor] = None,
403
+ position_ids: Optional[torch.LongTensor] = None,
404
+ past_key_values: Optional[Cache] = None,
405
+ inputs_embeds: Optional[torch.FloatTensor] = None,
406
+ use_cache: Optional[bool] = None,
407
+ output_attentions: Optional[bool] = None,
408
+ output_hidden_states: Optional[bool] = None,
409
+ cache_position: Optional[torch.LongTensor] = None,
410
+ **flash_attn_kwargs: Unpack[FlashAttentionKwargs],
411
+ ) -> BaseModelOutputWithPast:
412
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
413
+ output_hidden_states = (
414
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
415
+ )
416
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
417
+
418
+ if (input_ids is None) ^ (inputs_embeds is not None):
419
+ raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
420
+
421
+ if self.gradient_checkpointing and self.training and use_cache:
422
+ logger.warning_once(
423
+ "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
424
+ )
425
+ use_cache = False
426
+
427
+ # TODO (joao): remove this exception in v4.56 -- it exists for users that try to pass a legacy cache
428
+ if not isinstance(past_key_values, (type(None), Cache)):
429
+ raise ValueError("The `past_key_values` should be either a `Cache` object or `None`.")
430
+
431
+ if inputs_embeds is None:
432
+ inputs_embeds = self.embed_tokens(input_ids)
433
+
434
+ if use_cache and past_key_values is None:
435
+ past_key_values = DynamicCache()
436
+
437
+ if cache_position is None:
438
+ past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
439
+ cache_position = torch.arange(
440
+ past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
441
+ )
442
+
443
+ if position_ids is None:
444
+ position_ids = cache_position.unsqueeze(0)
445
+
446
+ causal_mask = self._update_causal_mask(
447
+ attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
448
+ )
449
+
450
+ hidden_states = inputs_embeds
451
+
452
+ # create position embeddings to be shared across the decoder layers
453
+ position_embeddings = self.rotary_emb(hidden_states, position_ids)
454
+
455
+ # decoder layers
456
+ all_hidden_states = () if output_hidden_states else None
457
+ all_self_attns = () if output_attentions else None
458
+
459
+ early_exit_point = get_global_val("early_exit_point", self.config.early_exit_points[-1])
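+        # Run the stem layers up to the selected exit point, followed by one exit layer: the original
+        # final decoder layer when the exit point is the last layer, otherwise the dedicated layer
+        # from `eelayers` that matches this exit point.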
460
+ for decoder_layer in chain(
461
+            self.layers[:early_exit_point],
462
+ [self.layers[-1] if early_exit_point == self.config.num_hidden_layers - 1 \
463
+ else self.eelayers[self.config.early_exit_points.index(early_exit_point)]]
464
+ ):
465
+ if output_hidden_states:
466
+ all_hidden_states += (hidden_states,)
467
+
468
+ if self.gradient_checkpointing and self.training:
469
+ layer_outputs = self._gradient_checkpointing_func(
470
+ partial(decoder_layer.__call__, **flash_attn_kwargs),
471
+ hidden_states,
472
+ causal_mask,
473
+ position_ids,
474
+ past_key_values,
475
+ output_attentions,
476
+ use_cache,
477
+ cache_position,
478
+ position_embeddings,
479
+ )
480
+ else:
481
+ layer_outputs = decoder_layer(
482
+ hidden_states,
483
+ attention_mask=causal_mask,
484
+ position_ids=position_ids,
485
+ past_key_value=past_key_values,
486
+ output_attentions=output_attentions,
487
+ use_cache=use_cache,
488
+ cache_position=cache_position,
489
+ position_embeddings=position_embeddings,
490
+ **flash_attn_kwargs,
491
+ )
492
+
493
+ if isinstance(layer_outputs, tuple):
494
+ hidden_states = layer_outputs[0]
495
+ else:
496
+ hidden_states = layer_outputs # deepspeed gradient checkpointing
497
+
498
+ if output_attentions:
499
+ all_self_attns += (layer_outputs[1],)
500
+
501
+ hidden_states = self.norms[self.config.early_exit_points.index(early_exit_point)](hidden_states)
502
+
503
+ # add hidden states from the last decoder layer
504
+ if output_hidden_states:
505
+ all_hidden_states += (hidden_states,)
506
+
507
+ return BaseModelOutputWithPast(
508
+ last_hidden_state=hidden_states,
509
+ past_key_values=past_key_values if use_cache else None,
510
+ hidden_states=all_hidden_states,
511
+ attentions=all_self_attns,
512
+ )
513
+
514
+ def _update_causal_mask(
515
+ self,
516
+ attention_mask: Union[torch.Tensor, "BlockMask"],
517
+ input_tensor: torch.Tensor,
518
+ cache_position: torch.Tensor,
519
+ past_key_values: Cache,
520
+ output_attentions: bool = False,
521
+ ):
522
+ if self.config._attn_implementation == "flash_attention_2":
523
+ if attention_mask is not None and past_key_values is not None:
524
+ is_padding_right = attention_mask[:, -1].sum().item() != input_tensor.size()[0]
525
+ if is_padding_right:
526
+ raise ValueError(
527
+ "You are attempting to perform batched generation with padding_side='right'"
528
+ " this may lead to unexpected behaviour for Flash Attention version of Qwen2. Make sure to "
529
+ " call `tokenizer.padding_side = 'left'` before tokenizing the input. "
530
+ )
531
+ if attention_mask is not None and 0.0 in attention_mask:
532
+ return attention_mask
533
+ return None
534
+ if self.config._attn_implementation == "flex_attention":
535
+ if isinstance(attention_mask, torch.Tensor):
536
+ attention_mask = make_flex_block_causal_mask(attention_mask)
537
+ return attention_mask
538
+
539
+ # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
540
+ # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
541
+ # to infer the attention mask.
542
+ past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
543
+ using_static_cache = isinstance(past_key_values, StaticCache)
544
+ using_sliding_window_cache = isinstance(past_key_values, SlidingWindowCache)
545
+
546
+        # When output_attentions is True, the sdpa implementation's forward method calls the eager implementation's forward
547
+ if (
548
+ self.config._attn_implementation == "sdpa"
549
+ and not (using_static_cache or using_sliding_window_cache)
550
+ and not output_attentions
551
+ ):
552
+ if AttentionMaskConverter._ignore_causal_mask_sdpa(
553
+ attention_mask,
554
+ inputs_embeds=input_tensor,
555
+ past_key_values_length=past_seen_tokens,
556
+ sliding_window=self.config.sliding_window,
557
+ is_training=self.training,
558
+ ):
559
+ return None
560
+
561
+ dtype, device = input_tensor.dtype, input_tensor.device
562
+ min_dtype = torch.finfo(dtype).min
563
+ sequence_length = input_tensor.shape[1]
564
+ # SlidingWindowCache or StaticCache
565
+ if using_sliding_window_cache or using_static_cache:
566
+ target_length = past_key_values.get_max_cache_shape()
567
+ # DynamicCache or no cache
568
+ else:
569
+ target_length = (
570
+ attention_mask.shape[-1]
571
+ if isinstance(attention_mask, torch.Tensor)
572
+ else past_seen_tokens + sequence_length + 1
573
+ )
574
+
575
+        # In case the provided `attention_mask` is 2D, we generate a causal mask here (4D).
576
+ causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
577
+ attention_mask,
578
+ sequence_length=sequence_length,
579
+ target_length=target_length,
580
+ dtype=dtype,
581
+ device=device,
582
+ cache_position=cache_position,
583
+ batch_size=input_tensor.shape[0],
584
+ config=self.config,
585
+ past_key_values=past_key_values,
586
+ )
587
+
588
+ if (
589
+ self.config._attn_implementation == "sdpa"
590
+ and attention_mask is not None
591
+ and attention_mask.device.type in ["cuda", "xpu", "npu"]
592
+ and not output_attentions
593
+ ):
594
+ # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
595
+ # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
596
+ # Details: https://github.com/pytorch/pytorch/issues/110213
597
+ causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)
598
+
599
+ return causal_mask
600
+
601
+ @staticmethod
602
+ def _prepare_4d_causal_attention_mask_with_cache_position(
603
+ attention_mask: torch.Tensor,
604
+ sequence_length: int,
605
+ target_length: int,
606
+ dtype: torch.dtype,
607
+ device: torch.device,
608
+ cache_position: torch.Tensor,
609
+ batch_size: int,
610
+ config: RuyiQwen2Config,
611
+ past_key_values: Cache,
612
+ ):
613
+ """
614
+ Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
615
+ `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.
616
+
617
+ Args:
618
+ attention_mask (`torch.Tensor`):
619
+ A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape `(batch_size, 1, query_length, key_value_length)`.
620
+ sequence_length (`int`):
621
+ The sequence length being processed.
622
+ target_length (`int`):
623
+                The target length: when generating with a static cache, the mask should be as long as the static cache to account for the 0 padding, i.e. the part of the cache that is not filled yet.
624
+ dtype (`torch.dtype`):
625
+ The dtype to use for the 4D attention mask.
626
+ device (`torch.device`):
627
+ The device to place the 4D attention mask on.
628
+ cache_position (`torch.Tensor`):
629
+ Indices depicting the position of the input sequence tokens in the sequence.
630
+            batch_size (`int`):
631
+ Batch size.
632
+            config (`RuyiQwen2Config`):
633
+ The model's configuration class
634
+ past_key_values (`Cache`):
635
+ The cache class that is being used currently to generate
636
+ """
637
+ if attention_mask is not None and attention_mask.dim() == 4:
638
+ # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
639
+ causal_mask = attention_mask
640
+ else:
641
+ min_dtype = torch.finfo(dtype).min
642
+ causal_mask = torch.full(
643
+ (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device
644
+ )
645
+ diagonal_attend_mask = torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
646
+ if config.get_text_config().sliding_window is not None:
647
+            if config.get_text_config().sliding_window is not None:  # if we have a sliding window, we should not attend to tokens beyond the sliding window length, so we mask them out as well
648
+                # the check is needed to verify whether the current checkpoint was trained with a sliding window or not
649
+ if not isinstance(past_key_values, SlidingWindowCache) or sequence_length > target_length:
650
+ sliding_attend_mask = torch.arange(target_length, device=device) <= (
651
+ cache_position.reshape(-1, 1) - config.get_text_config().sliding_window
652
+ )
653
+ diagonal_attend_mask.bitwise_or_(sliding_attend_mask)
654
+ causal_mask *= diagonal_attend_mask
655
+ causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
656
+ if attention_mask is not None:
657
+ causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit
658
+ if attention_mask.shape[-1] > target_length:
659
+ attention_mask = attention_mask[:, :target_length]
660
+ mask_length = attention_mask.shape[-1]
661
+ padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
662
+ causal_mask.device
663
+ )
664
+ padding_mask = padding_mask == 0
665
+ causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
666
+ padding_mask, min_dtype
667
+ )
668
+ return causal_mask
669
+
670
+
671
+ class KwargsForCausalLM(FlashAttentionKwargs, LossKwargs): ...
672
+
673
+
674
+ class RuyiQwen2ForCausalLM(RuyiQwen2PreTrainedModel, GenerationMixin):
675
+ _tied_weights_keys = ["lm_head.weight"]
676
+ _tp_plan = {"lm_head": "colwise_rep"}
677
+ _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}
678
+
679
+ def __init__(self, config):
680
+ super().__init__(config)
681
+ self.config = config
682
+ self.model = RuyiQwen2Model(config)
683
+ self.vocab_size = config.vocab_size
684
+ self.shared_heads = config.shared_heads
685
+ if self.shared_heads:
686
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
687
+ else:
688
+ self.lm_head = nn.ModuleList(
689
+ [nn.Linear(config.hidden_size, config.vocab_size, bias=False) for _ in config.early_exit_points]
690
+ )
691
+
692
+ # Initialize weights and apply final processing
693
+ self.post_init()
694
+
695
+ def get_input_embeddings(self):
696
+ return self.model.embed_tokens
697
+
698
+ def set_input_embeddings(self, value):
699
+ self.model.embed_tokens = value
700
+
701
+ def get_output_embeddings(self):
702
+ return self.lm_head
703
+
704
+ def set_output_embeddings(self, new_embeddings):
705
+ self.lm_head = new_embeddings
706
+
707
+ def set_decoder(self, decoder):
708
+ self.model = decoder
709
+
710
+ def get_decoder(self):
711
+ return self.model
712
+
713
+ def save_pretrained(self, save_directory, **kwargs):
714
+ super().save_pretrained(save_directory, **kwargs)
715
+ shutil.copyfile(
716
+ os.path.abspath(__file__),
717
+ os.path.join(save_directory, "modeling_ruyi_qwen2.py")
718
+ )
719
+
720
+ @can_return_tuple
721
+ def forward(
722
+ self,
723
+ input_ids: Optional[torch.LongTensor] = None,
724
+ attention_mask: Optional[torch.Tensor] = None,
725
+ position_ids: Optional[torch.LongTensor] = None,
726
+ past_key_values: Optional[Cache] = None,
727
+ inputs_embeds: Optional[torch.FloatTensor] = None,
728
+ labels: Optional[torch.LongTensor] = None,
729
+ use_cache: Optional[bool] = None,
730
+ output_attentions: Optional[bool] = None,
731
+ output_hidden_states: Optional[bool] = None,
732
+ cache_position: Optional[torch.LongTensor] = None,
733
+ logits_to_keep: Union[int, torch.Tensor] = 0,
734
+ **kwargs: Unpack[KwargsForCausalLM],
735
+ ) -> CausalLMOutputWithPast:
736
+
737
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
738
+ output_hidden_states = (
739
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
740
+ )
741
+
742
+        # decoder outputs consist of (dec_features, layer_state, dec_hidden, dec_attn)
743
+ outputs: BaseModelOutputWithPast = self.model(
744
+ input_ids=input_ids,
745
+ attention_mask=attention_mask,
746
+ position_ids=position_ids,
747
+ past_key_values=past_key_values,
748
+ inputs_embeds=inputs_embeds,
749
+ use_cache=use_cache,
750
+ output_attentions=output_attentions,
751
+ output_hidden_states=output_hidden_states,
752
+ cache_position=cache_position,
753
+ **kwargs,
754
+ )
755
+
756
+ hidden_states = outputs.last_hidden_state
757
+ # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
758
+ slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
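+        # A single shared lm_head serves every exit point when `shared_heads` is set; otherwise the
+        # head that corresponds to the currently selected early-exit point is used.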
759
+ if self.shared_heads:
760
+ logits = self.lm_head(hidden_states[:, slice_indices, :])
761
+ else:
762
+ early_exit_point = get_global_val("early_exit_point", self.config.early_exit_points[-1])
763
+ logits = self.lm_head[self.config.early_exit_points.index(early_exit_point)](hidden_states[:, slice_indices, :])
764
+
765
+ loss = None
766
+ if labels is not None:
767
+ loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs)
768
+
769
+ return CausalLMOutputWithPast(
770
+ loss=loss,
771
+ logits=logits,
772
+ past_key_values=outputs.past_key_values,
773
+ hidden_states=outputs.hidden_states,
774
+ attentions=outputs.attentions,
775
+ )
776
+
777
+
778
+ __all__ = [
779
+ "RuyiQwen2PreTrainedModel",
780
+ "RuyiQwen2Model",
781
+ "RuyiQwen2ForCausalLM",
782
+ ]
special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|endoftext|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": {
25
+ "content": "<|endoftext|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9c5ae00e602b8860cbd784ba82a8aa14e8feecec692e7076590d014d7b7fdafa
3
+ size 11421896
tokenizer_config.json ADDED
@@ -0,0 +1,208 @@
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ }
181
+ },
182
+ "additional_special_tokens": [
183
+ "<|im_start|>",
184
+ "<|im_end|>",
185
+ "<|object_ref_start|>",
186
+ "<|object_ref_end|>",
187
+ "<|box_start|>",
188
+ "<|box_end|>",
189
+ "<|quad_start|>",
190
+ "<|quad_end|>",
191
+ "<|vision_start|>",
192
+ "<|vision_end|>",
193
+ "<|vision_pad|>",
194
+ "<|image_pad|>",
195
+ "<|video_pad|>"
196
+ ],
197
+ "bos_token": null,
198
+ "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] }}\n {%- else %}\n {{- 'You are a helpful assistant.' }}\n {%- endif %}\n {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0]['role'] == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n {%- else %}\n {{- '<|im_start|>system\\nYou are a helpful assistant.<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role }}\n {%- if message.content %}\n {{- '\\n' + message.content }}\n {%- endif %}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '\\n<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- message.content }}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n",
199
+ "clean_up_tokenization_spaces": false,
200
+ "eos_token": "<|endoftext|>",
201
+ "errors": "replace",
202
+ "extra_special_tokens": {},
203
+ "model_max_length": 131072,
204
+ "pad_token": "<|endoftext|>",
205
+ "split_special_tokens": false,
206
+ "tokenizer_class": "Qwen2Tokenizer",
207
+ "unk_token": null
208
+ }
vocab.json ADDED
The diff for this file is too large to render. See raw diff