Upload folder using huggingface_hub

- .gitattributes +54 -35
- .msc +0 -0
- .mv +1 -0
- README.md +295 -3
- README_en.md +294 -0
- added_tokens.json +24 -0
- assets/AI-Flow-Ruyi-logo.png +3 -0
- assets/ai-flow.png +3 -0
- assets/logo.png +0 -0
- assets/ruyi_model.png +0 -0
- config.json +43 -0
- configuration.json +1 -0
- configuration_ruyi_qwen2.py +119 -0
- generation_config.json +6 -0
- merges.txt +0 -0
- model-00001-of-00005.safetensors +3 -0
- model-00002-of-00005.safetensors +3 -0
- model-00003-of-00005.safetensors +3 -0
- model-00004-of-00005.safetensors +3 -0
- model-00005-of-00005.safetensors +3 -0
- model.safetensors.index.json +402 -0
- modeling_ruyi_qwen2.py +782 -0
- special_tokens_map.json +31 -0
- tokenizer.json +3 -0
- tokenizer_config.json +208 -0
- vocab.json +0 -0
.gitattributes
CHANGED
@@ -1,35 +1,54 @@
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bin.* filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zstandard filter=lfs diff=lfs merge=lfs -text
*.tfevents* filter=lfs diff=lfs merge=lfs -text
*.db* filter=lfs diff=lfs merge=lfs -text
*.ark* filter=lfs diff=lfs merge=lfs -text
**/*ckpt*data* filter=lfs diff=lfs merge=lfs -text
**/*ckpt*.meta filter=lfs diff=lfs merge=lfs -text
**/*ckpt*.index filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.gguf* filter=lfs diff=lfs merge=lfs -text
*.ggml filter=lfs diff=lfs merge=lfs -text
*.llamafile* filter=lfs diff=lfs merge=lfs -text
*.pt2 filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.tar filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
model-00001-of-00005.safetensors filter=lfs diff=lfs merge=lfs -text
model-00002-of-00005.safetensors filter=lfs diff=lfs merge=lfs -text
model-00003-of-00005.safetensors filter=lfs diff=lfs merge=lfs -text
model-00004-of-00005.safetensors filter=lfs diff=lfs merge=lfs -text
model-00005-of-00005.safetensors filter=lfs diff=lfs merge=lfs -text
tokenizer.json filter=lfs diff=lfs merge=lfs -text
assets/AI-Flow-Ruyi-logo.png filter=lfs diff=lfs merge=lfs -text
assets/ai-flow.png filter=lfs diff=lfs merge=lfs -text
.msc
ADDED
Binary file (1.77 kB).
.mv
ADDED
@@ -0,0 +1 @@
Revision:master,CreatedAt:1753426552
README.md
CHANGED
@@ -1,3 +1,295 @@
---
frameworks:
- Pytorch
license: apache-2.0
tasks:
- text-generation
---

# AI-Flow-Ruyi (如意大模型)

<p align="center">
<img src="assets/AI-Flow-Ruyi-logo.png" width="500" />
</p>

<p align="center">
<a href="README.md">中文</a> | <a href="README_en.md">English</a>
<br>
🐱 <a href="https://github.com/TeleAI-AI-Flow/AI-Flow-Ruyi">GitHub</a> | 🤗 <a href="https://huggingface.co/TeleAI-AI-Flow/AI-Flow-Ruyi-7B-Preview0704">Hugging Face</a> | 🤖 <a href="https://www.modelscope.cn/models/TeleAI-AI-Flow/AI-Flow-Ruyi-7B-Preview0704/">ModelScope</a> | 📑 <a href="https://www.arxiv.org/abs/2506.12479">Paper</a>
</p>

#### Long long ago...

> A magic staff was treasured in the Dragon Palace; it could grow large or small at will, transforming without end. One day the idle Dragon King sighed to it: "With such powers, if only you could help our dragon clan in other ways." Before he had finished, the staff replied: "I have an idea. If this power of transformation were used to solve people's problems..." No sooner said than done: the staff transformed itself into an all-capable "Ruyi" large model, able to stretch or shrink its capacity to match the difficulty of each question. The delighted Dragon King said: "Is this not exactly the wish-granting ('Ruyi') treasure that relieves people's troubles?" So he named it "Ruyi" and sent it to the human world to help people.

## News

* 🎉🎉[2025/7/25]: Official release of Ruyi-7B (AI-Flow-Ruyi-7B)
* 🎉🎉[2025/7/14]: AI Flow was covered by the well-known Chinese tech outlet [机器之心](https://mp.weixin.qq.com/s/fiyb3LyJOd5mr9xzAsDZ4A)!
* 🎉🎉[2025/7/4]: AI Flow was covered in a brief by the global analyst firm [Omdia](https://omdia.tech.informa.com/om137892/on-the-radar-teleai-brings-intelligence-to-the-network-edge-through-ai-flow), listed as a generative-AI deployment to watch.
* 🎉🎉[2025/7/4]: Release of Ruyi-7B Preview (AI-Flow-Ruyi-7B-Preview)

## Introduction

**AI-Flow-Ruyi (如意大模型)** is a **familial model** developed by the AI Flow team at the Institute of Artificial Intelligence (TeleAI), China Telecom, for next-generation device-edge-cloud model serving architectures. Its core idea is that large and small models share the same source parameters: using an early-exit mechanism, the model answers each query with a branch whose parameter scale matches the question's complexity. Each branch can run independently, yet the shared parameters let branches exchange information and switch seamlessly. Combined with distributed device-edge-cloud deployment, the large and small family members cooperate at inference time, greatly improving distributed inference efficiency.

![ai-flow](assets/ai-flow.png)
![ruyi_model](assets/ruyi_model.png)

## Ruyi-7B

To let the community experience a freely scalable "familial model" first-hand, we open-source Ruyi-7B (AI-Flow-Ruyi-7B), released on July 25, demonstrating our commitment to putting the technology into practice. Its largest branch has 7B parameters and spawns early-exit branches with effective parameter counts of 3B, 4B, 5B, and 6B:

* The 3B and 4B branches target simple dialogue, with fast responses and low resource demands;
* The 5B and 6B branches target everyday general-purpose tasks, balancing capability against responsiveness;
* The 7B branch handles complex problems with more well-rounded capabilities, at the cost of somewhat slower responses and higher resource demands.

|Exit No.|Early-Exit Layer|Equivalent Model Size|Branch Name|Target Scenario|
|:-:|:-:|:-:|:-:|:-:|
|1|Layer 11|3B|AI-Flow-Ruyi-7B-E3B|Simple dialogue|
|2|Layer 15|4B|AI-Flow-Ruyi-7B-E4B|Simple dialogue|
|3|Layer 19|5B|AI-Flow-Ruyi-7B-E5B|Daily tasks|
|4|Layer 23|6B|AI-Flow-Ruyi-7B-E6B|Daily tasks|
|5|Layer 27|7B|AI-Flow-Ruyi-7B-E7B|Complex problems|

### Training process

Before training, we initialized the 7B main branch from the Qwen team's pre-trained [Qwen2.5-7B](https://arxiv.org/abs/2412.15115) (itself pre-trained on 18 trillion high-quality tokens); the decoder layer of each early-exit branch was initialized with the parameters of the layer immediately after its exit position, as sketched below.
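A minimal sketch of this initialization scheme, assuming the module names from this checkpoint's weight map (`model.layers` for the main branch, `model.eelayers` for the early-exit branches; the fifth exit at layer 27 is the full-depth branch and needs no copy). This is an illustration, not the actual training code.

```py
EARLY_EXIT_POINTS = [11, 15, 19, 23, 27]  # from config.json

def init_early_exit_branches(model):
    """Initialize each early-exit decoder layer from the layer that
    follows its exit position in the 7B main branch."""
    for i, exit_layer in enumerate(EARLY_EXIT_POINTS[:-1]):
        src = model.model.layers[exit_layer + 1]
        model.model.eelayers[i].load_state_dict(src.state_dict())
```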
After initialization, we applied **multi-branch joint pre-training**, continuing pre-training on roughly 400 billion tokens of proprietary high-quality data to build the base model, AI-Flow-Ruyi-7B-Base.

We then performed **joint instruction-following fine-tuning** of all branches on roughly 0.7 million high-quality instruction samples, yielding Ruyi-7B.

### Performance evaluation

We evaluate on multiple datasets in a 0-shot setting with [OpenCompass](https://github.com/open-compass/opencompass) and its official configuration files.

<details>
<summary>General tasks</summary>

|Model|MMLU|MMLU-Pro|CMMLU|BBH|ARC-c|HellaSwag|Winogrande|Mean|
|:-:|:-:|:-:|:-:|:-:|:-:|:-:|:-:|:-:|
|Qwen3-8B(think)|74.78|66.02|76.33|60.68|63.39|66.11|56.25|66.22|
|Llama3.1-8B-Instruct|53.16|45.36|51.65|72.47|83.73|71.37|58.54|62.33|
|Qwen2.5-7B-Instruct|70.88|56.33|75.71|51.51|86.44|81.13|68.30|70.04|
|AI-Flow-Ruyi-7B-E7B-0725<b>(ours)</b>|64.78|56.39|76.17|81.37|82.71|76.69|63.22|71.62|

</details>

<details>
<summary>Code tasks</summary>

|Model|HumanEval|MBPP|LiveCodeBench|Mean|
|:-:|:-:|:-:|:-:|:-:|
|Qwen3-8B(think)|84.76|78.60|63.10|75.49|
|Qwen2.5-7B-Instruct|63.41|68.48|8.15|46.68|
|Llama3.1-8B-Instruct|84.15|70.82|34.55|63.17|
|AI-Flow-Ruyi-7B-E7B-0725<b>(ours)</b>|76.83|77.04|28.44|60.77|

</details>

<details>
<summary>STEM tasks</summary>

|Model|GPQA|Math|GSM-8K|Mean|
|:-:|:-:|:-:|:-:|:-:|
|Qwen3-8B(think)|38.38|83.84|93.03|71.75|
|Qwen2.5-7B-Instruct|25.25|49.22|85.82|53.43|
|Llama3.1-8B-Instruct|35.35|73.66|88.48|65.83|
|AI-Flow-Ruyi-7B-E7B-0725<b>(ours)</b>|30.30|72.18|91.36|64.61|

</details>

At the same time, the performance of each early-exit branch increases monotonically with its equivalent parameter count.

|Model|MMLU|MMLU-Pro|CMMLU|BBH|ARC-c|HellaSwag|Winogrande|Mean|
|:-:|:-:|:-:|:-:|:-:|:-:|:-:|:-:|:-:|
|AI-Flow-Ruyi-7B-E3B-0725<b>(ours)</b>|34.67|17.49|43.99|31.63|47.12|31.20|49.59|36.53|
|AI-Flow-Ruyi-7B-E4B-0725<b>(ours)</b>|52.63|30.10|45.04|50.94|77.63|61.63|51.99|52.85|
|AI-Flow-Ruyi-7B-E5B-0725<b>(ours)</b>|61.09|48.54|66.64|75.41|82.03|74.91|61.46|67.15|
|AI-Flow-Ruyi-7B-E6B-0725<b>(ours)</b>|63.96|53.98|74.95|79.33|81.36|76.64|62.96|70.45|
|AI-Flow-Ruyi-7B-E7B-0725<b>(ours)</b>|64.78|56.39|76.17|81.37|82.71|76.69|63.22|71.62|

<details>
<summary>[History] Ruyi-7B Preview</summary>

## Ruyi-7B Preview

To let the community experience a freely scalable "familial model" first-hand, we open-sourced Ruyi-7B Preview (AI-Flow-Ruyi-7B-Preview), released on July 4. Its largest branch has 7B parameters and spawns early-exit branches with effective parameter counts of 3B, 4B, 5B, and 6B:

* The 3B and 4B branches target simple dialogue, with fast responses and low resource demands;
* The 5B and 6B branches target everyday general-purpose tasks, balancing capability against responsiveness;
* The 7B branch handles complex problems with more well-rounded capabilities, at the cost of somewhat slower responses and higher resource demands.

|Exit No.|Early-Exit Layer|Equivalent Model Size|Branch Name|Target Scenario|
|:-:|:-:|:-:|:-:|:-:|
|1|Layer 11|3B|AI-Flow-Ruyi-7B-E3B|Simple dialogue|
|2|Layer 15|4B|AI-Flow-Ruyi-7B-E4B|Simple dialogue|
|3|Layer 19|5B|AI-Flow-Ruyi-7B-E5B|Daily tasks|
|4|Layer 23|6B|AI-Flow-Ruyi-7B-E6B|Daily tasks|
|5|Layer 27|7B|AI-Flow-Ruyi-7B-E7B|Complex problems|

### Training process

Before training, we initialized the 7B main branch from the Qwen team's pre-trained [Qwen2.5-7B](https://arxiv.org/abs/2412.15115) (itself pre-trained on 18 trillion high-quality tokens); the decoder layer of each early-exit branch was initialized with the parameters of the layer immediately after its exit position.

After initialization, we applied **multi-branch joint pre-training**, continuing pre-training on roughly 400 billion tokens of proprietary high-quality data to build the base model, AI-Flow-Ruyi-7B-Base.

We then performed **joint instruction-following fine-tuning** of all branches on roughly 1.2 million high-quality instruction samples, yielding Ruyi-7B Preview.

### Performance evaluation

We evaluate on multiple datasets in a 0-shot setting with [OpenCompass](https://github.com/open-compass/opencompass) and its official configuration files. The results show that the 7B main branch is roughly on par with Qwen2.5-7B-Instruct on general-purpose tasks.

<details>
<summary>General tasks</summary>

|Model|MMLU|MMLU-Pro|CMMLU|ARC-c|BBH|Mean|
|:-:|:-:|:-:|:-:|:-:|:-:|:-:|
|Qwen3-8B(think)|74.78|66.02|76.33|63.39|60.68|68.24|
|Qwen2.5-7B-Instruct|70.88|56.33|75.71|86.44|51.51|68.17|
|Llama-3.1-8B-Instruct|53.16|45.36|51.65|83.73|72.47|61.27|
|AI-Flow-Ruyi-7B-E7B<b>(ours)</b>|87.19|59.78|48.14|69.83|74.47|67.88|

</details>

<details>
<summary>Code tasks</summary>

|Model|MBPP|HumanEval|LiveCodeBench|Mean|
|:-:|:-:|:-:|:-:|:-:|
|Qwen3-8B(think)|78.60|84.76|63.10|75.49|
|Qwen2.5-7B-Instruct|70.82|84.15|34.55|63.17|
|Llama3.1-8B-Instruct|68.48|63.41|8.15|46.68|
|AI-Flow-Ruyi-7B-E7B<b>(ours)</b>|66.93|64.63|30.01|53.86|

</details>

<details>
<summary>STEM tasks</summary>

|Model|Math|GPQA|GSM-8K|Mean|
|:-:|:-:|:-:|:-:|:-:|
|Qwen3-8B(think)|83.84|38.38|93.03|71.75|
|Qwen2.5-7B-Instruct|73.66|35.35|88.48|65.83|
|Llama3.1-8B-Instruct|49.22|25.25|85.82|53.43|
|AI-Flow-Ruyi-7B-E7B<b>(ours)</b>|44.94|24.75|81.65|50.45|

</details>

At the same time, the performance of each early-exit branch increases monotonically with its equivalent parameter count.

|Model|MMLU|MMLU-Pro|CMMLU|ARC-c|BBH|Mean|
|:-:|:-:|:-:|:-:|:-:|:-:|:-:|
|AI-Flow-Ruyi-7B-E3B<b>(ours)</b>|66.93|44.70|19.80|40.00|32.29|40.74|
|AI-Flow-Ruyi-7B-E4B<b>(ours)</b>|78.86|48.60|26.51|58.98|41.98|50.99|
|AI-Flow-Ruyi-7B-E5B<b>(ours)</b>|75.34|49.13|33.91|65.76|64.48|57.72|
|AI-Flow-Ruyi-7B-E6B<b>(ours)</b>|84.58|53.06|33.94|73.22|47.33|58.43|
|AI-Flow-Ruyi-7B-E7B<b>(ours)</b>|87.19|59.78|48.14|69.83|74.47|67.88|

</details>

## Usage

Step 1. Create and activate a virtual environment

```sh
conda create -n ruyi python=3.12
conda activate ruyi
```

Step 2. Clone this repository

```sh
git clone https://github.com/TeleAI-AI-Flow/AI-Flow-Ruyi.git
cd AI-Flow-Ruyi
```

Step 3. Install from source (note: compiling flash_attn is slow; we recommend downloading a prebuilt wheel from the [official repository](https://github.com/Dao-AILab/flash-attention/releases/tag/v2.7.4.post1) and installing it manually)

```sh
pip install -e .
```

Step 4. Download the model weights

```sh
git clone https://www.modelscope.cn/TeleAI-AI-Flow/AI-Flow-Ruyi-7B-0725.git models/AI-Flow-Ruyi-7B-0725
```

Step 5. Run the demo

```sh
python demo.py
```

<details>
<summary>View demo code</summary>

```py
import torch
from ruyi.global_var import set_global_val
from transformers import GenerationConfig
from transformers import AutoModelForCausalLM, AutoTokenizer


model_path = "models/AI-Flow-Ruyi-7B-0725"
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True, attn_implementation='flash_attention_2', torch_dtype=torch.bfloat16).to('cuda')


generation_config = GenerationConfig(
    do_sample=True,
    top_k=30,
    top_p=0.95,
    temperature=0.6,
    repetition_penalty=1.2,
    no_repeat_ngram_size=3,
    max_new_tokens=8192
)

# Input text
messages = [
    {"role": "user", "content": "Introduce yourself."},
]

# Apply the chat template
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = tokenizer(prompt, return_tensors="pt")

# Generation
with torch.no_grad():
    # Set the early-exit point:
    # - 11: first early-exit point, roughly 3B
    # - 15: second early-exit point, roughly 4B
    # - 19: third early-exit point, roughly 5B
    # - 23: fourth early-exit point, roughly 6B
    # - 27: fifth early-exit point, roughly 7B
    set_global_val("early_exit_point", 11)

    output = model.generate(
        inputs["input_ids"].to('cuda'),
        generation_config=generation_config
    )

# Decode and print the result
generated_text = tokenizer.decode(output[0], skip_special_tokens=False)
print(generated_text)
```

</details>

## Citation

```bibtex
@misc{an2025aiflowperspectivesscenarios,
      title={AI Flow: Perspectives, Scenarios, and Approaches},
      author={Hongjun An and Wenhan Hu and Sida Huang and Siqi Huang and Ruanjun Li and Yuanzhi Liang and Jiawei Shao and Yiliang Song and Zihan Wang and Cheng Yuan and Chi Zhang and Hongyuan Zhang and Wenhao Zhuang and Xuelong Li},
      year={2025},
      eprint={2506.12479},
      archivePrefix={arXiv},
      primaryClass={cs.AI},
      url={https://arxiv.org/abs/2506.12479},
}
```
README_en.md
ADDED
@@ -0,0 +1,294 @@
---
frameworks:
- Pytorch
license: Apache License 2.0
tasks:
- text-generation
---

# AI-Flow-Ruyi (如意大模型)

<p align="center">
<img src="assets/AI-Flow-Ruyi-logo.png" width="500" />
</p>

<p align="center">
<a href="README.md">中文</a> | <a href="README_en.md">English</a>
<br>
🐱 <a href="https://github.com/TeleAI-AI-Flow/AI-Flow-Ruyi">GitHub</a> | 🤗 <a href="https://huggingface.co/TeleAI-AI-Flow/AI-Flow-Ruyi-7B-Preview0704">Hugging Face</a> | 🤖 <a href="https://www.modelscope.cn/models/TeleAI-AI-Flow/AI-Flow-Ruyi-7B-Preview0704/">ModelScope</a> | 📑 <a href="https://www.arxiv.org/abs/2506.12479">Paper</a>
</p>

## News

* 🎉🎉[2025/7/25]: AI-Flow-Ruyi-7B released!
* 🎉🎉[2025/7/4]: TeleAI's AI Flow is now on the radar of the global analyst firm [Omdia](https://omdia.tech.informa.com/om137892/on-the-radar-teleai-brings-intelligence-to-the-network-edge-through-ai-flow) as a generative-AI solution to watch.
* 🎉🎉[2025/7/4]: AI-Flow-Ruyi-7B-Preview released!

## Introduction

**AI-Flow-Ruyi** is a **familial model** developed by the AI Flow team of the Institute of Artificial Intelligence (TeleAI), China Telecom. Designed for next-generation device-edge-cloud model service architectures, its core innovation lies in **shared familial parameters** across large and small models. Leveraging an **early-exit mechanism**, the system dynamically routes queries to branch models of appropriate parameter sizes based on problem complexity. These branches operate independently while enabling **information sharing** and **seamless transitions** through their shared parameters. Combined with distributed device-edge-cloud deployment, this facilitates **collaborative inference** within the model family, significantly improving distributed inference efficiency.

![ai-flow](assets/ai-flow.png)
![ruyi_model](assets/ruyi_model.png)

## AI-Flow-Ruyi-7B

To give the community a hands-on experience with a truly elastic "family of models," we are open-sourcing Ruyi-7B (AI-Flow-Ruyi-7B), released on July 25, demonstrating our commitment to putting the technology into practice. Its largest branch contains 7 billion parameters and can spawn early-exit sub-networks with effective parameter counts of 3B, 4B, 5B, and 6B.

Key branch specializations:
* **3B/4B branches**: Optimized for simple dialogue, delivering **faster responses** with **minimal resource consumption**
* **5B/6B branches**: Targeting daily general-purpose tasks, **striking a balance** between capability and responsiveness
* **7B branch**: Designed for complex problem-solving, **exhibiting more well-rounded capabilities** across multiple dimensions, though with **moderately slower inference** and **higher resource demands**

|Exit No.|Early-Exit Layer|Equivalent Model Size|Branch Name|Target Scenario|
|:-:|:-:|:-:|:-:|:-:|
|1|Layer 11|3B|AI-Flow-Ruyi-7B-E3B|Simple dialogue|
|2|Layer 15|4B|AI-Flow-Ruyi-7B-E4B|Simple dialogue|
|3|Layer 19|5B|AI-Flow-Ruyi-7B-E5B|Daily tasks|
|4|Layer 23|6B|AI-Flow-Ruyi-7B-E6B|Daily tasks|
|5|Layer 27|7B|AI-Flow-Ruyi-7B-E7B|Complex problems|

### Training process

Before training, we initialized the 7B main branch using the Qwen team's pre-trained [Qwen2.5-7B](https://arxiv.org/abs/2412.15115) (itself pre-trained on 18 trillion high-quality tokens). The decoder layer of each early-exit branch was initialized with the parameters of the layer immediately after its exit position.

Following initialization, we conducted **multi-branch joint pre-training** on approximately 400 billion tokens of proprietary high-quality data, producing the foundation model AI-Flow-Ruyi-7B-Base.

Subsequently, we performed **joint instruction-following fine-tuning** across all branches on roughly 0.7 million high-quality instruction samples, yielding AI-Flow-Ruyi-7B.

### Performance evaluation

We evaluate on multiple datasets in a 0-shot setting with [OpenCompass](https://github.com/open-compass/opencompass) and its official configuration files.

<details>
<summary>General tasks</summary>

|Model|MMLU|MMLU-Pro|CMMLU|BBH|ARC-c|HellaSwag|Winogrande|Mean|
|:-:|:-:|:-:|:-:|:-:|:-:|:-:|:-:|:-:|
|Qwen3-8B(think)|74.78|66.02|76.33|60.68|63.39|66.11|56.25|66.22|
|Llama3.1-8B-Instruct|53.16|45.36|51.65|72.47|83.73|71.37|58.54|62.33|
|Qwen2.5-7B-Instruct|70.88|56.33|75.71|51.51|86.44|81.13|68.30|70.04|
|AI-Flow-Ruyi-7B-E7B-0725<b>(ours)</b>|64.78|56.39|76.17|81.37|82.71|76.69|63.22|71.62|

</details>

<details>
<summary>Code tasks</summary>

|Model|HumanEval|MBPP|LiveCodeBench|Mean|
|:-:|:-:|:-:|:-:|:-:|
|Qwen3-8B(think)|84.76|78.60|63.10|75.49|
|Qwen2.5-7B-Instruct|63.41|68.48|8.15|46.68|
|Llama3.1-8B-Instruct|84.15|70.82|34.55|63.17|
|AI-Flow-Ruyi-7B-E7B-0725<b>(ours)</b>|76.83|77.04|28.44|60.77|

</details>

<details>
<summary>STEM tasks</summary>

|Model|GPQA|Math|GSM-8K|Mean|
|:-:|:-:|:-:|:-:|:-:|
|Qwen3-8B(think)|38.38|83.84|93.03|71.75|
|Qwen2.5-7B-Instruct|25.25|49.22|85.82|53.43|
|Llama3.1-8B-Instruct|35.35|73.66|88.48|65.83|
|AI-Flow-Ruyi-7B-E7B-0725<b>(ours)</b>|30.30|72.18|91.36|64.61|

</details>

At the same time, the performance of each early-exit branch increases monotonically with its equivalent parameter count.

|Model|MMLU|MMLU-Pro|CMMLU|BBH|ARC-c|HellaSwag|Winogrande|Mean|
|:-:|:-:|:-:|:-:|:-:|:-:|:-:|:-:|:-:|
|AI-Flow-Ruyi-7B-E3B-0725<b>(ours)</b>|34.67|17.49|43.99|31.63|47.12|31.20|49.59|36.53|
|AI-Flow-Ruyi-7B-E4B-0725<b>(ours)</b>|52.63|30.10|45.04|50.94|77.63|61.63|51.99|52.85|
|AI-Flow-Ruyi-7B-E5B-0725<b>(ours)</b>|61.09|48.54|66.64|75.41|82.03|74.91|61.46|67.15|
|AI-Flow-Ruyi-7B-E6B-0725<b>(ours)</b>|63.96|53.98|74.95|79.33|81.36|76.64|62.96|70.45|
|AI-Flow-Ruyi-7B-E7B-0725<b>(ours)</b>|64.78|56.39|76.17|81.37|82.71|76.69|63.22|71.62|

<details>
<summary>[History] AI-Flow-Ruyi-7B-Preview</summary>

## AI-Flow-Ruyi-7B-Preview

To give the community a hands-on experience with a truly elastic "family of models," we open-sourced Ruyi-7B Preview (AI-Flow-Ruyi-7B-Preview), released on July 4. Its largest branch contains 7 billion parameters and can spawn early-exit sub-networks with effective parameter counts of 3B, 4B, 5B, and 6B.

Key branch specializations:
* **3B/4B branches**: Optimized for simple dialogue, delivering **faster responses** with **minimal resource consumption**
* **5B/6B branches**: Targeting daily general-purpose tasks, **striking a balance** between capability and responsiveness
* **7B branch**: Designed for complex problem-solving, **exhibiting more well-rounded capabilities** across multiple dimensions, though with **moderately slower inference** and **higher resource demands**

|Exit No.|Early-Exit Layer|Equivalent Model Size|Branch Name|Target Scenario|
|:-:|:-:|:-:|:-:|:-:|
|1|Layer 11|3B|AI-Flow-Ruyi-7B-E3B|Simple dialogue|
|2|Layer 15|4B|AI-Flow-Ruyi-7B-E4B|Simple dialogue|
|3|Layer 19|5B|AI-Flow-Ruyi-7B-E5B|Daily tasks|
|4|Layer 23|6B|AI-Flow-Ruyi-7B-E6B|Daily tasks|
|5|Layer 27|7B|AI-Flow-Ruyi-7B-E7B|Complex problems|

### Training process

Before training, we initialized the 7B main branch using the Qwen team's pre-trained [Qwen2.5-7B](https://arxiv.org/abs/2412.15115) (itself pre-trained on 18 trillion high-quality tokens). The decoder layer of each early-exit branch was initialized with the parameters of the layer immediately after its exit position.

Following initialization, we conducted **multi-branch joint pre-training** on approximately 400 billion tokens of proprietary high-quality data, producing the foundation model AI-Flow-Ruyi-7B-Base.

Subsequently, we performed **joint instruction-following fine-tuning** across all branches on roughly 1.2 million high-quality instruction samples, yielding AI-Flow-Ruyi-7B-Preview.

### Performance evaluation

We evaluate on multiple datasets in a 0-shot setting with [OpenCompass](https://github.com/open-compass/opencompass) and its official configuration files. The results show that the 7B main branch is roughly on par with Qwen2.5-7B-Instruct on general-purpose tasks.

<details>
<summary>General tasks</summary>

|Model|MMLU|MMLU-Pro|CMMLU|ARC-c|BBH|Mean|
|:-:|:-:|:-:|:-:|:-:|:-:|:-:|
|Qwen3-8B(think)|74.78|66.02|76.33|63.39|60.68|68.24|
|Qwen2.5-7B-Instruct|70.88|56.33|75.71|86.44|51.51|68.17|
|Llama-3.1-8B-Instruct|53.16|45.36|51.65|83.73|72.47|61.27|
|AI-Flow-Ruyi-7B-E7B<b>(ours)</b>|87.19|59.78|48.14|69.83|74.47|67.88|

</details>

<details>
<summary>Code tasks</summary>

|Model|MBPP|HumanEval|LiveCodeBench|Mean|
|:-:|:-:|:-:|:-:|:-:|
|Qwen3-8B(think)|78.60|84.76|63.10|75.49|
|Qwen2.5-7B-Instruct|70.82|84.15|34.55|63.17|
|Llama3.1-8B-Instruct|68.48|63.41|8.15|46.68|
|AI-Flow-Ruyi-7B-E7B<b>(ours)</b>|66.93|64.63|30.01|53.86|

</details>

<details>
<summary>STEM tasks</summary>

|Model|Math|GPQA|GSM-8K|Mean|
|:-:|:-:|:-:|:-:|:-:|
|Qwen3-8B(think)|83.84|38.38|93.03|71.75|
|Qwen2.5-7B-Instruct|73.66|35.35|88.48|65.83|
|Llama3.1-8B-Instruct|49.22|25.25|85.82|53.43|
|AI-Flow-Ruyi-7B-E7B<b>(ours)</b>|44.94|24.75|81.65|50.45|

</details>

At the same time, the performance of each early-exit branch increases monotonically with its equivalent parameter count.

|Model|MMLU|MMLU-Pro|CMMLU|ARC-c|BBH|Mean|
|:-:|:-:|:-:|:-:|:-:|:-:|:-:|
|AI-Flow-Ruyi-7B-E3B<b>(ours)</b>|66.93|44.70|19.80|40.00|32.29|40.74|
|AI-Flow-Ruyi-7B-E4B<b>(ours)</b>|78.86|48.60|26.51|58.98|41.98|50.99|
|AI-Flow-Ruyi-7B-E5B<b>(ours)</b>|75.34|49.13|33.91|65.76|64.48|57.72|
|AI-Flow-Ruyi-7B-E6B<b>(ours)</b>|84.58|53.06|33.94|73.22|47.33|58.43|
|AI-Flow-Ruyi-7B-E7B<b>(ours)</b>|87.19|59.78|48.14|69.83|74.47|67.88|

</details>

## Usage

Step 1. Create and activate a virtual environment

```sh
conda create -n ruyi python=3.12
conda activate ruyi
```

Step 2. Clone this repository

```sh
git clone https://github.com/TeleAI-AI-Flow/AI-Flow-Ruyi.git
cd AI-Flow-Ruyi
```

Step 3. Install from source (note: compiling flash_attn is slow; we recommend downloading a prebuilt wheel from the [official repository](https://github.com/Dao-AILab/flash-attention/releases/tag/v2.7.4.post1) and installing it manually)

```sh
pip install -e .
```

Step 4. Download the model weights

```sh
git clone https://www.modelscope.cn/TeleAI-AI-Flow/AI-Flow-Ruyi-7B-0725.git models/AI-Flow-Ruyi-7B-0725
```

Step 5. Run the demo

```sh
python demo.py
```

<details>
<summary>View demo code</summary>

```py
import torch
from ruyi.global_var import set_global_val
from transformers import GenerationConfig
from transformers import AutoModelForCausalLM, AutoTokenizer


model_path = "models/AI-Flow-Ruyi-7B-0725"
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True, attn_implementation='flash_attention_2', torch_dtype=torch.bfloat16).to('cuda')


generation_config = GenerationConfig(
    do_sample=True,
    top_k=30,
    top_p=0.95,
    temperature=0.6,
    repetition_penalty=1.2,
    no_repeat_ngram_size=3,
    max_new_tokens=8192
)

# Input text
messages = [
    {"role": "user", "content": "Introduce yourself."},
]

# Apply the chat template
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = tokenizer(prompt, return_tensors="pt")

# Generation
with torch.no_grad():
    # Set the early-exit point:
    # - 11: first early-exit point, roughly 3B
    # - 15: second early-exit point, roughly 4B
    # - 19: third early-exit point, roughly 5B
    # - 23: fourth early-exit point, roughly 6B
    # - 27: fifth early-exit point, roughly 7B
    set_global_val("early_exit_point", 11)

    output = model.generate(
        inputs["input_ids"].to('cuda'),
        generation_config=generation_config
    )

# Decode and print the result
generated_text = tokenizer.decode(output[0], skip_special_tokens=False)
print(generated_text)
```

</details>
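As a follow-up, a short sketch (not part of the repository) that reuses the demo's objects to compare all five branches on the same prompt by resetting the early-exit point before each generation:

```py
# Assumes `model`, `tokenizer`, `inputs`, and `generation_config` from the
# demo above are already in scope.
import torch
from ruyi.global_var import set_global_val

for point in [11, 15, 19, 23, 27]:  # one pass per branch, small to large
    set_global_val("early_exit_point", point)
    with torch.no_grad():
        output = model.generate(inputs["input_ids"].to('cuda'),
                                generation_config=generation_config)
    print(f"--- early_exit_point={point} ---")
    print(tokenizer.decode(output[0], skip_special_tokens=True))
```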
## Citation

```bibtex
@misc{an2025aiflowperspectivesscenarios,
      title={AI Flow: Perspectives, Scenarios, and Approaches},
      author={Hongjun An and Wenhan Hu and Sida Huang and Siqi Huang and Ruanjun Li and Yuanzhi Liang and Jiawei Shao and Yiliang Song and Zihan Wang and Cheng Yuan and Chi Zhang and Hongyuan Zhang and Wenhao Zhuang and Xuelong Li},
      year={2025},
      eprint={2506.12479},
      archivePrefix={arXiv},
      primaryClass={cs.AI},
      url={https://arxiv.org/abs/2506.12479},
}
```
added_tokens.json
ADDED
@@ -0,0 +1,24 @@
{
  "</tool_call>": 151658,
  "<tool_call>": 151657,
  "<|box_end|>": 151649,
  "<|box_start|>": 151648,
  "<|endoftext|>": 151643,
  "<|file_sep|>": 151664,
  "<|fim_middle|>": 151660,
  "<|fim_pad|>": 151662,
  "<|fim_prefix|>": 151659,
  "<|fim_suffix|>": 151661,
  "<|im_end|>": 151645,
  "<|im_start|>": 151644,
  "<|image_pad|>": 151655,
  "<|object_ref_end|>": 151647,
  "<|object_ref_start|>": 151646,
  "<|quad_end|>": 151651,
  "<|quad_start|>": 151650,
  "<|repo_name|>": 151663,
  "<|video_pad|>": 151656,
  "<|vision_end|>": 151653,
  "<|vision_pad|>": 151654,
  "<|vision_start|>": 151652
}
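A quick sanity check (a sketch, assuming the weights from Step 4 of the README are in `models/AI-Flow-Ruyi-7B-0725`): the tokenizer resolves these added tokens to the ids listed above.

```py
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("models/AI-Flow-Ruyi-7B-0725", trust_remote_code=True)
# The chat-template delimiters map to the ids in added_tokens.json.
print(tokenizer.convert_tokens_to_ids(["<|im_start|>", "<|im_end|>"]))  # [151644, 151645]
```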
assets/AI-Flow-Ruyi-logo.png
ADDED
Stored with Git LFS

assets/ai-flow.png
ADDED
Stored with Git LFS

assets/logo.png
ADDED

assets/ruyi_model.png
ADDED
config.json
ADDED
@@ -0,0 +1,43 @@
{
  "architectures": [
    "RuyiQwen2ForCausalLM"
  ],
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "configuration_ruyi_qwen2.RuyiQwen2Config",
    "AutoModel": "modeling_ruyi_qwen2.RuyiQwen2Model",
    "AutoModelForCausalLM": "modeling_ruyi_qwen2.RuyiQwen2ForCausalLM"
  },
  "bos_token_id": 151643,
  "default_early_exit_point": 29,
  "early_exit_points": [
    11,
    15,
    19,
    23,
    27
  ],
  "eos_token_id": 151643,
  "hidden_act": "silu",
  "hidden_size": 3584,
  "initializer_range": 0.02,
  "intermediate_size": 18944,
  "max_position_embeddings": 131072,
  "max_window_layers": 28,
  "model_type": "ruyi_qwen2",
  "num_attention_heads": 28,
  "num_hidden_layers": 28,
  "num_key_value_heads": 4,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 1000000.0,
  "shared_heads": false,
  "sliding_window": 131072,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.51.3",
  "use_cache": true,
  "use_mrope": false,
  "use_sliding_window": false,
  "vocab_size": 152064
}
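Since the early-exit behavior is driven by the `early_exit_points` and `default_early_exit_point` fields above, they can be inspected without loading any weights; a sketch assuming the Step 4 download path:

```py
from transformers import AutoConfig

config = AutoConfig.from_pretrained("models/AI-Flow-Ruyi-7B-0725", trust_remote_code=True)
print(config.early_exit_points)         # [11, 15, 19, 23, 27]
print(config.default_early_exit_point)  # 29
```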
configuration.json
ADDED
@@ -0,0 +1 @@
{"framework":"Pytorch","task":"text-generation"}
configuration_ruyi_qwen2.py
ADDED
@@ -0,0 +1,119 @@
#!/usr/bin/env python
# Ref: https://github.com/huggingface/transformers/blob/v4.51.3/src/transformers/models/qwen2/configuration_qwen2.py
# Copyright (c) Institute of Artificial Intelligence (TeleAI), China Telecom, 2025. All Rights Reserved.
"""RuyiQwen2 model configuration"""

import os
import shutil

from transformers.configuration_utils import PretrainedConfig
from transformers.modeling_rope_utils import rope_config_validation
from transformers.utils import logging


logger = logging.get_logger(__name__)


class RuyiQwen2Config(PretrainedConfig):

    model_type = "ruyi_qwen2"
    keys_to_ignore_at_inference = ["past_key_values"]

    # Default tensor parallel plan for base model `RuyiQwen2`
    base_model_tp_plan = {
        "layers.*.self_attn.q_proj": "colwise",
        "layers.*.self_attn.k_proj": "colwise",
        "layers.*.self_attn.v_proj": "colwise",
        "layers.*.self_attn.o_proj": "rowwise",
        "layers.*.mlp.gate_proj": "colwise",
        "layers.*.mlp.up_proj": "colwise",
        "layers.*.mlp.down_proj": "rowwise",
        "eelayers.*.self_attn.q_proj": "colwise",
        "eelayers.*.self_attn.k_proj": "colwise",
        "eelayers.*.self_attn.v_proj": "colwise",
        "eelayers.*.self_attn.o_proj": "rowwise",
        "eelayers.*.mlp.gate_proj": "colwise",
        "eelayers.*.mlp.up_proj": "colwise",
        "eelayers.*.mlp.down_proj": "rowwise"
    }
    base_model_pp_plan = {
        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
        "eelayers": (["hidden_states", "attention_mask"], ["hidden_states"]),
        "norm": (["hidden_states"], ["hidden_states"]),
    }

    def __init__(
        self,
        vocab_size=151936,
        hidden_size=4096,
        intermediate_size=22016,
        num_hidden_layers=32,
        num_attention_heads=32,
        num_key_value_heads=32,
        hidden_act="silu",
        max_position_embeddings=32768,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        tie_word_embeddings=False,
        rope_theta=10000.0,
        rope_scaling=None,
        use_sliding_window=False,
        sliding_window=4096,
        max_window_layers=28,
        attention_dropout=0.0,

        shared_heads=False,
        default_early_exit_point=-1,  # [0, num_hidden_layers-1], -1 = num_hidden_layers - 1
        early_exit_points=list(range(1, 32, 2)),
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.use_sliding_window = use_sliding_window
        self.sliding_window = sliding_window  # we check `use_sliding_window` in the modeling code
        self.max_window_layers = max_window_layers

        # for backward compatibility
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        self.attention_dropout = attention_dropout
        # Validate the correctness of rotary position embeddings parameters
        # BC: if there is a 'type' field, move it to 'rope_type'.
        if self.rope_scaling is not None and "type" in self.rope_scaling:
            self.rope_scaling["rope_type"] = self.rope_scaling["type"]
        rope_config_validation(self)

        self.shared_heads = shared_heads
        self.default_early_exit_point = default_early_exit_point
        self.early_exit_points = early_exit_points
        self.auto_map = {
            "AutoConfig": "configuration_ruyi_qwen2.RuyiQwen2Config",
            "AutoModel": "modeling_ruyi_qwen2.RuyiQwen2Model",
            "AutoModelForCausalLM": "modeling_ruyi_qwen2.RuyiQwen2ForCausalLM"
        }

        super().__init__(
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )

    def save_pretrained(self, save_directory, **kwargs):
        # Also ship this module next to the saved config so that
        # trust_remote_code loading keeps working from the save directory.
        super().save_pretrained(save_directory, **kwargs)
        shutil.copyfile(
            os.path.abspath(__file__),
            os.path.join(save_directory, "configuration_ruyi_qwen2.py")
        )
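For illustration, a direct instantiation of this class with the values from config.json above (a sketch assuming the checkpoint directory is on `sys.path`; in practice the config is loaded via `AutoConfig.from_pretrained` with `trust_remote_code=True`):

```py
from configuration_ruyi_qwen2 import RuyiQwen2Config

config = RuyiQwen2Config(
    vocab_size=152064,
    hidden_size=3584,
    intermediate_size=18944,
    num_hidden_layers=28,
    num_attention_heads=28,
    num_key_value_heads=4,
    rope_theta=1000000.0,
    early_exit_points=[11, 15, 19, 23, 27],
    default_early_exit_point=29,
)
print(config.model_type)  # ruyi_qwen2
```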
generation_config.json
ADDED
@@ -0,0 +1,6 @@
{
  "_from_model_config": true,
  "bos_token_id": 151643,
  "eos_token_id": 151643,
  "transformers_version": "4.51.3"
}
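This file only pins the bos/eos token ids; the demo builds its own `GenerationConfig`, but the shipped defaults can also be loaded directly (a sketch, assuming the Step 4 download path):

```py
from transformers import GenerationConfig

gen_config = GenerationConfig.from_pretrained("models/AI-Flow-Ruyi-7B-0725")
print(gen_config.eos_token_id)  # 151643
```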
merges.txt
ADDED
The diff for this file is too large to render. See raw diff.
model-00001-of-00005.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:f291d6752ea2d02d22d079e6d418f15981ab41ab03793295c94701307a55b201
size 4877660776

model-00002-of-00005.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:4f3895ef7a5cff5133a1718322b65afec36f295e4c33d1714c9f205b1817b432
size 4932751008

model-00003-of-00005.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:930933e0a2e620b43396519a76252c82efd2b973df5a41b4f33481e290af15a7
size 4991495896

model-00004-of-00005.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e8234b0394a3d20b8a059452190346398c3703e50cafc51cd20e80bc3079b9f0
size 4473850832

model-00005-of-00005.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5a21aed606e27819560aae39827a08ac637b40a29e84d5d2bbea38465cf67309
size 2179989736
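Each pointer above carries the Git LFS spec version, the shard's SHA-256, and its byte size, which is enough to verify a downloaded shard locally. A minimal sketch using only the Python standard library (the path assumes the Step 4 download location):

```py
import hashlib

def verify_shard(path, expected_sha256, expected_size):
    """Stream the file and check it against the LFS pointer's oid and size."""
    h, size = hashlib.sha256(), 0
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            h.update(chunk)
            size += len(chunk)
    return h.hexdigest() == expected_sha256 and size == expected_size

print(verify_shard(
    "models/AI-Flow-Ruyi-7B-0725/model-00005-of-00005.safetensors",
    "5a21aed606e27819560aae39827a08ac637b40a29e84d5d2bbea38465cf67309",
    2179989736,
))
```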
model.safetensors.index.json
ADDED
@@ -0,0 +1,402 @@
{
  "metadata": {
    "total_size": 21455703040
  },
  "weight_map": {
    "lm_head.0.weight": "model-00004-of-00005.safetensors",
    "lm_head.1.weight": "model-00004-of-00005.safetensors",
    "lm_head.2.weight": "model-00004-of-00005.safetensors",
    "lm_head.3.weight": "model-00005-of-00005.safetensors",
    "lm_head.4.weight": "model-00005-of-00005.safetensors",
    "model.eelayers.0.input_layernorm.weight": "model-00003-of-00005.safetensors",
    "model.eelayers.0.mlp.down_proj.weight": "model-00003-of-00005.safetensors",
    "model.eelayers.0.mlp.gate_proj.weight": "model-00003-of-00005.safetensors",
    "model.eelayers.0.mlp.up_proj.weight": "model-00003-of-00005.safetensors",
    "model.eelayers.0.post_attention_layernorm.weight": "model-00003-of-00005.safetensors",
    "model.eelayers.0.self_attn.k_proj.bias": "model-00003-of-00005.safetensors",
    "model.eelayers.0.self_attn.k_proj.weight": "model-00003-of-00005.safetensors",
    "model.eelayers.0.self_attn.o_proj.weight": "model-00003-of-00005.safetensors",
    "model.eelayers.0.self_attn.q_proj.bias": "model-00003-of-00005.safetensors",
    "model.eelayers.0.self_attn.q_proj.weight": "model-00003-of-00005.safetensors",
    "model.eelayers.0.self_attn.v_proj.bias": "model-00003-of-00005.safetensors",
    "model.eelayers.0.self_attn.v_proj.weight": "model-00003-of-00005.safetensors",
    "model.eelayers.1.input_layernorm.weight": "model-00004-of-00005.safetensors",
    "model.eelayers.1.mlp.down_proj.weight": "model-00004-of-00005.safetensors",
    "model.eelayers.1.mlp.gate_proj.weight": "model-00003-of-00005.safetensors",
    "model.eelayers.1.mlp.up_proj.weight": "model-00004-of-00005.safetensors",
    "model.eelayers.1.post_attention_layernorm.weight": "model-00004-of-00005.safetensors",
    "model.eelayers.1.self_attn.k_proj.bias": "model-00003-of-00005.safetensors",
    "model.eelayers.1.self_attn.k_proj.weight": "model-00003-of-00005.safetensors",
    "model.eelayers.1.self_attn.o_proj.weight": "model-00003-of-00005.safetensors",
    "model.eelayers.1.self_attn.q_proj.bias": "model-00003-of-00005.safetensors",
    "model.eelayers.1.self_attn.q_proj.weight": "model-00003-of-00005.safetensors",
    "model.eelayers.1.self_attn.v_proj.bias": "model-00003-of-00005.safetensors",
    "model.eelayers.1.self_attn.v_proj.weight": "model-00003-of-00005.safetensors",
    "model.eelayers.2.input_layernorm.weight": "model-00004-of-00005.safetensors",
    "model.eelayers.2.mlp.down_proj.weight": "model-00004-of-00005.safetensors",
    "model.eelayers.2.mlp.gate_proj.weight": "model-00004-of-00005.safetensors",
    "model.eelayers.2.mlp.up_proj.weight": "model-00004-of-00005.safetensors",
    "model.eelayers.2.post_attention_layernorm.weight": "model-00004-of-00005.safetensors",
    "model.eelayers.2.self_attn.k_proj.bias": "model-00004-of-00005.safetensors",
    "model.eelayers.2.self_attn.k_proj.weight": "model-00004-of-00005.safetensors",
    "model.eelayers.2.self_attn.o_proj.weight": "model-00004-of-00005.safetensors",
    "model.eelayers.2.self_attn.q_proj.bias": "model-00004-of-00005.safetensors",
    "model.eelayers.2.self_attn.q_proj.weight": "model-00004-of-00005.safetensors",
    "model.eelayers.2.self_attn.v_proj.bias": "model-00004-of-00005.safetensors",
    "model.eelayers.2.self_attn.v_proj.weight": "model-00004-of-00005.safetensors",
    "model.eelayers.3.input_layernorm.weight": "model-00004-of-00005.safetensors",
    "model.eelayers.3.mlp.down_proj.weight": "model-00004-of-00005.safetensors",
    "model.eelayers.3.mlp.gate_proj.weight": "model-00004-of-00005.safetensors",
    "model.eelayers.3.mlp.up_proj.weight": "model-00004-of-00005.safetensors",
    "model.eelayers.3.post_attention_layernorm.weight": "model-00004-of-00005.safetensors",
    "model.eelayers.3.self_attn.k_proj.bias": "model-00004-of-00005.safetensors",
    "model.eelayers.3.self_attn.k_proj.weight": "model-00004-of-00005.safetensors",
    "model.eelayers.3.self_attn.o_proj.weight": "model-00004-of-00005.safetensors",
    "model.eelayers.3.self_attn.q_proj.bias": "model-00004-of-00005.safetensors",
    "model.eelayers.3.self_attn.q_proj.weight": "model-00004-of-00005.safetensors",
    "model.eelayers.3.self_attn.v_proj.bias": "model-00004-of-00005.safetensors",
    "model.eelayers.3.self_attn.v_proj.weight": "model-00004-of-00005.safetensors",
    "model.embed_tokens.weight": "model-00001-of-00005.safetensors",
    "model.layers.0.input_layernorm.weight": "model-00001-of-00005.safetensors",
    "model.layers.0.mlp.down_proj.weight": "model-00001-of-00005.safetensors",
    "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00005.safetensors",
    "model.layers.0.mlp.up_proj.weight": "model-00001-of-00005.safetensors",
    "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00005.safetensors",
    "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00005.safetensors",
    "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00005.safetensors",
    "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00005.safetensors",
    "model.layers.0.self_attn.q_proj.bias": "model-00001-of-00005.safetensors",
    "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00005.safetensors",
    "model.layers.0.self_attn.v_proj.bias": "model-00001-of-00005.safetensors",
    "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00005.safetensors",
    "model.layers.1.input_layernorm.weight": "model-00001-of-00005.safetensors",
    "model.layers.1.mlp.down_proj.weight": "model-00001-of-00005.safetensors",
    "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00005.safetensors",
    "model.layers.1.mlp.up_proj.weight": "model-00001-of-00005.safetensors",
    "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00005.safetensors",
    "model.layers.1.self_attn.k_proj.bias": "model-00001-of-00005.safetensors",
    "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00005.safetensors",
    "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00005.safetensors",
    "model.layers.1.self_attn.q_proj.bias": "model-00001-of-00005.safetensors",
    "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00005.safetensors",
    "model.layers.1.self_attn.v_proj.bias": "model-00001-of-00005.safetensors",
    "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00005.safetensors",
    "model.layers.10.input_layernorm.weight": "model-00002-of-00005.safetensors",
    "model.layers.10.mlp.down_proj.weight": "model-00002-of-00005.safetensors",
    "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00005.safetensors",
    "model.layers.10.mlp.up_proj.weight": "model-00002-of-00005.safetensors",
    "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00005.safetensors",
    "model.layers.10.self_attn.k_proj.bias": "model-00002-of-00005.safetensors",
    "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00005.safetensors",
    "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00005.safetensors",
    "model.layers.10.self_attn.q_proj.bias": "model-00002-of-00005.safetensors",
    "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00005.safetensors",
    "model.layers.10.self_attn.v_proj.bias": "model-00002-of-00005.safetensors",
    "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00005.safetensors",
    "model.layers.11.input_layernorm.weight": "model-00002-of-00005.safetensors",
    "model.layers.11.mlp.down_proj.weight": "model-00002-of-00005.safetensors",
    "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00005.safetensors",
    "model.layers.11.mlp.up_proj.weight": "model-00002-of-00005.safetensors",
    "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00005.safetensors",
    "model.layers.11.self_attn.k_proj.bias": "model-00002-of-00005.safetensors",
    "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00005.safetensors",
    "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00005.safetensors",
    "model.layers.11.self_attn.q_proj.bias": "model-00002-of-00005.safetensors",
    "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00005.safetensors",
    "model.layers.11.self_attn.v_proj.bias": "model-00002-of-00005.safetensors",
    "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00005.safetensors",
    "model.layers.12.input_layernorm.weight": "model-00002-of-00005.safetensors",
    "model.layers.12.mlp.down_proj.weight": "model-00002-of-00005.safetensors",
    "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00005.safetensors",
    "model.layers.12.mlp.up_proj.weight": "model-00002-of-00005.safetensors",
    "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00005.safetensors",
    "model.layers.12.self_attn.k_proj.bias": "model-00002-of-00005.safetensors",
    "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00005.safetensors",
    "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00005.safetensors",
    "model.layers.12.self_attn.q_proj.bias": "model-00002-of-00005.safetensors",
    "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00005.safetensors",
    "model.layers.12.self_attn.v_proj.bias": "model-00002-of-00005.safetensors",
    "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00005.safetensors",
    "model.layers.13.input_layernorm.weight": "model-00002-of-00005.safetensors",
    "model.layers.13.mlp.down_proj.weight": "model-00002-of-00005.safetensors",
    "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00005.safetensors",
    "model.layers.13.mlp.up_proj.weight": "model-00002-of-00005.safetensors",
    "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00005.safetensors",
    "model.layers.13.self_attn.k_proj.bias": "model-00002-of-00005.safetensors",
    "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00005.safetensors",
    "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00005.safetensors",
|
| 128 |
+
"model.layers.13.self_attn.q_proj.bias": "model-00002-of-00005.safetensors",
|
| 129 |
+
"model.layers.13.self_attn.q_proj.weight": "model-00002-of-00005.safetensors",
|
| 130 |
+
"model.layers.13.self_attn.v_proj.bias": "model-00002-of-00005.safetensors",
|
| 131 |
+
"model.layers.13.self_attn.v_proj.weight": "model-00002-of-00005.safetensors",
|
| 132 |
+
"model.layers.14.input_layernorm.weight": "model-00002-of-00005.safetensors",
|
| 133 |
+
"model.layers.14.mlp.down_proj.weight": "model-00002-of-00005.safetensors",
|
| 134 |
+
"model.layers.14.mlp.gate_proj.weight": "model-00002-of-00005.safetensors",
|
| 135 |
+
"model.layers.14.mlp.up_proj.weight": "model-00002-of-00005.safetensors",
|
| 136 |
+
"model.layers.14.post_attention_layernorm.weight": "model-00002-of-00005.safetensors",
|
| 137 |
+
"model.layers.14.self_attn.k_proj.bias": "model-00002-of-00005.safetensors",
|
| 138 |
+
"model.layers.14.self_attn.k_proj.weight": "model-00002-of-00005.safetensors",
|
| 139 |
+
"model.layers.14.self_attn.o_proj.weight": "model-00002-of-00005.safetensors",
|
| 140 |
+
"model.layers.14.self_attn.q_proj.bias": "model-00002-of-00005.safetensors",
|
| 141 |
+
"model.layers.14.self_attn.q_proj.weight": "model-00002-of-00005.safetensors",
|
| 142 |
+
"model.layers.14.self_attn.v_proj.bias": "model-00002-of-00005.safetensors",
|
| 143 |
+
"model.layers.14.self_attn.v_proj.weight": "model-00002-of-00005.safetensors",
|
| 144 |
+
"model.layers.15.input_layernorm.weight": "model-00002-of-00005.safetensors",
|
| 145 |
+
"model.layers.15.mlp.down_proj.weight": "model-00002-of-00005.safetensors",
|
| 146 |
+
"model.layers.15.mlp.gate_proj.weight": "model-00002-of-00005.safetensors",
|
| 147 |
+
"model.layers.15.mlp.up_proj.weight": "model-00002-of-00005.safetensors",
|
| 148 |
+
"model.layers.15.post_attention_layernorm.weight": "model-00002-of-00005.safetensors",
|
| 149 |
+
"model.layers.15.self_attn.k_proj.bias": "model-00002-of-00005.safetensors",
|
| 150 |
+
"model.layers.15.self_attn.k_proj.weight": "model-00002-of-00005.safetensors",
|
| 151 |
+
"model.layers.15.self_attn.o_proj.weight": "model-00002-of-00005.safetensors",
|
| 152 |
+
"model.layers.15.self_attn.q_proj.bias": "model-00002-of-00005.safetensors",
|
| 153 |
+
"model.layers.15.self_attn.q_proj.weight": "model-00002-of-00005.safetensors",
|
| 154 |
+
"model.layers.15.self_attn.v_proj.bias": "model-00002-of-00005.safetensors",
|
| 155 |
+
"model.layers.15.self_attn.v_proj.weight": "model-00002-of-00005.safetensors",
|
| 156 |
+
"model.layers.16.input_layernorm.weight": "model-00002-of-00005.safetensors",
|
| 157 |
+
"model.layers.16.mlp.down_proj.weight": "model-00002-of-00005.safetensors",
|
| 158 |
+
"model.layers.16.mlp.gate_proj.weight": "model-00002-of-00005.safetensors",
|
| 159 |
+
"model.layers.16.mlp.up_proj.weight": "model-00002-of-00005.safetensors",
|
| 160 |
+
"model.layers.16.post_attention_layernorm.weight": "model-00002-of-00005.safetensors",
|
| 161 |
+
"model.layers.16.self_attn.k_proj.bias": "model-00002-of-00005.safetensors",
|
| 162 |
+
"model.layers.16.self_attn.k_proj.weight": "model-00002-of-00005.safetensors",
|
| 163 |
+
"model.layers.16.self_attn.o_proj.weight": "model-00002-of-00005.safetensors",
|
| 164 |
+
"model.layers.16.self_attn.q_proj.bias": "model-00002-of-00005.safetensors",
|
| 165 |
+
"model.layers.16.self_attn.q_proj.weight": "model-00002-of-00005.safetensors",
|
| 166 |
+
"model.layers.16.self_attn.v_proj.bias": "model-00002-of-00005.safetensors",
|
| 167 |
+
"model.layers.16.self_attn.v_proj.weight": "model-00002-of-00005.safetensors",
|
| 168 |
+
"model.layers.17.input_layernorm.weight": "model-00002-of-00005.safetensors",
|
| 169 |
+
"model.layers.17.mlp.down_proj.weight": "model-00002-of-00005.safetensors",
|
| 170 |
+
"model.layers.17.mlp.gate_proj.weight": "model-00002-of-00005.safetensors",
|
| 171 |
+
"model.layers.17.mlp.up_proj.weight": "model-00002-of-00005.safetensors",
|
| 172 |
+
"model.layers.17.post_attention_layernorm.weight": "model-00002-of-00005.safetensors",
|
| 173 |
+
"model.layers.17.self_attn.k_proj.bias": "model-00002-of-00005.safetensors",
|
| 174 |
+
"model.layers.17.self_attn.k_proj.weight": "model-00002-of-00005.safetensors",
|
| 175 |
+
"model.layers.17.self_attn.o_proj.weight": "model-00002-of-00005.safetensors",
|
| 176 |
+
"model.layers.17.self_attn.q_proj.bias": "model-00002-of-00005.safetensors",
|
| 177 |
+
"model.layers.17.self_attn.q_proj.weight": "model-00002-of-00005.safetensors",
|
| 178 |
+
"model.layers.17.self_attn.v_proj.bias": "model-00002-of-00005.safetensors",
|
| 179 |
+
"model.layers.17.self_attn.v_proj.weight": "model-00002-of-00005.safetensors",
|
| 180 |
+
"model.layers.18.input_layernorm.weight": "model-00003-of-00005.safetensors",
|
| 181 |
+
"model.layers.18.mlp.down_proj.weight": "model-00003-of-00005.safetensors",
|
| 182 |
+
"model.layers.18.mlp.gate_proj.weight": "model-00002-of-00005.safetensors",
|
| 183 |
+
"model.layers.18.mlp.up_proj.weight": "model-00002-of-00005.safetensors",
|
| 184 |
+
"model.layers.18.post_attention_layernorm.weight": "model-00003-of-00005.safetensors",
|
| 185 |
+
"model.layers.18.self_attn.k_proj.bias": "model-00002-of-00005.safetensors",
|
| 186 |
+
"model.layers.18.self_attn.k_proj.weight": "model-00002-of-00005.safetensors",
|
| 187 |
+
"model.layers.18.self_attn.o_proj.weight": "model-00002-of-00005.safetensors",
|
| 188 |
+
"model.layers.18.self_attn.q_proj.bias": "model-00002-of-00005.safetensors",
|
| 189 |
+
"model.layers.18.self_attn.q_proj.weight": "model-00002-of-00005.safetensors",
|
| 190 |
+
"model.layers.18.self_attn.v_proj.bias": "model-00002-of-00005.safetensors",
|
| 191 |
+
"model.layers.18.self_attn.v_proj.weight": "model-00002-of-00005.safetensors",
|
| 192 |
+
"model.layers.19.input_layernorm.weight": "model-00003-of-00005.safetensors",
|
| 193 |
+
"model.layers.19.mlp.down_proj.weight": "model-00003-of-00005.safetensors",
|
| 194 |
+
"model.layers.19.mlp.gate_proj.weight": "model-00003-of-00005.safetensors",
|
| 195 |
+
"model.layers.19.mlp.up_proj.weight": "model-00003-of-00005.safetensors",
|
| 196 |
+
"model.layers.19.post_attention_layernorm.weight": "model-00003-of-00005.safetensors",
|
| 197 |
+
"model.layers.19.self_attn.k_proj.bias": "model-00003-of-00005.safetensors",
|
| 198 |
+
"model.layers.19.self_attn.k_proj.weight": "model-00003-of-00005.safetensors",
|
| 199 |
+
"model.layers.19.self_attn.o_proj.weight": "model-00003-of-00005.safetensors",
|
| 200 |
+
"model.layers.19.self_attn.q_proj.bias": "model-00003-of-00005.safetensors",
|
| 201 |
+
"model.layers.19.self_attn.q_proj.weight": "model-00003-of-00005.safetensors",
|
| 202 |
+
"model.layers.19.self_attn.v_proj.bias": "model-00003-of-00005.safetensors",
|
| 203 |
+
"model.layers.19.self_attn.v_proj.weight": "model-00003-of-00005.safetensors",
|
| 204 |
+
"model.layers.2.input_layernorm.weight": "model-00001-of-00005.safetensors",
|
| 205 |
+
"model.layers.2.mlp.down_proj.weight": "model-00001-of-00005.safetensors",
|
| 206 |
+
"model.layers.2.mlp.gate_proj.weight": "model-00001-of-00005.safetensors",
|
| 207 |
+
"model.layers.2.mlp.up_proj.weight": "model-00001-of-00005.safetensors",
|
| 208 |
+
"model.layers.2.post_attention_layernorm.weight": "model-00001-of-00005.safetensors",
|
| 209 |
+
"model.layers.2.self_attn.k_proj.bias": "model-00001-of-00005.safetensors",
|
| 210 |
+
"model.layers.2.self_attn.k_proj.weight": "model-00001-of-00005.safetensors",
|
| 211 |
+
"model.layers.2.self_attn.o_proj.weight": "model-00001-of-00005.safetensors",
|
| 212 |
+
"model.layers.2.self_attn.q_proj.bias": "model-00001-of-00005.safetensors",
|
| 213 |
+
"model.layers.2.self_attn.q_proj.weight": "model-00001-of-00005.safetensors",
|
| 214 |
+
"model.layers.2.self_attn.v_proj.bias": "model-00001-of-00005.safetensors",
|
| 215 |
+
"model.layers.2.self_attn.v_proj.weight": "model-00001-of-00005.safetensors",
|
| 216 |
+
"model.layers.20.input_layernorm.weight": "model-00003-of-00005.safetensors",
|
| 217 |
+
"model.layers.20.mlp.down_proj.weight": "model-00003-of-00005.safetensors",
|
| 218 |
+
"model.layers.20.mlp.gate_proj.weight": "model-00003-of-00005.safetensors",
|
| 219 |
+
"model.layers.20.mlp.up_proj.weight": "model-00003-of-00005.safetensors",
|
| 220 |
+
"model.layers.20.post_attention_layernorm.weight": "model-00003-of-00005.safetensors",
|
| 221 |
+
"model.layers.20.self_attn.k_proj.bias": "model-00003-of-00005.safetensors",
|
| 222 |
+
"model.layers.20.self_attn.k_proj.weight": "model-00003-of-00005.safetensors",
|
| 223 |
+
"model.layers.20.self_attn.o_proj.weight": "model-00003-of-00005.safetensors",
|
| 224 |
+
"model.layers.20.self_attn.q_proj.bias": "model-00003-of-00005.safetensors",
|
| 225 |
+
"model.layers.20.self_attn.q_proj.weight": "model-00003-of-00005.safetensors",
|
| 226 |
+
"model.layers.20.self_attn.v_proj.bias": "model-00003-of-00005.safetensors",
|
| 227 |
+
"model.layers.20.self_attn.v_proj.weight": "model-00003-of-00005.safetensors",
|
| 228 |
+
"model.layers.21.input_layernorm.weight": "model-00003-of-00005.safetensors",
|
| 229 |
+
"model.layers.21.mlp.down_proj.weight": "model-00003-of-00005.safetensors",
|
| 230 |
+
"model.layers.21.mlp.gate_proj.weight": "model-00003-of-00005.safetensors",
|
| 231 |
+
"model.layers.21.mlp.up_proj.weight": "model-00003-of-00005.safetensors",
|
| 232 |
+
"model.layers.21.post_attention_layernorm.weight": "model-00003-of-00005.safetensors",
|
| 233 |
+
"model.layers.21.self_attn.k_proj.bias": "model-00003-of-00005.safetensors",
|
| 234 |
+
"model.layers.21.self_attn.k_proj.weight": "model-00003-of-00005.safetensors",
|
| 235 |
+
"model.layers.21.self_attn.o_proj.weight": "model-00003-of-00005.safetensors",
|
| 236 |
+
"model.layers.21.self_attn.q_proj.bias": "model-00003-of-00005.safetensors",
|
| 237 |
+
"model.layers.21.self_attn.q_proj.weight": "model-00003-of-00005.safetensors",
|
| 238 |
+
"model.layers.21.self_attn.v_proj.bias": "model-00003-of-00005.safetensors",
|
| 239 |
+
"model.layers.21.self_attn.v_proj.weight": "model-00003-of-00005.safetensors",
|
| 240 |
+
"model.layers.22.input_layernorm.weight": "model-00003-of-00005.safetensors",
|
| 241 |
+
"model.layers.22.mlp.down_proj.weight": "model-00003-of-00005.safetensors",
|
| 242 |
+
"model.layers.22.mlp.gate_proj.weight": "model-00003-of-00005.safetensors",
|
| 243 |
+
"model.layers.22.mlp.up_proj.weight": "model-00003-of-00005.safetensors",
|
| 244 |
+
"model.layers.22.post_attention_layernorm.weight": "model-00003-of-00005.safetensors",
|
| 245 |
+
"model.layers.22.self_attn.k_proj.bias": "model-00003-of-00005.safetensors",
|
| 246 |
+
"model.layers.22.self_attn.k_proj.weight": "model-00003-of-00005.safetensors",
|
| 247 |
+
"model.layers.22.self_attn.o_proj.weight": "model-00003-of-00005.safetensors",
|
| 248 |
+
"model.layers.22.self_attn.q_proj.bias": "model-00003-of-00005.safetensors",
|
| 249 |
+
"model.layers.22.self_attn.q_proj.weight": "model-00003-of-00005.safetensors",
|
| 250 |
+
"model.layers.22.self_attn.v_proj.bias": "model-00003-of-00005.safetensors",
|
| 251 |
+
"model.layers.22.self_attn.v_proj.weight": "model-00003-of-00005.safetensors",
|
| 252 |
+
"model.layers.23.input_layernorm.weight": "model-00003-of-00005.safetensors",
|
| 253 |
+
"model.layers.23.mlp.down_proj.weight": "model-00003-of-00005.safetensors",
|
| 254 |
+
"model.layers.23.mlp.gate_proj.weight": "model-00003-of-00005.safetensors",
|
| 255 |
+
"model.layers.23.mlp.up_proj.weight": "model-00003-of-00005.safetensors",
|
| 256 |
+
"model.layers.23.post_attention_layernorm.weight": "model-00003-of-00005.safetensors",
|
| 257 |
+
"model.layers.23.self_attn.k_proj.bias": "model-00003-of-00005.safetensors",
|
| 258 |
+
"model.layers.23.self_attn.k_proj.weight": "model-00003-of-00005.safetensors",
|
| 259 |
+
"model.layers.23.self_attn.o_proj.weight": "model-00003-of-00005.safetensors",
|
| 260 |
+
"model.layers.23.self_attn.q_proj.bias": "model-00003-of-00005.safetensors",
|
| 261 |
+
"model.layers.23.self_attn.q_proj.weight": "model-00003-of-00005.safetensors",
|
| 262 |
+
"model.layers.23.self_attn.v_proj.bias": "model-00003-of-00005.safetensors",
|
| 263 |
+
"model.layers.23.self_attn.v_proj.weight": "model-00003-of-00005.safetensors",
|
| 264 |
+
"model.layers.24.input_layernorm.weight": "model-00003-of-00005.safetensors",
|
| 265 |
+
"model.layers.24.mlp.down_proj.weight": "model-00003-of-00005.safetensors",
|
| 266 |
+
"model.layers.24.mlp.gate_proj.weight": "model-00003-of-00005.safetensors",
|
| 267 |
+
"model.layers.24.mlp.up_proj.weight": "model-00003-of-00005.safetensors",
|
| 268 |
+
"model.layers.24.post_attention_layernorm.weight": "model-00003-of-00005.safetensors",
|
| 269 |
+
"model.layers.24.self_attn.k_proj.bias": "model-00003-of-00005.safetensors",
|
| 270 |
+
"model.layers.24.self_attn.k_proj.weight": "model-00003-of-00005.safetensors",
|
| 271 |
+
"model.layers.24.self_attn.o_proj.weight": "model-00003-of-00005.safetensors",
|
| 272 |
+
"model.layers.24.self_attn.q_proj.bias": "model-00003-of-00005.safetensors",
|
| 273 |
+
"model.layers.24.self_attn.q_proj.weight": "model-00003-of-00005.safetensors",
|
| 274 |
+
"model.layers.24.self_attn.v_proj.bias": "model-00003-of-00005.safetensors",
|
| 275 |
+
"model.layers.24.self_attn.v_proj.weight": "model-00003-of-00005.safetensors",
|
| 276 |
+
"model.layers.25.input_layernorm.weight": "model-00003-of-00005.safetensors",
|
| 277 |
+
"model.layers.25.mlp.down_proj.weight": "model-00003-of-00005.safetensors",
|
| 278 |
+
"model.layers.25.mlp.gate_proj.weight": "model-00003-of-00005.safetensors",
|
| 279 |
+
"model.layers.25.mlp.up_proj.weight": "model-00003-of-00005.safetensors",
|
| 280 |
+
"model.layers.25.post_attention_layernorm.weight": "model-00003-of-00005.safetensors",
|
| 281 |
+
"model.layers.25.self_attn.k_proj.bias": "model-00003-of-00005.safetensors",
|
| 282 |
+
"model.layers.25.self_attn.k_proj.weight": "model-00003-of-00005.safetensors",
|
| 283 |
+
"model.layers.25.self_attn.o_proj.weight": "model-00003-of-00005.safetensors",
|
| 284 |
+
"model.layers.25.self_attn.q_proj.bias": "model-00003-of-00005.safetensors",
|
| 285 |
+
"model.layers.25.self_attn.q_proj.weight": "model-00003-of-00005.safetensors",
|
| 286 |
+
"model.layers.25.self_attn.v_proj.bias": "model-00003-of-00005.safetensors",
|
| 287 |
+
"model.layers.25.self_attn.v_proj.weight": "model-00003-of-00005.safetensors",
|
| 288 |
+
"model.layers.26.input_layernorm.weight": "model-00003-of-00005.safetensors",
|
| 289 |
+
"model.layers.26.mlp.down_proj.weight": "model-00003-of-00005.safetensors",
|
| 290 |
+
"model.layers.26.mlp.gate_proj.weight": "model-00003-of-00005.safetensors",
|
| 291 |
+
"model.layers.26.mlp.up_proj.weight": "model-00003-of-00005.safetensors",
|
| 292 |
+
"model.layers.26.post_attention_layernorm.weight": "model-00003-of-00005.safetensors",
|
| 293 |
+
"model.layers.26.self_attn.k_proj.bias": "model-00003-of-00005.safetensors",
|
| 294 |
+
"model.layers.26.self_attn.k_proj.weight": "model-00003-of-00005.safetensors",
|
| 295 |
+
"model.layers.26.self_attn.o_proj.weight": "model-00003-of-00005.safetensors",
|
| 296 |
+
"model.layers.26.self_attn.q_proj.bias": "model-00003-of-00005.safetensors",
|
| 297 |
+
"model.layers.26.self_attn.q_proj.weight": "model-00003-of-00005.safetensors",
|
| 298 |
+
"model.layers.26.self_attn.v_proj.bias": "model-00003-of-00005.safetensors",
|
| 299 |
+
"model.layers.26.self_attn.v_proj.weight": "model-00003-of-00005.safetensors",
|
| 300 |
+
"model.layers.27.input_layernorm.weight": "model-00003-of-00005.safetensors",
|
| 301 |
+
"model.layers.27.mlp.down_proj.weight": "model-00003-of-00005.safetensors",
|
| 302 |
+
"model.layers.27.mlp.gate_proj.weight": "model-00003-of-00005.safetensors",
|
| 303 |
+
"model.layers.27.mlp.up_proj.weight": "model-00003-of-00005.safetensors",
|
| 304 |
+
"model.layers.27.post_attention_layernorm.weight": "model-00003-of-00005.safetensors",
|
| 305 |
+
"model.layers.27.self_attn.k_proj.bias": "model-00003-of-00005.safetensors",
|
| 306 |
+
"model.layers.27.self_attn.k_proj.weight": "model-00003-of-00005.safetensors",
|
| 307 |
+
"model.layers.27.self_attn.o_proj.weight": "model-00003-of-00005.safetensors",
|
| 308 |
+
"model.layers.27.self_attn.q_proj.bias": "model-00003-of-00005.safetensors",
|
| 309 |
+
"model.layers.27.self_attn.q_proj.weight": "model-00003-of-00005.safetensors",
|
| 310 |
+
"model.layers.27.self_attn.v_proj.bias": "model-00003-of-00005.safetensors",
|
| 311 |
+
"model.layers.27.self_attn.v_proj.weight": "model-00003-of-00005.safetensors",
|
| 312 |
+
"model.layers.3.input_layernorm.weight": "model-00001-of-00005.safetensors",
|
| 313 |
+
"model.layers.3.mlp.down_proj.weight": "model-00001-of-00005.safetensors",
|
| 314 |
+
"model.layers.3.mlp.gate_proj.weight": "model-00001-of-00005.safetensors",
|
| 315 |
+
"model.layers.3.mlp.up_proj.weight": "model-00001-of-00005.safetensors",
|
| 316 |
+
"model.layers.3.post_attention_layernorm.weight": "model-00001-of-00005.safetensors",
|
| 317 |
+
"model.layers.3.self_attn.k_proj.bias": "model-00001-of-00005.safetensors",
|
| 318 |
+
"model.layers.3.self_attn.k_proj.weight": "model-00001-of-00005.safetensors",
|
| 319 |
+
"model.layers.3.self_attn.o_proj.weight": "model-00001-of-00005.safetensors",
|
| 320 |
+
"model.layers.3.self_attn.q_proj.bias": "model-00001-of-00005.safetensors",
|
| 321 |
+
"model.layers.3.self_attn.q_proj.weight": "model-00001-of-00005.safetensors",
|
| 322 |
+
"model.layers.3.self_attn.v_proj.bias": "model-00001-of-00005.safetensors",
|
| 323 |
+
"model.layers.3.self_attn.v_proj.weight": "model-00001-of-00005.safetensors",
|
| 324 |
+
"model.layers.4.input_layernorm.weight": "model-00001-of-00005.safetensors",
|
| 325 |
+
"model.layers.4.mlp.down_proj.weight": "model-00001-of-00005.safetensors",
|
| 326 |
+
"model.layers.4.mlp.gate_proj.weight": "model-00001-of-00005.safetensors",
|
| 327 |
+
"model.layers.4.mlp.up_proj.weight": "model-00001-of-00005.safetensors",
|
| 328 |
+
"model.layers.4.post_attention_layernorm.weight": "model-00001-of-00005.safetensors",
|
| 329 |
+
"model.layers.4.self_attn.k_proj.bias": "model-00001-of-00005.safetensors",
|
| 330 |
+
"model.layers.4.self_attn.k_proj.weight": "model-00001-of-00005.safetensors",
|
| 331 |
+
"model.layers.4.self_attn.o_proj.weight": "model-00001-of-00005.safetensors",
|
| 332 |
+
"model.layers.4.self_attn.q_proj.bias": "model-00001-of-00005.safetensors",
|
| 333 |
+
"model.layers.4.self_attn.q_proj.weight": "model-00001-of-00005.safetensors",
|
| 334 |
+
"model.layers.4.self_attn.v_proj.bias": "model-00001-of-00005.safetensors",
|
| 335 |
+
"model.layers.4.self_attn.v_proj.weight": "model-00001-of-00005.safetensors",
|
| 336 |
+
"model.layers.5.input_layernorm.weight": "model-00001-of-00005.safetensors",
|
| 337 |
+
"model.layers.5.mlp.down_proj.weight": "model-00001-of-00005.safetensors",
|
| 338 |
+
"model.layers.5.mlp.gate_proj.weight": "model-00001-of-00005.safetensors",
|
| 339 |
+
"model.layers.5.mlp.up_proj.weight": "model-00001-of-00005.safetensors",
|
| 340 |
+
"model.layers.5.post_attention_layernorm.weight": "model-00001-of-00005.safetensors",
|
| 341 |
+
"model.layers.5.self_attn.k_proj.bias": "model-00001-of-00005.safetensors",
|
| 342 |
+
"model.layers.5.self_attn.k_proj.weight": "model-00001-of-00005.safetensors",
|
| 343 |
+
"model.layers.5.self_attn.o_proj.weight": "model-00001-of-00005.safetensors",
|
| 344 |
+
"model.layers.5.self_attn.q_proj.bias": "model-00001-of-00005.safetensors",
|
| 345 |
+
"model.layers.5.self_attn.q_proj.weight": "model-00001-of-00005.safetensors",
|
| 346 |
+
"model.layers.5.self_attn.v_proj.bias": "model-00001-of-00005.safetensors",
|
| 347 |
+
"model.layers.5.self_attn.v_proj.weight": "model-00001-of-00005.safetensors",
|
| 348 |
+
"model.layers.6.input_layernorm.weight": "model-00001-of-00005.safetensors",
|
| 349 |
+
"model.layers.6.mlp.down_proj.weight": "model-00001-of-00005.safetensors",
|
| 350 |
+
"model.layers.6.mlp.gate_proj.weight": "model-00001-of-00005.safetensors",
|
| 351 |
+
"model.layers.6.mlp.up_proj.weight": "model-00001-of-00005.safetensors",
|
| 352 |
+
"model.layers.6.post_attention_layernorm.weight": "model-00001-of-00005.safetensors",
|
| 353 |
+
"model.layers.6.self_attn.k_proj.bias": "model-00001-of-00005.safetensors",
|
| 354 |
+
"model.layers.6.self_attn.k_proj.weight": "model-00001-of-00005.safetensors",
|
| 355 |
+
"model.layers.6.self_attn.o_proj.weight": "model-00001-of-00005.safetensors",
|
| 356 |
+
"model.layers.6.self_attn.q_proj.bias": "model-00001-of-00005.safetensors",
|
| 357 |
+
"model.layers.6.self_attn.q_proj.weight": "model-00001-of-00005.safetensors",
|
| 358 |
+
"model.layers.6.self_attn.v_proj.bias": "model-00001-of-00005.safetensors",
|
| 359 |
+
"model.layers.6.self_attn.v_proj.weight": "model-00001-of-00005.safetensors",
|
| 360 |
+
"model.layers.7.input_layernorm.weight": "model-00001-of-00005.safetensors",
|
| 361 |
+
"model.layers.7.mlp.down_proj.weight": "model-00001-of-00005.safetensors",
|
| 362 |
+
"model.layers.7.mlp.gate_proj.weight": "model-00001-of-00005.safetensors",
|
| 363 |
+
"model.layers.7.mlp.up_proj.weight": "model-00001-of-00005.safetensors",
|
| 364 |
+
"model.layers.7.post_attention_layernorm.weight": "model-00001-of-00005.safetensors",
|
| 365 |
+
"model.layers.7.self_attn.k_proj.bias": "model-00001-of-00005.safetensors",
|
| 366 |
+
"model.layers.7.self_attn.k_proj.weight": "model-00001-of-00005.safetensors",
|
| 367 |
+
"model.layers.7.self_attn.o_proj.weight": "model-00001-of-00005.safetensors",
|
| 368 |
+
"model.layers.7.self_attn.q_proj.bias": "model-00001-of-00005.safetensors",
|
| 369 |
+
"model.layers.7.self_attn.q_proj.weight": "model-00001-of-00005.safetensors",
|
| 370 |
+
"model.layers.7.self_attn.v_proj.bias": "model-00001-of-00005.safetensors",
|
| 371 |
+
"model.layers.7.self_attn.v_proj.weight": "model-00001-of-00005.safetensors",
|
| 372 |
+
"model.layers.8.input_layernorm.weight": "model-00002-of-00005.safetensors",
|
| 373 |
+
"model.layers.8.mlp.down_proj.weight": "model-00002-of-00005.safetensors",
|
| 374 |
+
"model.layers.8.mlp.gate_proj.weight": "model-00002-of-00005.safetensors",
|
| 375 |
+
"model.layers.8.mlp.up_proj.weight": "model-00002-of-00005.safetensors",
|
| 376 |
+
"model.layers.8.post_attention_layernorm.weight": "model-00002-of-00005.safetensors",
|
| 377 |
+
"model.layers.8.self_attn.k_proj.bias": "model-00001-of-00005.safetensors",
|
| 378 |
+
"model.layers.8.self_attn.k_proj.weight": "model-00001-of-00005.safetensors",
|
| 379 |
+
"model.layers.8.self_attn.o_proj.weight": "model-00001-of-00005.safetensors",
|
| 380 |
+
"model.layers.8.self_attn.q_proj.bias": "model-00001-of-00005.safetensors",
|
| 381 |
+
"model.layers.8.self_attn.q_proj.weight": "model-00001-of-00005.safetensors",
|
| 382 |
+
"model.layers.8.self_attn.v_proj.bias": "model-00001-of-00005.safetensors",
|
| 383 |
+
"model.layers.8.self_attn.v_proj.weight": "model-00001-of-00005.safetensors",
|
| 384 |
+
"model.layers.9.input_layernorm.weight": "model-00002-of-00005.safetensors",
|
| 385 |
+
"model.layers.9.mlp.down_proj.weight": "model-00002-of-00005.safetensors",
|
| 386 |
+
"model.layers.9.mlp.gate_proj.weight": "model-00002-of-00005.safetensors",
|
| 387 |
+
"model.layers.9.mlp.up_proj.weight": "model-00002-of-00005.safetensors",
|
| 388 |
+
"model.layers.9.post_attention_layernorm.weight": "model-00002-of-00005.safetensors",
|
| 389 |
+
"model.layers.9.self_attn.k_proj.bias": "model-00002-of-00005.safetensors",
|
| 390 |
+
"model.layers.9.self_attn.k_proj.weight": "model-00002-of-00005.safetensors",
|
| 391 |
+
"model.layers.9.self_attn.o_proj.weight": "model-00002-of-00005.safetensors",
|
| 392 |
+
"model.layers.9.self_attn.q_proj.bias": "model-00002-of-00005.safetensors",
|
| 393 |
+
"model.layers.9.self_attn.q_proj.weight": "model-00002-of-00005.safetensors",
|
| 394 |
+
"model.layers.9.self_attn.v_proj.bias": "model-00002-of-00005.safetensors",
|
| 395 |
+
"model.layers.9.self_attn.v_proj.weight": "model-00002-of-00005.safetensors",
|
| 396 |
+
"model.norms.0.weight": "model-00004-of-00005.safetensors",
|
| 397 |
+
"model.norms.1.weight": "model-00004-of-00005.safetensors",
|
| 398 |
+
"model.norms.2.weight": "model-00004-of-00005.safetensors",
|
| 399 |
+
"model.norms.3.weight": "model-00004-of-00005.safetensors",
|
| 400 |
+
"model.norms.4.weight": "model-00004-of-00005.safetensors"
|
| 401 |
+
}
|
| 402 |
+
}
|
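The map above is the tail of the standard safetensors shard index: each parameter name points at the shard file that stores it. The Ruyi-specific entries are the `model.eelayers.*` early-exit decoder layers and the per-exit `model.norms.*` weights, which here resolve to `model-00004-of-00005.safetensors`. A minimal sketch of resolving one tensor through this index; the local snapshot path is a placeholder:

import json
from safetensors import safe_open

repo_dir = "./Ruyi"  # hypothetical local snapshot of this repository

with open(f"{repo_dir}/model.safetensors.index.json") as f:
    index = json.load(f)

# weight_map: parameter name -> shard file, exactly as listed above.
name = "model.eelayers.3.self_attn.q_proj.weight"
shard = index["weight_map"][name]  # "model-00004-of-00005.safetensors"

# safetensors can read a single tensor without loading the whole shard.
with safe_open(f"{repo_dir}/{shard}", framework="pt") as sf:
    tensor = sf.get_tensor(name)
print(shard, tuple(tensor.shape))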
modeling_ruyi_qwen2.py ADDED
@@ -0,0 +1,782 @@
#!/usr/bin/env python
# Ref: https://github.com/huggingface/transformers/blob/v4.51.3/src/transformers/models/qwen2/modeling_qwen2.py
# Copyright (c) Institute of Artificial Intelligence (TeleAI), China Telecom, 2025. All Rights Reserved.
"""RuyiQwen2 model"""

import os
import shutil

from functools import partial
from typing import Callable, Optional, Tuple, Union
from itertools import chain

import torch
from torch import nn

from transformers.activations import ACT2FN
from transformers.cache_utils import Cache, DynamicCache, SlidingWindowCache, StaticCache
from transformers.generation import GenerationMixin
from transformers.integrations import use_kernel_forward_from_hub
from transformers.modeling_attn_mask_utils import AttentionMaskConverter
from transformers.modeling_flash_attention_utils import FlashAttentionKwargs
from transformers.modeling_outputs import (
    BaseModelOutputWithPast,
    CausalLMOutputWithPast,
)
from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from transformers.processing_utils import Unpack
from transformers.utils import (
    LossKwargs,
    can_return_tuple,
    is_torch_flex_attn_available,
    logging,
)
from .configuration_ruyi_qwen2 import RuyiQwen2Config

from ruyi.global_var import set_global_val, get_global_val


if is_torch_flex_attn_available():
    from torch.nn.attention.flex_attention import BlockMask
    from transformers.integrations.flex_attention import make_flex_block_causal_mask


logger = logging.get_logger(__name__)

class RuyiQwen2MLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.hidden_size = config.hidden_size
        self.intermediate_size = config.intermediate_size
        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
        self.act_fn = ACT2FN[config.hidden_act]

    def forward(self, x):
        down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
        return down_proj

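# Editor's note (illustrative, not part of the uploaded file): the MLP above is
# the gated Qwen2-style feed-forward, down_proj(act(gate_proj(x)) * up_proj(x)),
# i.e. SwiGLU when config.hidden_act is "silu". For hidden_size H and
# intermediate_size I the shapes run:
#   x: (batch, seq, H) -> gate/up: (batch, seq, I) -> down: (batch, seq, H)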

def rotate_half(x):
    """Rotates half the hidden dims of the input."""
    x1 = x[..., : x.shape[-1] // 2]
    x2 = x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=-1)


def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
    """Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    """
    cos = cos.unsqueeze(unsqueeze_dim)
    sin = sin.unsqueeze(unsqueeze_dim)
    q_embed = (q * cos) + (rotate_half(q) * sin)
    k_embed = (k * cos) + (rotate_half(k) * sin)
    return q_embed, k_embed


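# Editor's note (illustrative): with head_dim = 4, rotate_half maps
# [x0, x1, x2, x3] -> [-x2, -x3, x0, x1]. Because the rotary embedding below
# duplicates each frequency across both halves (torch.cat((freqs, freqs))),
# q*cos + rotate_half(q)*sin applies a 2D rotation to each pair (x0, x2) and
# (x1, x3) -- the half-split RoPE formulation. unsqueeze_dim=1 inserts the
# broadcast axis over heads for (batch, heads, seq, head_dim) inputs.
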
def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    """
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    """
    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
    if n_rep == 1:
        return hidden_states
    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)


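# Editor's note (illustrative): repeat_kv is the grouped-query-attention (GQA)
# broadcast. For example, num_attention_heads=12 with num_key_value_heads=2
# gives n_rep = 6, so a KV tensor of shape (B, 2, T, D) is expanded to
# (B, 12, T, D) and each KV head serves 6 query heads.
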
def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    scaling: float,
    dropout: float = 0.0,
    **kwargs,
):
    key_states = repeat_kv(key, module.num_key_value_groups)
    value_states = repeat_kv(value, module.num_key_value_groups)

    attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling
    if attention_mask is not None:
        causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
        attn_weights = attn_weights + causal_mask

    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
    attn_output = torch.matmul(attn_weights, value_states)
    attn_output = attn_output.transpose(1, 2).contiguous()

    return attn_output, attn_weights


class RuyiQwen2Attention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config: RuyiQwen2Config, layer_idx: int):
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx
        self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
        self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
        self.scaling = self.head_dim**-0.5
        self.attention_dropout = config.attention_dropout
        self.is_causal = True
        self.q_proj = nn.Linear(config.hidden_size, config.num_attention_heads * self.head_dim, bias=True)
        self.k_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * self.head_dim, bias=True)
        self.v_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * self.head_dim, bias=True)
        self.o_proj = nn.Linear(config.num_attention_heads * self.head_dim, config.hidden_size, bias=False)

    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: Tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor],
        past_key_value: Optional[Cache] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        input_shape = hidden_states.shape[:-1]
        hidden_shape = (*input_shape, -1, self.head_dim)

        query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)
        key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2)
        value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)

        cos, sin = position_embeddings
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

        if past_key_value is not None:
            # sin and cos are specific to RoPE models; cache_position needed for the static cache
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)

        sliding_window = None
        if (
            self.config.use_sliding_window
            and getattr(self.config, "sliding_window", None) is not None
            and self.layer_idx >= self.config.max_window_layers
        ):
            sliding_window = self.config.sliding_window

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
                logger.warning_once(
                    "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to "
                    'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
                )
            else:
                attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,
            query_states,
            key_states,
            value_states,
            attention_mask,
            dropout=0.0 if not self.training else self.attention_dropout,
            scaling=self.scaling,
            sliding_window=sliding_window,  # main diff with Llama
            **kwargs,
        )

        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
        attn_output = self.o_proj(attn_output)
        return attn_output, attn_weights


@use_kernel_forward_from_hub("RMSNorm")
class RuyiQwen2RMSNorm(nn.Module):
    def __init__(self, hidden_size, eps=1e-6):
        """
        RuyiQwen2RMSNorm is equivalent to T5LayerNorm
        """
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        input_dtype = hidden_states.dtype
        hidden_states = hidden_states.to(torch.float32)
        variance = hidden_states.pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
        return self.weight * hidden_states.to(input_dtype)

    def extra_repr(self):
        return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"


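# Editor's note (illustrative): in float32, the norm above computes
#   y = weight * x / sqrt(mean(x**2, dim=-1) + eps)
# i.e. RMSNorm: no mean subtraction and no bias, unlike nn.LayerNorm.
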
class RuyiQwen2DecoderLayer(nn.Module):
    def __init__(self, config: RuyiQwen2Config, layer_idx: int):
        super().__init__()
        self.layer_idx = layer_idx
        self.hidden_size = config.hidden_size
        self.self_attn = RuyiQwen2Attention(config=config, layer_idx=layer_idx)
        self.mlp = RuyiQwen2MLP(config)
        self.input_layernorm = RuyiQwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.post_attention_layernorm = RuyiQwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        if config.sliding_window and config._attn_implementation != "flash_attention_2":
            logger.warning_once(
                f"Sliding Window Attention is enabled but not implemented for `{config._attn_implementation}`; "
                "unexpected results may be encountered."
            )

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
        residual = hidden_states
        hidden_states = self.input_layernorm(hidden_states)

        # Self Attention
        hidden_states, self_attn_weights = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
            use_cache=use_cache,
            cache_position=cache_position,
            position_embeddings=position_embeddings,
            **kwargs,
        )
        hidden_states = residual + hidden_states

        # Fully Connected
        residual = hidden_states
        hidden_states = self.post_attention_layernorm(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states

        outputs = (hidden_states,)
        if output_attentions:
            outputs += (self_attn_weights,)

        return outputs


class RuyiQwen2PreTrainedModel(PreTrainedModel):
    config_class = RuyiQwen2Config
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _no_split_modules = ["RuyiQwen2DecoderLayer"]
    _skip_keys_device_placement = ["past_key_values"]
    _supports_flash_attn_2 = True
    _supports_sdpa = True
    _supports_flex_attn = True
    _supports_cache_class = True
    _supports_quantized_cache = True
    _supports_static_cache = True
    _supports_attention_backend = True

    def _init_weights(self, module):
        std = self.config.initializer_range
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, RuyiQwen2RMSNorm):
            module.weight.data.fill_(1.0)


class RuyiQwen2RotaryEmbedding(nn.Module):
    def __init__(self, config: RuyiQwen2Config, device=None):
        super().__init__()
        # BC: "rope_type" was originally "type"
        if hasattr(config, "rope_scaling") and config.rope_scaling is not None:
            self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
        else:
            self.rope_type = "default"
        self.max_seq_len_cached = config.max_position_embeddings
        self.original_max_seq_len = config.max_position_embeddings

        self.config = config
        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]

        inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
        self.register_buffer("inv_freq", inv_freq, persistent=False)
        self.original_inv_freq = self.inv_freq

    @torch.no_grad()
    @dynamic_rope_update  # power user: used with advanced RoPE types (e.g. dynamic rope)
    def forward(self, x, position_ids):
        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
        position_ids_expanded = position_ids[:, None, :].float()

        device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
            emb = torch.cat((freqs, freqs), dim=-1)
            cos = emb.cos() * self.attention_scaling
            sin = emb.sin() * self.attention_scaling

        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)

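# Editor's note (illustrative, assuming the stock transformers RoPE helpers):
# for rope_type "default", rope_init_fn returns attention_scaling = 1.0 and
# inv_freq[j] = 1 / rope_theta ** (2j / head_dim) for j = 0 .. head_dim/2 - 1,
# so forward() yields cos/sin of shape (batch, seq_len, head_dim) from
# position_ids of shape (batch, seq_len) via an outer product with inv_freq
# followed by duplication along the last axis.
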

class RuyiQwen2Model(RuyiQwen2PreTrainedModel):
    """
    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`RuyiQwen2DecoderLayer`]

    Args:
        config: RuyiQwen2Config
    """

    def __init__(self, config: RuyiQwen2Config):
        super().__init__(config)
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size

        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
        self.layers = nn.ModuleList(
            [RuyiQwen2DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
        )
        self.eelayers = nn.ModuleList(
            [RuyiQwen2DecoderLayer(config, layer_idx) for layer_idx in config.early_exit_points[:-1]]
        )
        self.norms = nn.ModuleList(
            [RuyiQwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) for _ in config.early_exit_points]
        )
        self.rotary_emb = RuyiQwen2RotaryEmbedding(config=config)
        self.gradient_checkpointing = False

        if config.default_early_exit_point not in config.early_exit_points:
            config.default_early_exit_point = config.early_exit_points[-1]
        set_global_val("early_exit_point", config.default_early_exit_point)

        # Initialize weights and apply final processing
        self.post_init()

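    # Editor's note (illustrative, values hypothetical): the early-exit wiring
    # above is the main difference from stock Qwen2. With num_hidden_layers = 28
    # and, say, early_exit_points = [8, 14, 20, 24, 27], the model keeps one
    # dedicated exit decoder layer per non-final exit (self.eelayers, 4 modules,
    # matching the model.eelayers.0-3 keys in model.safetensors.index.json) and
    # one RMSNorm per exit (self.norms, 5 modules, matching model.norms.0-4).
    # The active exit point is stored process-wide via set_global_val.
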
    def get_input_embeddings(self):
        return self.embed_tokens

    def set_input_embeddings(self, value):
        self.embed_tokens = value

    def save_pretrained(self, save_directory, **kwargs):
        super().save_pretrained(save_directory, **kwargs)
        shutil.copyfile(
            os.path.abspath(__file__),
            os.path.join(save_directory, "modeling_ruyi_qwen2.py")
        )

    @can_return_tuple
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **flash_attn_kwargs: Unpack[FlashAttentionKwargs],
    ) -> BaseModelOutputWithPast:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache

        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        if self.gradient_checkpointing and self.training and use_cache:
            logger.warning_once(
                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
            )
            use_cache = False

        # TODO (joao): remove this exception in v4.56 -- it exists for users that try to pass a legacy cache
        if not isinstance(past_key_values, (type(None), Cache)):
            raise ValueError("The `past_key_values` should be either a `Cache` object or `None`.")

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        if use_cache and past_key_values is None:
            past_key_values = DynamicCache()

        if cache_position is None:
            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
            cache_position = torch.arange(
                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
            )

        if position_ids is None:
            position_ids = cache_position.unsqueeze(0)

        causal_mask = self._update_causal_mask(
            attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
        )

        hidden_states = inputs_embeds

        # create position embeddings to be shared across the decoder layers
        position_embeddings = self.rotary_emb(hidden_states, position_ids)

        # decoder layers
        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None

        early_exit_point = get_global_val("early_exit_point", self.config.early_exit_points[-1])
        for decoder_layer in chain(
            self.layers[:early_exit_point],
            [self.layers[-1] if early_exit_point == self.config.num_hidden_layers - 1
             else self.eelayers[self.config.early_exit_points.index(early_exit_point)]]
        ):
            if output_hidden_states:
                all_hidden_states += (hidden_states,)

            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    partial(decoder_layer.__call__, **flash_attn_kwargs),
                    hidden_states,
                    causal_mask,
                    position_ids,
                    past_key_values,
                    output_attentions,
                    use_cache,
                    cache_position,
                    position_embeddings,
                )
            else:
                layer_outputs = decoder_layer(
                    hidden_states,
                    attention_mask=causal_mask,
                    position_ids=position_ids,
                    past_key_value=past_key_values,
                    output_attentions=output_attentions,
                    use_cache=use_cache,
                    cache_position=cache_position,
                    position_embeddings=position_embeddings,
                    **flash_attn_kwargs,
                )

            if isinstance(layer_outputs, tuple):
                hidden_states = layer_outputs[0]
            else:
                hidden_states = layer_outputs  # deepspeed gradient checkpointing

            if output_attentions:
                all_self_attns += (layer_outputs[1],)

        hidden_states = self.norms[self.config.early_exit_points.index(early_exit_point)](hidden_states)

        # add hidden states from the last decoder layer
        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values if use_cache else None,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
        )

| 514 |
+
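To make the trunk-plus-tail layer selection in `forward` concrete, here is a small standalone sketch with toy values (all names below are hypothetical stand-ins for `self.layers`, `self.eelayers`, and the config fields):

```python
from itertools import chain

# Toy stand-ins (hypothetical values, not the real config).
num_hidden_layers = 8
early_exit_points = [3, 5, 7]          # indices where an exit branch exists
layers = [f"layer_{i}" for i in range(num_hidden_layers)]
eelayers = [f"eelayer_for_{p}" for p in early_exit_points]

def layers_to_run(early_exit_point: int) -> list[str]:
    # Mirrors the chain(...) above: shared trunk up to the exit point, then
    # either the final layer (full-depth path) or that point's exit layer.
    tail = (
        layers[-1]
        if early_exit_point == num_hidden_layers - 1
        else eelayers[early_exit_points.index(early_exit_point)]
    )
    return list(chain(layers[:early_exit_point], [tail]))

print(layers_to_run(3))  # ['layer_0', 'layer_1', 'layer_2', 'eelayer_for_3']
print(layers_to_run(7))  # ['layer_0', ..., 'layer_6', 'layer_7']  (full depth)
```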
    def _update_causal_mask(
        self,
        attention_mask: Union[torch.Tensor, "BlockMask"],
        input_tensor: torch.Tensor,
        cache_position: torch.Tensor,
        past_key_values: Cache,
        output_attentions: bool = False,
    ):
        if self.config._attn_implementation == "flash_attention_2":
            if attention_mask is not None and past_key_values is not None:
                is_padding_right = attention_mask[:, -1].sum().item() != input_tensor.size()[0]
                if is_padding_right:
                    raise ValueError(
                        "You are attempting to perform batched generation with padding_side='right'"
                        " this may lead to unexpected behaviour for Flash Attention version of Qwen2. Make sure to "
                        " call `tokenizer.padding_side = 'left'` before tokenizing the input. "
                    )
            if attention_mask is not None and 0.0 in attention_mask:
                return attention_mask
            return None
        if self.config._attn_implementation == "flex_attention":
            if isinstance(attention_mask, torch.Tensor):
                attention_mask = make_flex_block_causal_mask(attention_mask)
            return attention_mask

        # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
        # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
        # to infer the attention mask.
        past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
        using_static_cache = isinstance(past_key_values, StaticCache)
        using_sliding_window_cache = isinstance(past_key_values, SlidingWindowCache)

        # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward
        if (
            self.config._attn_implementation == "sdpa"
            and not (using_static_cache or using_sliding_window_cache)
            and not output_attentions
        ):
            if AttentionMaskConverter._ignore_causal_mask_sdpa(
                attention_mask,
                inputs_embeds=input_tensor,
                past_key_values_length=past_seen_tokens,
                sliding_window=self.config.sliding_window,
                is_training=self.training,
            ):
                return None

        dtype, device = input_tensor.dtype, input_tensor.device
        min_dtype = torch.finfo(dtype).min
        sequence_length = input_tensor.shape[1]
        # SlidingWindowCache or StaticCache
        if using_sliding_window_cache or using_static_cache:
            target_length = past_key_values.get_max_cache_shape()
        # DynamicCache or no cache
        else:
            target_length = (
                attention_mask.shape[-1]
                if isinstance(attention_mask, torch.Tensor)
                else past_seen_tokens + sequence_length + 1
            )

        # In case the provided `attention` mask is 2D, we generate a causal mask here (4D).
        causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
            attention_mask,
            sequence_length=sequence_length,
            target_length=target_length,
            dtype=dtype,
            device=device,
            cache_position=cache_position,
            batch_size=input_tensor.shape[0],
            config=self.config,
            past_key_values=past_key_values,
        )

        if (
            self.config._attn_implementation == "sdpa"
            and attention_mask is not None
            and attention_mask.device.type in ["cuda", "xpu", "npu"]
            and not output_attentions
        ):
            # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
            # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
            # Details: https://github.com/pytorch/pytorch/issues/110213
            causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)

        return causal_mask
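The SDPA branch above returns `None` whenever the explicit mask can be dropped, since `is_causal=True` inside the attention call reproduces it. A toy check (made-up shapes) that the two formulations agree:

```python
import torch
import torch.nn.functional as F

# With no padding and no cache edge cases, SDPA's is_causal=True matches
# an explicit additive causal mask.
q = torch.randn(1, 2, 4, 8)              # (batch, heads, seq, head_dim)
k, v = torch.randn_like(q), torch.randn_like(q)

explicit = torch.full((4, 4), float("-inf")).triu(1)  # -inf above the diagonal
a = F.scaled_dot_product_attention(q, k, v, attn_mask=explicit)
b = F.scaled_dot_product_attention(q, k, v, is_causal=True)
print(torch.allclose(a, b, atol=1e-6))   # True
```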
    @staticmethod
    def _prepare_4d_causal_attention_mask_with_cache_position(
        attention_mask: torch.Tensor,
        sequence_length: int,
        target_length: int,
        dtype: torch.dtype,
        device: torch.device,
        cache_position: torch.Tensor,
        batch_size: int,
        config: RuyiQwen2Config,
        past_key_values: Cache,
    ):
        """
        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

        Args:
            attention_mask (`torch.Tensor`):
                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
                `(batch_size, 1, query_length, key_value_length)`.
            sequence_length (`int`):
                The sequence length being processed.
            target_length (`int`):
                The target length: when generating with static cache, the mask should be as long as the static
                cache, to account for the 0 padding, the part of the cache that is not filled yet.
            dtype (`torch.dtype`):
                The dtype to use for the 4D attention mask.
            device (`torch.device`):
                The device to place the 4D attention mask on.
            batch_size (`int`):
                Batch size.
            cache_position (`torch.Tensor`):
                Indices depicting the position of the input sequence tokens in the sequence.
            config (`RuyiQwen2Config`):
                The model's configuration class.
            past_key_values (`Cache`):
                The cache class that is being used currently to generate.
        """
        if attention_mask is not None and attention_mask.dim() == 4:
            # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
            causal_mask = attention_mask
        else:
            min_dtype = torch.finfo(dtype).min
            causal_mask = torch.full(
                (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device
            )
            diagonal_attend_mask = torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
            if config.get_text_config().sliding_window is not None:
                # if we have sliding window, we should not attend to tokens beyond sliding window length, so we mask them out also
                # the check is needed to verify if the current checkpoint was trained with sliding window or not
                if not isinstance(past_key_values, SlidingWindowCache) or sequence_length > target_length:
                    sliding_attend_mask = torch.arange(target_length, device=device) <= (
                        cache_position.reshape(-1, 1) - config.get_text_config().sliding_window
                    )
                    diagonal_attend_mask.bitwise_or_(sliding_attend_mask)
            causal_mask *= diagonal_attend_mask
            causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
            if attention_mask is not None:
                causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
                if attention_mask.shape[-1] > target_length:
                    attention_mask = attention_mask[:, :target_length]
                mask_length = attention_mask.shape[-1]
                padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
                    causal_mask.device
                )
                padding_mask = padding_mask == 0
                causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
                    padding_mask, min_dtype
                )
        return causal_mask
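A minimal reproduction of the non-sliding-window mask arithmetic above, with toy sizes (2 cached tokens, 3 new ones):

```python
import torch

dtype = torch.float32
min_dtype = torch.finfo(dtype).min
sequence_length, target_length = 3, 5     # 3 new query tokens, 5 total key positions
cache_position = torch.arange(2, 5)       # absolute positions of the new tokens

causal_mask = torch.full((sequence_length, target_length), fill_value=min_dtype, dtype=dtype)
diagonal_attend_mask = torch.arange(target_length) > cache_position.reshape(-1, 1)
causal_mask *= diagonal_attend_mask       # zero (= attend) where key_pos <= query_pos
print(causal_mask == 0)
# tensor([[ True,  True,  True, False, False],
#         [ True,  True,  True,  True, False],
#         [ True,  True,  True,  True,  True]])
```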
class KwargsForCausalLM(FlashAttentionKwargs, LossKwargs): ...


class RuyiQwen2ForCausalLM(RuyiQwen2PreTrainedModel, GenerationMixin):
    _tied_weights_keys = ["lm_head.weight"]
    _tp_plan = {"lm_head": "colwise_rep"}
    _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}

    def __init__(self, config):
        super().__init__(config)
        self.config = config
        self.model = RuyiQwen2Model(config)
        self.vocab_size = config.vocab_size
        self.shared_heads = config.shared_heads
        if self.shared_heads:
            # One LM head shared by all early-exit points.
            self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
        else:
            # One LM head per early-exit point.
            self.lm_head = nn.ModuleList(
                [nn.Linear(config.hidden_size, config.vocab_size, bias=False) for _ in config.early_exit_points]
            )

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.model.embed_tokens

    def set_input_embeddings(self, value):
        self.model.embed_tokens = value

    def get_output_embeddings(self):
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        self.lm_head = new_embeddings

    def set_decoder(self, decoder):
        self.model = decoder

    def get_decoder(self):
        return self.model

    def save_pretrained(self, save_directory, **kwargs):
        # Also ship this modeling file alongside the weights so the checkpoint
        # stays loadable with trust_remote_code=True.
        super().save_pretrained(save_directory, **kwargs)
        shutil.copyfile(
            os.path.abspath(__file__),
            os.path.join(save_directory, "modeling_ruyi_qwen2.py")
        )

    @can_return_tuple
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        logits_to_keep: Union[int, torch.Tensor] = 0,
        **kwargs: Unpack[KwargsForCausalLM],
    ) -> CausalLMOutputWithPast:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
        outputs: BaseModelOutputWithPast = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            cache_position=cache_position,
            **kwargs,
        )

        hidden_states = outputs.last_hidden_state
        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
        if self.shared_heads:
            logits = self.lm_head(hidden_states[:, slice_indices, :])
        else:
            early_exit_point = get_global_val("early_exit_point", self.config.early_exit_points[-1])
            logits = self.lm_head[self.config.early_exit_points.index(early_exit_point)](
                hidden_states[:, slice_indices, :]
            )

        loss = None
        if labels is not None:
            loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs)

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


__all__ = [
    "RuyiQwen2PreTrainedModel",
    "RuyiQwen2Model",
    "RuyiQwen2ForCausalLM",
]
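With the modeling file in place, the checkpoint loads through the standard remote-code path. A minimal usage sketch (the repo id below is a placeholder, and dtype/device handling is simplified):

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

repo = "ORG/AI-Flow-Ruyi"  # placeholder repo id
tokenizer = AutoTokenizer.from_pretrained(repo)
# trust_remote_code=True makes transformers import configuration_ruyi_qwen2.py
# and modeling_ruyi_qwen2.py from the checkpoint instead of the built-in Qwen2 classes.
model = AutoModelForCausalLM.from_pretrained(repo, trust_remote_code=True)

messages = [{"role": "user", "content": "Hello!"}]
input_ids = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
)
output = model.generate(input_ids, max_new_tokens=32)
print(tokenizer.decode(output[0][input_ids.shape[-1]:], skip_special_tokens=True))
```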
special_tokens_map.json
ADDED
@@ -0,0 +1,31 @@
{
  "additional_special_tokens": [
    "<|im_start|>",
    "<|im_end|>",
    "<|object_ref_start|>",
    "<|object_ref_end|>",
    "<|box_start|>",
    "<|box_end|>",
    "<|quad_start|>",
    "<|quad_end|>",
    "<|vision_start|>",
    "<|vision_end|>",
    "<|vision_pad|>",
    "<|image_pad|>",
    "<|video_pad|>"
  ],
  "eos_token": {
    "content": "<|endoftext|>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "pad_token": {
    "content": "<|endoftext|>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  }
}
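These mappings are what `AutoTokenizer` picks up at load time; a quick sanity check (placeholder repo id):

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("ORG/AI-Flow-Ruyi")  # placeholder repo id
print(tok.eos_token, tok.pad_token)                   # <|endoftext|> <|endoftext|>
print("<|im_end|>" in tok.additional_special_tokens)  # True
```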
tokenizer.json
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:9c5ae00e602b8860cbd784ba82a8aa14e8feecec692e7076590d014d7b7fdafa
size 11421896
tokenizer_config.json
ADDED
@@ -0,0 +1,208 @@
{
  "add_bos_token": false,
  "add_prefix_space": false,
  "added_tokens_decoder": {
    "151643": {
      "content": "<|endoftext|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151644": {
      "content": "<|im_start|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151645": {
      "content": "<|im_end|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151646": {
      "content": "<|object_ref_start|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151647": {
      "content": "<|object_ref_end|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151648": {
      "content": "<|box_start|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151649": {
      "content": "<|box_end|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151650": {
      "content": "<|quad_start|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151651": {
      "content": "<|quad_end|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151652": {
      "content": "<|vision_start|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151653": {
      "content": "<|vision_end|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151654": {
      "content": "<|vision_pad|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151655": {
      "content": "<|image_pad|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151656": {
      "content": "<|video_pad|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151657": {
      "content": "<tool_call>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "151658": {
      "content": "</tool_call>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "151659": {
      "content": "<|fim_prefix|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "151660": {
      "content": "<|fim_middle|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "151661": {
      "content": "<|fim_suffix|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "151662": {
      "content": "<|fim_pad|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "151663": {
      "content": "<|repo_name|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "151664": {
      "content": "<|file_sep|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": false
    }
  },
  "additional_special_tokens": [
    "<|im_start|>",
    "<|im_end|>",
    "<|object_ref_start|>",
    "<|object_ref_end|>",
    "<|box_start|>",
    "<|box_end|>",
    "<|quad_start|>",
    "<|quad_end|>",
    "<|vision_start|>",
    "<|vision_end|>",
    "<|vision_pad|>",
    "<|image_pad|>",
    "<|video_pad|>"
  ],
  "bos_token": null,
  "chat_template": "{%- if tools %}\n    {{- '<|im_start|>system\\n' }}\n    {%- if messages[0]['role'] == 'system' %}\n        {{- messages[0]['content'] }}\n    {%- else %}\n        {{- 'You are a helpful assistant.' }}\n    {%- endif %}\n    {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n    {%- for tool in tools %}\n        {{- \"\\n\" }}\n        {{- tool | tojson }}\n    {%- endfor %}\n    {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n    {%- if messages[0]['role'] == 'system' %}\n        {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n    {%- else %}\n        {{- '<|im_start|>system\\nYou are a helpful assistant.<|im_end|>\\n' }}\n    {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n    {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n        {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n    {%- elif message.role == \"assistant\" %}\n        {{- '<|im_start|>' + message.role }}\n        {%- if message.content %}\n            {{- '\\n' + message.content }}\n        {%- endif %}\n        {%- for tool_call in message.tool_calls %}\n            {%- if tool_call.function is defined %}\n                {%- set tool_call = tool_call.function %}\n            {%- endif %}\n            {{- '\\n<tool_call>\\n{\"name\": \"' }}\n            {{- tool_call.name }}\n            {{- '\", \"arguments\": ' }}\n            {{- tool_call.arguments | tojson }}\n            {{- '}\\n</tool_call>' }}\n        {%- endfor %}\n        {{- '<|im_end|>\\n' }}\n    {%- elif message.role == \"tool\" %}\n        {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n            {{- '<|im_start|>user' }}\n        {%- endif %}\n        {{- '\\n<tool_response>\\n' }}\n        {{- message.content }}\n        {{- '\\n</tool_response>' }}\n        {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n            {{- '<|im_end|>\\n' }}\n        {%- endif %}\n    {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n    {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n",
  "clean_up_tokenization_spaces": false,
  "eos_token": "<|endoftext|>",
  "errors": "replace",
  "extra_special_tokens": {},
  "model_max_length": 131072,
  "pad_token": "<|endoftext|>",
  "split_special_tokens": false,
  "tokenizer_class": "Qwen2Tokenizer",
  "unk_token": null
}
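The `chat_template` above is the standard Qwen2 ChatML template; rendering a single user turn (placeholder repo id) shows the format the model expects:

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("ORG/AI-Flow-Ruyi")  # placeholder repo id
print(tok.apply_chat_template(
    [{"role": "user", "content": "Hi"}],
    tokenize=False,
    add_generation_prompt=True,
))
# <|im_start|>system
# You are a helpful assistant.<|im_end|>
# <|im_start|>user
# Hi<|im_end|>
# <|im_start|>assistant
```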
vocab.json
ADDED
The diff for this file is too large to render. See raw diff.