add model
This view is limited to 50 files because it contains too many changes.
- Llama-Chinese +0 -1
- Llama-Chinese/.gitattributes +1 -0
- Llama-Chinese/.vscode/launch.json +15 -0
- Llama-Chinese/README.md +815 -0
- Llama-Chinese/README_EN.md +736 -0
- Llama-Chinese/assets/base_eval.png +0 -0
- Llama-Chinese/assets/ceval.jpg +0 -0
- Llama-Chinese/assets/llama.jpg +0 -0
- Llama-Chinese/assets/llama.png +0 -0
- Llama-Chinese/assets/llama2-chinese-webui.jpg +0 -0
- Llama-Chinese/assets/llama3_eval.png +0 -0
- Llama-Chinese/assets/llama_eval.jpeg +0 -0
- Llama-Chinese/assets/meta_eval_13B.md +0 -0
- Llama-Chinese/assets/meta_eval_7B.md +0 -0
- Llama-Chinese/assets/tuned_eval.png +0 -0
- Llama-Chinese/assets/wechat.jpeg +0 -0
- Llama-Chinese/data/dev_sft.csv +0 -0
- Llama-Chinese/data/dev_sft_sharegpt.csv +0 -0
- Llama-Chinese/data/train_sft.csv +0 -0
- Llama-Chinese/docker/Dockerfile +27 -0
- Llama-Chinese/docker/Dockerfile_train +9 -0
- Llama-Chinese/docker/docker-compose.yml +16 -0
- Llama-Chinese/docs/chat_gradio_guide.md +26 -0
- Llama-Chinese/docs/inference_speed_guide.md +21 -0
- Llama-Chinese/examples/chat_gradio.py +99 -0
- Llama-Chinese/examples/chat_gradio_no_merge.py +105 -0
- Llama-Chinese/examples/llama2_for_langchain.py +52 -0
- Llama-Chinese/finetune_test.py +32 -0
- Llama-Chinese/important-change.txt +2 -0
- Llama-Chinese/inference-speed/CPU/ggml/README.md +66 -0
- Llama-Chinese/inference-speed/GPU/FasterTransformer_example/README.md +165 -0
- Llama-Chinese/inference-speed/GPU/JittorLLMs_example/README.md +96 -0
- Llama-Chinese/inference-speed/GPU/TensorRT-LLM_example/README.md +72 -0
- Llama-Chinese/inference-speed/GPU/TensorRT-LLM_example/atom_inference.py +184 -0
- Llama-Chinese/inference-speed/GPU/TensorRT-LLM_example/utils.py +130 -0
- Llama-Chinese/inference-speed/GPU/lmdeploy_example/README.md +126 -0
- Llama-Chinese/inference-speed/GPU/lmdeploy_example/test_api_server.py +73 -0
- Llama-Chinese/inference-speed/GPU/vllm_example/README.md +57 -0
- Llama-Chinese/inference-speed/GPU/vllm_example/api_server.py +85 -0
- Llama-Chinese/inference-speed/GPU/vllm_example/client_test.py +137 -0
- Llama-Chinese/inference-speed/GPU/vllm_example/multi_gpus_api_server.sh +4 -0
- Llama-Chinese/inference-speed/GPU/vllm_example/single_gpu_api_server.sh +4 -0
- Llama-Chinese/model/Atom-7B-Chat-pre/.gitattributes +2 -0
- Llama-Chinese/model/Atom-7B-Chat-pre/config.json +51 -0
- Llama-Chinese/model/Atom-7B-Chat-pre/generation_config.json +8 -0
- Llama-Chinese/model/Atom-7B-Chat-pre/model-00001-of-00002.safetensors +3 -0
- Llama-Chinese/model/Atom-7B-Chat-pre/model-00002-of-00002.safetensors +3 -0
- Llama-Chinese/model/Atom-7B-Chat-pre/model.safetensors.index.json +522 -0
- Llama-Chinese/output/train.log +1 -0
- Llama-Chinese/quick_start.py +26 -0
Llama-Chinese
DELETED
@@ -1 +0,0 @@
Subproject commit 5f738c6278256f5ef89a35ba3a5c144320629dfc
Llama-Chinese/.gitattributes
ADDED
@@ -0,0 +1 @@
model/Atom-7B-Chat-pre/ filter=lfs diff=lfs merge=lfs -text
Llama-Chinese/.vscode/launch.json
ADDED
@@ -0,0 +1,15 @@
{
    // Use IntelliSense to learn about possible attributes.
    // Hover to view descriptions of existing attributes.
    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
    "version": "0.2.0",
    "configurations": [
        {
            "name": "Python Debugger: Current File",
            "type": "debugpy",
            "request": "launch",
            "program": "${file}",
            "console": "integratedTerminal"
        }
    ]
}
Llama-Chinese/README.md
ADDED
@@ -0,0 +1,815 @@
<p align="left">
|
| 2 |
+
<a href="README_EN.md">English</a> | 中文
|
| 3 |
+
</p>
|
| 4 |
+
|
| 5 |
+
<h1 align="center">
|
| 6 |
+
Llama中文社区
|
| 7 |
+
</h1>
|
| 8 |
+
<p align="center" width="100%">
|
| 9 |
+
<img src="assets/llama.jpg" alt="Llama" style="width: 20%; display: block; margin: auto;"></a>
|
| 10 |
+
</p>
|
| 11 |
+
<p align="center">
|
| 12 |
+
<font face="黑体" color=orange size="6"> Llama3体验和微调已开放,最好的中文Llama大模型 </font>
|
| 13 |
+
</p>
|
| 14 |
+
|
| 15 |
+
<p align="center">
|
| 16 |
+
🤗 <a href="https://huggingface.co/FlagAlpha" target="_blank">Hugging Face</a> • 🤖 <a href="https://www.modelscope.cn/organization/FlagAlpha/" target="_blank">ModelScope</a> • ✡️ <a href="https://wisemodel.cn/models/FlagAlpha/Atom-7B-Chat" target="_blank">WiseModel</a>
|
| 17 |
+
</p>
|
| 18 |
+
|
| 19 |
+
<p align="center">
|
| 20 |
+
<a href="https://llama.family">Llama3.1 在线体验(包含Llama2):https://llama.family</a>
|
| 21 |
+
</p>
|
| 22 |
+
<p align="center">
|
| 23 |
+
<a href="https://huggingface.co/FlagAlpha/Atom-7B-Chat">基于Llama的开源中文预训练大模型Atom</a>
|
| 24 |
+
</p>
|
| 25 |
+
|
| 26 |
+
</br></br>
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
## 🗂️ 目录
|
| 30 |
+
- [📌 Llama中文社区](#-llama中文社区)
|
| 31 |
+
* [🔥 社区介绍:Llama中文社区](#-社区介绍llama中文社区)
|
| 32 |
+
* [📢 最新动态](#-最新动态)
|
| 33 |
+
* [🤗 模型](#-模型)
|
| 34 |
+
+ [🤗 中文预训练模型Atom-7B](#-中文预训练模型atom)
|
| 35 |
+
+ [🤗 Llama3官方模型](#llama3官方模型)
|
| 36 |
+
+ [🤗 Llama3中文微调模型](#llama3中文微调模型)
|
| 37 |
+
+ [🤗 Llama2官方模型](#llama2官方模型)
|
| 38 |
+
+ [🤗 Llama2中文微调模型](#llama2中文微调模型)
|
| 39 |
+
* [🌟 社区资源](#社区资源)
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
- [📌 如何使用Llama模型?](#-如何使用llama模型)
|
| 43 |
+
- [快速上手-使用Anaconda](#快速上手-使用anaconda)
|
| 44 |
+
- [快速上手-使用Docker](#快速上手-使用docker)
|
| 45 |
+
- [快速上手-使用llama.cpp](#快速上手-使用llamacpp)
|
| 46 |
+
- [快速上手-使用gradio](#快速上手-使用gradio)
|
| 47 |
+
- [快速上手-构建API服务](#快速上手-构建api服务)
|
| 48 |
+
- [快速上手-使用ollama运行](#快速上手-使用ollama运行)
|
| 49 |
+
|
| 50 |
+
+ [🤖 模型预训练](#-模型预训练)
|
| 51 |
+
+ [💡 模型微调](#-模型微调)
|
| 52 |
+
- [Step1: 环境准备](#step1-环境准备)
|
| 53 |
+
- [Step2: 数据准备](#step2-数据准备)
|
| 54 |
+
- [Step3: 微调脚本](#step3-微调脚本)
|
| 55 |
+
* [LoRA微调](#lora微调)
|
| 56 |
+
* [全量参数微调](#全量参数微调)
|
| 57 |
+
- [Step4: 加载微调模型](#step4-加载微调模型)
|
| 58 |
+
* [LoRA微调](#lora微调-1)
|
| 59 |
+
* [全量参数微调](#全量参数微调-1)
|
| 60 |
+
+ [🍄 模型量化](#-模型量化)
|
| 61 |
+
|
| 62 |
+
+ [🚀 部署加速](#-部署加速)
|
| 63 |
+
- [TensorRT-LLM](#tensorrt-llm)
|
| 64 |
+
- [vLLM](#vllm)
|
| 65 |
+
- [JittorLLMs](#jittorllms)
|
| 66 |
+
- [lmdeploy](#lmdeploy)
|
| 67 |
+
|
| 68 |
+
+ [💪 外延能力](#-外延能力)
|
| 69 |
+
- [LangChain](#langchain)
|
| 70 |
+
|
| 71 |
+
* [🥇 模型评测](#-模型评测)
|
| 72 |
+
+ [Llama2和Llama3对比评测](#llama2和llama3对比评测)
|
| 73 |
+
+ [Llama3模型评测](#llama3模型评测)
|
| 74 |
+
+ [Llama2模型评测](#llama2模型评测)
|
| 75 |
+
|
| 76 |
+
* [📖 学习中心](#-学习中心)
|
| 77 |
+
+ [Llama3](#llama3)
|
| 78 |
+
+ [Llama2](#llama2)
|
| 79 |
+
- [Meta官方对于Llama2的介绍](#meta官方对于llama2的介绍)
|
| 80 |
+
+ [Llama相关论文](#llama相关论文)
|
| 81 |
+
|
| 82 |
+
- [📌 其它](#-其它)
|
| 83 |
+
* [🎉 致谢](#-致谢)
|
| 84 |
+
* [🤔 问题反馈](#-问题反馈)
|
| 85 |
+
|
| 86 |
+
## 📌 Llama中文社区
|
| 87 |
+
|
| 88 |
+
### 🔥 社区介绍:llama中文社区
|
| 89 |
+
|
| 90 |
+
欢迎来到Llama中文社区!我们是一个专注于Llama模型在中文方面的优化和上层建设的高级技术社区。
|
| 91 |
+
**已经基于大规模中文数据,从预训练开始对Llama2模型进行中文能力的持续迭代升级【Done】**。**正在对Llama3模型进行中文能力的持续迭代升级【Doing】**
|
| 92 |
+
我们热忱欢迎对大模型LLM充满热情的开发者和研究者加入我们的行列。
|
| 93 |
+
|
| 94 |
+
<details>
|
| 95 |
+
|
| 96 |
+
#### 为什么选择Llama中文社区?
|
| 97 |
+
🚀 **高级工程师团队支持**:社区有一批专注为大家服务的NLP高级工程师,我们有着强大的技术支持和丰富的经验,为您提供专业的指导和帮助。
|
| 98 |
+
|
| 99 |
+
🎯 **中文优化**:我们致力于在Llama模型的中文处理方面进行优化,探索适用于中文的最佳实践,以提升其性能和适应性【支持Llama2、Llama3】。
|
| 100 |
+
|
| 101 |
+
💡 **创新交流**:我们拥有一支富有创造力和经验的社区成员团队,定期组织线上活动、技术研讨和经验分享,促进成员间的创新交流。
|
| 102 |
+
|
| 103 |
+
🌐 **全球联结**:我们欢迎来自世界各地的开发者加入社区,构建一个开放、多元化的学习和交流平台。
|
| 104 |
+
|
| 105 |
+
🤝 **开放共享**:我们鼓励社区成员开源分享代码和模型,推动合作共赢,共同促进中文NLP技术的发展。
|
| 106 |
+
|
| 107 |
+
#### 社区活动
|
| 108 |
+
🗓️ **线上讲座**:邀请行业内专家进行线上讲座,分享Llama在中文NLP领域的最新技术和应用,探讨前沿研究成果。
|
| 109 |
+
|
| 110 |
+
💻 **项目展示**:成员可展示自己在Llama中文优化方面的项目成果,获得反馈和建议,促进项目协作。
|
| 111 |
+
|
| 112 |
+
📚 **学习资源**:社区维护丰富的学习资料库,包括教程、文档和论文解读,为成员提供全面的学习支持。
|
| 113 |
+
|
| 114 |
+
📝 **论文解读**:社区成员共同解读与Llama相关的最新研究论文,深入理解前沿算法和方法。
|
| 115 |
+
|
| 116 |
+
🎉 **主题活动**:定期举办各类主题活动,包括挑战赛、黑客马拉松和技术沙龙,让社区成员在轻松愉快的氛围中交流和学习。
|
| 117 |
+
|
| 118 |
+
🌟 **奖励计划**:我们设立奖励计划,对社区中积极参与、贡献优秀的成员给予荣誉和奖励,激励更多优秀人才的加入。
|
| 119 |
+
|
| 120 |
+
📈 **技术咨询**:我们提供技术咨询服务,解答您在Llama开发和优化过程中遇到的问题,助您快速攻克难关。
|
| 121 |
+
|
| 122 |
+
🚀 **项目合作**:鼓励成员间的项目合作,共同探索Llama在实际应用中的潜力,打造创新解决方案。
|
| 123 |
+
|
| 124 |
+
|
| 125 |
+
#### 立即加入我们!
|
| 126 |
+
📚 **愿景**:无论您是对Llama已有研究和应用经验的专业开发者,还是对Llama中文优化感兴趣并希望深入探索的新手,我们都热切期待您的加入。在Llama中文社区,您将有机会与行业内顶尖人才共同交流,携手推动中文NLP技术的进步,开创更加美好的技术未来!
|
| 127 |
+
|
| 128 |
+
🔗 **温馨提示**:本社区为专业技术交流平台,我们热切期望志同道合的开发者和研究者加入。请遵守社区准则,共同维护积极向上的学习氛围。感谢您的理解和支持!
|
| 129 |
+
|
| 130 |
+
</details>
|
| 131 |
+
|
| 132 |
+
### 📢 Latest News

[Latest] July 24, 2024: [Llama 3.1](https://llama.meta.com/docs/overview) released, the strongest open-source model to date, in 8B, 70B, and 405B sizes!

[Latest] July 16, 2024: the [community forum](https://forum.llamafamily.cn/) is online; for LLM questions, ask the Llama Chinese Community!

[Latest] May 15, 2024: ollama now runs Llama3-Chinese-8B-Instruct and Atom-7B-Chat; see the [usage guide](https://github.com/LlamaFamily/Llama-Chinese?tab=readme-ov-file#%E5%BF%AB%E9%80%9F%E4%B8%8A%E6%89%8B-%E4%BD%BF%E7%94%A8ollama%E8%BF%90%E8%A1%8C).

[Latest] April 23, 2024: added the Llama3 8B Chinese fine-tuned model [Llama3-Chinese-8B-Instruct](https://github.com/LlamaFamily/Llama-Chinese?tab=readme-ov-file#llama3%E4%B8%AD%E6%96%87%E5%BE%AE%E8%B0%83%E6%A8%A1%E5%9E%8B) and the corresponding [free API](https://llama.family/docs/chat-completion-v1).

[Latest] April 19, 2024: added [online demos](https://llama.family/chat/#/) for Llama3 8B and Llama3 70B.

[Latest] April 14, 2024: added four expert roles: psychological counselor, alpaca complimenter, lawyer, and doctor. Link: [roles](https://llama.family/tools/#/agent).

[Latest] April 10, 2024: Atom-7B-Chat now gives richer answers, follows instructions more reliably, and uses an improved PPO reward model. Downloads: [ModelScope](https://modelscope.cn/models/FlagAlpha/Atom-7B-Chat), [Hugging Face](https://huggingface.co/FlagAlpha/Atom-7B-Chat).

[Latest] April 1, 2024: the Llama Chinese [application platform](https://llama.family/store) is online; if you have a great application to promote, fill in the [application form](https://atomecho.feishu.cn/share/base/form/shrcnFqpN71OmBoXDCT6y0TQgIc).

[Latest] March 28, 2024: [free community course](https://mp.weixin.qq.com/s/CsturoU1pOX11CqVnZgu2A).

[Latest] March 8, 2024: free APIs opened for everyone, covering the Atom-1B, 7B, and 13B Chinese models. [API usage](https://llama.family/docs/chat-completion-v1)

[Latest] October 8, 2023: added inference acceleration with Tsinghua University's [JittorLLMs](#jittorllms)!

<details>

- September 12, 2023: updated the pre-trained [Atom-7B](https://huggingface.co/FlagAlpha/Atom-7B) and chat [Atom-7B-Chat](https://huggingface.co/FlagAlpha/Atom-7B-Chat) weights; the Chinese pre-training corpus now totals 2.7TB of tokens. Training progress: [llama.family](https://llama.family/)!

- September 2, 2023: added model [pre-training code](#-model-pre-training) and [full-parameter fine-tuning code](#-model-fine-tuning)!

- August 28, 2023: released [Atom-7B](https://huggingface.co/FlagAlpha/Atom-7B), an open LLM continually pre-trained on Chinese from Llama2, with ongoing updates; see the [community WeChat article](https://mp.weixin.qq.com/s/Bdx0JTVh1kgPn5ydYxIkEw)!

- August 26, 2023: provided a FastAPI endpoint setup script!

- August 26, 2023: provided a [conversion script](https://github.com/LlamaFamily/Llama-Chinese/blob/main/scripts/convert2hf/README.md) that converts Meta's original weights to the Hugging Face format!

- August 26, 2023: added the [Code Llama](#official-llama2-models) models!

- August 15, 2023: added a code example for [loading fine-tuned weights with PEFT](#step-4-loading-the-fine-tuned-model)!

- August 14, 2023: the [data-sharing training platform](https://llama.family) went live; you can take part in model training even without compute, and every member's contributed data shapes where the model's abilities go next!

- August 3, 2023: added GPU [inference acceleration](#-inference-acceleration) with FasterTransformer and vLLM!

- July 31, 2023: the first true Llama2 Chinese LLM in China was released! Details in the [community WeChat article](https://mp.weixin.qq.com/s/lExUU7z_MvgJ7tzQPF8tUQ)

- July 28, 2023: Q&A endpoint deployable via [Docker](#quick-start-with-docker)!

- July 27, 2023: added [LangChain](#langchain) support!

- July 26, 2023: added a [4-bit quantized version](#-model-quantization) of the Llama2-13B Chinese fine-tuned weights!

- July 25, 2023: the community WeChat official account "Llama中文社区" is live; follow it for the latest news!

- July 24, 2023: [FlagAlpha](https://huggingface.co/FlagAlpha) added the Llama2-13B Chinese fine-tuned weights!

- July 24, 2023: [llama.family](https://llama.family/) added an online Llama2-70B demo!

- July 23, 2023: the Llama2 Chinese fine-tuned weights were published to the Hugging Face repo [FlagAlpha](https://huggingface.co/FlagAlpha)!

- July 22, 2023: the Llama2 online demo [llama.family](https://llama.family/) went live, with both Meta's original and the Chinese fine-tuned versions!

- July 21, 2023: evaluated the [Chinese Q&A ability](#-model-evaluation) of Meta's original Llama2 Chat models!

- July 21, 2023: added China-hosted download links for the Hugging Face versions of Llama2!

- July 20, 2023: added the [Feishu knowledge base](https://chinesellama.feishu.cn/wiki/space/7257824476874768388?ccm_open_type=lark_wiki_spaceLink); contributions welcome!

- July 20, 2023: China-hosted Llama2 download links are live!

- July 19, 2023: officially started Chinese pre-training of Llama2; follow us for updates!

- July 19, 2023: China-hosted Llama2 downloads are being set up; stay tuned!

- July 19, 2023: the Llama2 Chinese Community is open; welcome aboard!

</details>
### 🤗 Models

#### 🔵 Chinese Pre-trained Model Atom

The **Atom** model is built jointly by the Llama Chinese Community and AtomEcho (原子回声).

| Category | Model Name | 🤗 Model Load Name | Download |
| --------------- | --------------- | ------------------------------ | ------------------------------------------------------------ |
| Pre-trained | Atom-7B | FlagAlpha/Atom-7B | [HuggingFace](https://huggingface.co/FlagAlpha/Atom-7B) \| [ModelScope](https://modelscope.cn/models/FlagAlpha/Atom-7B) \| [WiseModel](https://wisemodel.cn/models/FlagAlpha/Atom-7B) |
| Chat | Atom-7B-Chat | FlagAlpha/Atom-7B-Chat | [HuggingFace](https://huggingface.co/FlagAlpha/Atom-7B-Chat) \| [ModelScope](https://modelscope.cn/models/FlagAlpha/Atom-7B-Chat) \| [WiseModel](https://wisemodel.cn/models/FlagAlpha/Atom-7B-Chat) |

The Atom series includes Atom-13B, Atom-7B, and Atom-1B, continually optimized for Chinese on top of Llama2. Atom-7B and Atom-7B-Chat are fully open source and commercially usable; the weights are available in the [Hugging Face](https://huggingface.co/FlagAlpha) repo (see [Atom-7B downloads](#-chinese-pre-trained-model-atom)). Atom makes the following Chinese-specific optimizations:

- Large-scale Chinese pre-training

Starting from Llama2, Atom is continually pre-trained on large-scale Chinese data covering encyclopedias, books, blogs, news, announcements, fiction, finance, law, medicine, code, academic papers, and Chinese NLP competition datasets; see the 📝 Chinese data table below.

This huge corpus was filtered, scored, and deduplicated, yielding more than 1T tokens of high-quality Chinese data that are continuously fed into training.

- A more efficient Chinese vocabulary

To speed up Chinese text processing, we deeply optimized the Llama2 vocabulary. Based on hundreds of GB of Chinese text, we expanded it to 65,000 tokens; in our tests this improved Chinese encoding/decoding speed by roughly 350%. We also broadened Chinese character coverage, including all emoji 😊, which makes generating text with emoji more efficient.
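
The effect of the expanded vocabulary can be checked directly by comparing how many tokens each tokenizer needs for the same Chinese text. The snippet below is an illustrative sketch, not part of this repo; the two model names are assumptions (the base Llama2 tokenizer versus the Atom tokenizer), and both must be available locally or via the Hub.

```python
# Illustrative sketch (not a repo script): compare tokens-per-character for
# the same Chinese text under the base Llama2 tokenizer and the expanded
# Atom tokenizer. Model names are assumptions; substitute your own.
from transformers import AutoTokenizer

text = "人工智能正在深刻改变我们的生活方式。" * 50  # repeated Chinese sample text

for name in ["meta-llama/Llama-2-7b-hf", "FlagAlpha/Atom-7B"]:
    tok = AutoTokenizer.from_pretrained(name, use_fast=False)
    n_tokens = len(tok(text, add_special_tokens=False).input_ids)
    # fewer tokens per character means cheaper, faster Chinese processing
    print(f"{name}: {n_tokens} tokens, {n_tokens / len(text):.2f} tokens/char")
```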
- Adaptive context extension

Atom supports a 4K context by default; using Position Interpolation (PI) and NTK-aware (Neural Tangent Kernel) scaling, the context can be extended to 32K after fine-tuning.
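
In the Hugging Face transformers stack, NTK-style context extension for Llama-family models is usually exposed through the `rope_scaling` config entry. The following is a minimal sketch under that assumption; the scaling factor (8.0, i.e. 4K to 32K) and the model name are illustrative, and the exact option names can vary across transformers versions.

```python
# Illustrative sketch: dynamic NTK RoPE scaling via transformers' rope_scaling.
# The factor and model name are assumptions for illustration only.
import torch
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "FlagAlpha/Atom-7B-Chat",
    torch_dtype=torch.float16,
    device_map="auto",
    rope_scaling={"type": "dynamic", "factor": 8.0},  # NTK-aware scaling
)
```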
- 📝 Chinese data

We optimize Llama2's Chinese ability with the following data:

| Type | Description |
| ---------------------------------------------------------- | ------------------------------------------------------------ |
| Web data | Public web data, deduplicated and filtered for quality: long-form encyclopedia, book, blog, news, announcement, and fiction text. |
| [Wikipedia](https://github.com/goldsmith/Wikipedia) | Chinese Wikipedia data |
| [WuDao](https://github.com/BAAI-WuDao/Model) | 200GB of data open-sourced by the Chinese WuDao project |
| [CLUE](https://github.com/CLUEbenchmark/CLUEDatasetSearch) | Open Chinese pre-training data from CLUE, cleaned into high-quality long-form Chinese text |
| Competition datasets | About 150 Chinese NLP multi-task competition datasets from recent years |
| [MNBVC](https://github.com/esbatmop/MNBVC) | A cleaned subset of MNBVC |

The community provides the pre-trained Atom-7B and the chat-tuned Atom-7B-Chat weights for download; for training progress, see the community site [llama.family](https://llama.family).

#### Official Llama3 Models

| Category | Model Name | 🤗 Model Load Name | Download |
| ---------- | ---------- | ------------------------- | --------------------- |
| Pre-trained | Llama3-8B | meta-llama/Meta-Llama-3-8B | [HuggingFace](https://huggingface.co/meta-llama/Meta-Llama-3-8B) \| [Baidu Netdisk](https://pan.baidu.com/s/1gBZ7wEn3gC8VRok0Onh9BQ?pwd=8frq) |
| Pre-trained | Llama3-70B | meta-llama/Meta-Llama-3-70B | [HuggingFace](https://huggingface.co/meta-llama/Meta-Llama-3-70B) \| [Baidu Netdisk](https://pan.baidu.com/s/1gBZ7wEn3gC8VRok0Onh9BQ?pwd=8frq) |
| Chat | Llama3-8B-Chat | meta-llama/Meta-Llama-3-8B-Instruct | [HuggingFace](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) \| [Baidu Netdisk](https://pan.baidu.com/s/1gBZ7wEn3gC8VRok0Onh9BQ?pwd=8frq) |
| Chat | Llama3-70B-Chat | meta-llama/Meta-Llama-3-70B-Instruct | [HuggingFace](https://huggingface.co/meta-llama/Meta-Llama-3-70B-Instruct) \| [Baidu Netdisk](https://pan.baidu.com/s/1gBZ7wEn3gC8VRok0Onh9BQ?pwd=8frq) |

#### Llama3 Chinese Fine-tuned Models

| Category | Model Name | 🤗 Model Load Name | Download |
| ---------- | ---------- | ------------------------- | --------------------- |
| Chat | Llama3-Chinese-8B-Instruct | FlagAlpha/Llama3-Chinese-8B-Instruct | [HuggingFace](https://huggingface.co/FlagAlpha/Llama3-Chinese-8B-Instruct) \| [ModelScope](https://modelscope.cn/models/FlagAlpha/Llama3-Chinese-8B-Instruct/summary) \| [WiseModel](https://wisemodel.cn/models/FlagAlpha/Llama3-Chinese-8B-Instruct/file) |


#### Official Llama2 Models

<details>

| Category | Model Name | 🤗 Model Load Name | Download |
| ---------- | ---------- | ------------------------- | --------------------- |
| Pre-trained | Llama2-7B | meta-llama/Llama-2-7b-hf | [HuggingFace](https://huggingface.co/meta-llama/Llama-2-7b-hf) \| [Xunlei Netdisk](https://pan.xunlei.com/s/VN_t0dUikZqOwt-5DZWHuMvqA1?pwd=66ep) |
| Pre-trained | Llama2-13B | meta-llama/Llama-2-13b-hf | [HuggingFace](https://huggingface.co/meta-llama/Llama-2-13b-hf) \| [Xunlei Netdisk](https://pan.xunlei.com/s/VN_yT_9G8xNOz0SDWQ7Mb_GZA1?pwd=yvgf) |
| Pre-trained | Llama2-70B | meta-llama/Llama-2-70b-hf | [HuggingFace](https://huggingface.co/meta-llama/Llama-2-70b-hf) |
| Chat | Llama2-7B-Chat | meta-llama/Llama-2-7b-chat-hf | [HuggingFace](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) \| [Xunlei Netdisk](https://pan.xunlei.com/s/VN_oaV4BpKFgKLto4KgOhBcaA1?pwd=ufir) |
| Chat | Llama2-13B-Chat | meta-llama/Llama-2-13b-chat-hf | [HuggingFace](https://huggingface.co/meta-llama/Llama-2-13b-chat-hf) \| [Xunlei Netdisk](https://pan.xunlei.com/s/VN_yA-9G34NGL9B79b3OQZZGA1?pwd=xqrg) |
| Chat | Llama2-70B-Chat | meta-llama/Llama-2-70b-chat-hf | [HuggingFace](https://huggingface.co/meta-llama/Llama-2-70b-chat-hf) \| [Xunlei Netdisk](https://pan.xunlei.com/s/VNa_vCGzCy3h3N7oeFXs2W1hA1?pwd=uhxh#) |
| Code | CodeLlama-7b | codellama/CodeLlama-7b-hf | [Baidu Netdisk](https://pan.baidu.com/s/1cIPzdNywWLvQI7_2QanOEQ?pwd=zfwi) |
| Code | CodeLlama-7b-Python | codellama/CodeLlama-7b-Python-hf | [Baidu Netdisk](https://pan.baidu.com/s/1liY8klGoDagYbpw-g-oFag?pwd=i952) |
| Code | CodeLlama-7b-Instruct | codellama/CodeLlama-7b-Instruct-hf | [Baidu Netdisk](https://pan.baidu.com/s/108o9_DT2E_vfSGtOnDCQVw?pwd=zkt9) |
| Code | CodeLlama-13b | codellama/CodeLlama-13b-hf | [Baidu Netdisk](https://pan.baidu.com/s/1lLaeHv0XEBv0iiZzI1dpnw?pwd=qn99) |
| Code | CodeLlama-13b-Python | codellama/CodeLlama-13b-Python-hf | [Baidu Netdisk](https://pan.baidu.com/s/1OLVfvZS_oqL3oqMKwsI87w?pwd=a78k) |
| Code | CodeLlama-13b-Instruct | codellama/CodeLlama-13b-Instruct-hf | [Baidu Netdisk](https://pan.baidu.com/s/1HyxJl4w8wElgkZRh2ATrXQ?pwd=seg6) |
| Code | CodeLlama-34b | codellama/CodeLlama-34b-hf | [Baidu Netdisk](https://pan.baidu.com/s/1vEw0pFgIkctPUN4_5_6pIQ?pwd=q8eu) |

Meta released Code Llama on August 24, 2023: Llama2 fine-tuned on code data, in three variants (the base Code Llama, the Python-specialized Code Llama - Python, and the instruction-following Code Llama - Instruct) and three sizes (7B, 13B, 34B). Their capabilities differ as follows:

| Model Family | Model Name | Code Completion | Infilling | Instruction-following |
|-----------------------|------------------------|------|------|------|
| Code Llama | CodeLlama-7b | ✅ | ✅ | ❌ |
| | CodeLlama-13b | ✅ | ✅ | ❌ |
| | CodeLlama-34b | ✅ | ❌ | ❌ |
| Code Llama - Python | CodeLlama-7b-Python | ✅ | ❌ | ❌ |
| | CodeLlama-13b-Python | ✅ | ❌ | ❌ |
| | CodeLlama-34b-Python | ✅ | ❌ | ❌ |
| Code Llama - Instruct | CodeLlama-7b-Instruct | ❌ | ✅ | ✅ |
| | CodeLlama-13b-Instruct | ❌ | ✅ | ✅ |
| | CodeLlama-34b-Instruct | ❌ | ❌ | ✅ |

For details on Code Llama, see the official GitHub repo [codellama](https://github.com/facebookresearch/codellama).

</details>

#### Llama2 Chinese Fine-tuned Models

We fine-tuned the Llama2-Chat models on Chinese instruction data to strengthen their Chinese dialogue ability. Both the LoRA weights and the weights merged with the base model are on [Hugging Face](https://huggingface.co/FlagAlpha), currently for the 7B and 13B models.

| Category | Model Name | 🤗 Model Load Name | Base Model | Download |
| ---------- | ---------- | ------------- | ----------------- | ------------------- |
| Merged weights | Llama2-Chinese-7b-Chat | FlagAlpha/Llama2-Chinese-7b-Chat | meta-llama/Llama-2-7b-chat-hf | [HuggingFace](https://huggingface.co/FlagAlpha/Llama2-Chinese-7b-Chat) |
| Merged weights | Llama2-Chinese-13b-Chat | FlagAlpha/Llama2-Chinese-13b-Chat | meta-llama/Llama-2-13b-chat-hf | [HuggingFace](https://huggingface.co/FlagAlpha/Llama2-Chinese-13b-Chat) |
| LoRA weights | Llama2-Chinese-7b-Chat-LoRA | FlagAlpha/Llama2-Chinese-7b-Chat-LoRA | meta-llama/Llama-2-7b-chat-hf | [HuggingFace](https://huggingface.co/FlagAlpha/Llama2-Chinese-7b-Chat-LoRA) |
| LoRA weights | Llama2-Chinese-13b-Chat-LoRA | FlagAlpha/Llama2-Chinese-13b-Chat-LoRA | meta-llama/Llama-2-13b-chat-hf | [HuggingFace](https://huggingface.co/FlagAlpha/Llama2-Chinese-13b-Chat-LoRA) |


### Community Resources
Rich community resources are an important guarantee for the community's growth. They cover many areas, including but not limited to compute, data, the forum, and applications; developing and using them well gives members more opportunities and support and moves the whole community forward. For more, see [llama.family](https://llama.family/).

<details>

#### 💻 Compute
- Compute resources below market price, usable for all kinds of workloads such as deep-learning training and inference.
- Dedicated online inference services, so members can run model inference quickly and efficiently.
- One-click online fine-tuning, making it easy to adapt models to different tasks and data.

#### 📊 Data
- Open, rich training data across many domains and industries, providing ample support for model training.
- High-quality, diverse datasets for different needs, with data sharing and exchange encouraged so resources are fully used.

#### 💬 Forum
- The community forum gives members an online place to discuss technical questions.
- On the forum, members share experience, ask and answer questions, and collaborate.
- The forum also hosts regular online events and seminars that connect members.

#### 📱 Applications
- Free promotion slots so developers can showcase their applications to community members.
- Promotion support, including publicity and user onboarding, to help applications gain exposure and users.
- Collaboration opportunities for outstanding applications through the community platform, fostering cooperation among developers.

</details>
## 📌 How to Use the Llama Models?


Pick any of the quick-start paths below to begin using the Llama-family models. We recommend the [Chinese pre-trained chat model](#-chinese-pre-trained-model-atom), which supports Chinese much better.


### Quick Start with Anaconda

Step 0: Prerequisites
- Make sure Python 3.10 or later is installed.

Step 1: Prepare the environment

Run the following commands to set up the environment and install the required packages.
```bash
git clone https://github.com/LlamaFamily/Llama-Chinese.git
cd Llama-Chinese
pip install -r requirements.txt
```

Step 2: Download the model

You can download the Atom-7B-Chat model from:
- [HuggingFace](https://huggingface.co/FlagAlpha)
- [ModelScope](https://modelscope.cn/organization/FlagAlpha)
- [WiseModel](https://wisemodel.cn/models/FlagAlpha/Atom-7B-Chat)

Step 3: Run inference

Run inference with the Atom-7B-Chat model.
Create a file named quick_start.py and copy the following into it.
```python
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

device_map = "cuda:0" if torch.cuda.is_available() else "auto"
# Load the chat model in 8-bit with FlashAttention-2 enabled
model = AutoModelForCausalLM.from_pretrained(
    'FlagAlpha/Atom-7B-Chat',
    device_map=device_map,
    torch_dtype=torch.float16,
    load_in_8bit=True,
    trust_remote_code=True,
    use_flash_attention_2=True,
)
model = model.eval()
tokenizer = AutoTokenizer.from_pretrained('FlagAlpha/Atom-7B-Chat', use_fast=False)
tokenizer.pad_token = tokenizer.eos_token
# Atom's dialogue format: "<s>Human: ...\n</s><s>Assistant: "
input_ids = tokenizer(['<s>Human: 介绍一下中国\n</s><s>Assistant: '], return_tensors="pt", add_special_tokens=False).input_ids
if torch.cuda.is_available():
    input_ids = input_ids.to('cuda')
generate_input = {
    "input_ids": input_ids,
    "max_new_tokens": 512,
    "do_sample": True,
    "top_k": 50,
    "top_p": 0.95,
    "temperature": 0.3,
    "repetition_penalty": 1.3,
    "eos_token_id": tokenizer.eos_token_id,
    "bos_token_id": tokenizer.bos_token_id,
    "pad_token_id": tokenizer.pad_token_id,
}
generate_ids = model.generate(**generate_input)
text = tokenizer.decode(generate_ids[0])
print(text)
```

Run quick_start.py:
```bash
python quick_start.py
```
### Quick Start with Docker

Details: [Docker deployment](https://github.com/LlamaFamily/Llama-Chinese/blob/main/docs/chat_gradio_guide.md)

Step 1: Build the docker image and launch [chat_gradio.py](examples/chat_gradio.py) in a container
```bash
git clone https://github.com/LlamaFamily/Llama-Chinese.git

cd Llama-Chinese

docker build -f docker/Dockerfile -t flagalpha/llama2-chinese:gradio .
```

Step 2: Start chat_gradio via docker-compose
```bash
cd Llama-Chinese/docker
docker-compose up -d --build
```

### Quick Start with llama.cpp
Details: [Using llama.cpp](https://github.com/LlamaFamily/Llama-Chinese/blob/main/inference-speed/CPU/ggml/README.md)

### Quick Start with Gradio
A Gradio-based chat UI with streaming output. Copy the command below into a console to run it. It uses the Atom-7B-Chat model; for other models you only need to change the model name passed to model_name_or_path 😊
```bash
python examples/chat_gradio.py --model_name_or_path FlagAlpha/Atom-7B-Chat
```
### Quick Start: Building an API Service
Use FastChat to build an OpenAI-compatible inference API.

<details>
Step 0: Prerequisites

Install fastchat
```bash
pip3 install "fschat[model_worker,webui]"
```
Step 1: Start the RESTful API

Open three consoles and run the following three commands.
- First start the controller
```bash
python3 -m fastchat.serve.controller \
--host localhost \
--port 21001
```

- Start the model worker
```bash
CUDA_VISIBLE_DEVICES="0" python3 -m fastchat.serve.model_worker --model-path /path/Atom-7B-Chat \
--host localhost \
--port 21002 \
--worker-address "http://localhost:21002" \
--limit-worker-concurrency 5 \
--stream-interval 2 \
--gpus "1" \
--load-8bit
```

- Start the RESTful API server
```bash
python3 -m fastchat.serve.openai_api_server \
--host localhost \
--port 21003 \
--controller-address http://localhost:21001
```

Step 2: Test the API

Run the following Python code to test the API deployed above.
```python
# coding=utf-8
import json

import requests


def test_api_server(input_text):
    header = {'Content-Type': 'application/json'}

    data = {
        "messages": [{"role": "system", "content": ""}, {"role": "user", "content": input_text}],
        "temperature": 0.3,
        "top_p": 0.95,
        "max_tokens": 512,
        "model": "LLama2-Chinese-13B",
        "stream": False,
        "n": 1,
        "best_of": 1,
        "presence_penalty": 1.2,
        "frequency_penalty": 0.2,
        "top_k": 50,
        "use_beam_search": False,
        "stop": [],
        "ignore_eos": False,
        "logprobs": None
    }
    response = requests.post(
        url='http://127.0.0.1:21003/v1/chat/completions',
        headers=header,
        data=json.dumps(data).encode('utf-8')
    )

    result = None
    try:
        result = json.loads(response.content)
        print(json.dumps(data, ensure_ascii=False, indent=2))
        print(json.dumps(result, ensure_ascii=False, indent=2))
    except Exception as e:
        print(e)

    return result


if __name__ == "__main__":
    test_api_server("如何去北京?")
```
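
Because the endpoint is OpenAI-compatible, you can also call it with the official `openai` Python SDK instead of raw `requests`. A minimal sketch follows, assuming the openai v1 client and the server started above; the api_key value is a placeholder.

```python
# Illustrative sketch: call the FastChat endpoint with the openai v1 SDK.
# The api_key is a placeholder; FastChat does not check it by default.
from openai import OpenAI

client = OpenAI(base_url="http://127.0.0.1:21003/v1", api_key="EMPTY")

resp = client.chat.completions.create(
    model="LLama2-Chinese-13B",  # must match the model served by the worker
    messages=[{"role": "user", "content": "如何去北京?"}],
    temperature=0.3,
    max_tokens=512,
)
print(resp.choices[0].message.content)
```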

</details>


### Quick Start with ollama

1. First install the ollama tool

Installation instructions: [https://ollama.com](https://ollama.com/)

2. Run Llama3-Chinese-8B-Instruct or Atom-7B-Chat with ollama

Run [Llama3-Chinese-8B-Instruct](https://huggingface.co/FlagAlpha/Llama3-Chinese-8B-Instruct), the Llama3 model fine-tuned for Chinese.

From the command line, run
```bash
ollama run llamafamily/llama3-chinese-8b-instruct
```

Run [Atom-7B-Chat](https://huggingface.co/FlagAlpha/Atom-7B-Chat), the open model continually pre-trained on Chinese from Llama2.

From the command line, run
```bash
ollama run llamafamily/atom-7b-chat
```
## 🤖 Model Pre-training
Although Llama2's pre-training data doubled compared to the first-generation LLaMA, the share of Chinese remains tiny at just 0.13%, which is why the original Llama2 is weak in Chinese. To improve a model's Chinese ability there are two paths, fine-tuning and pre-training:
- Fine-tuning needs little compute and quickly produces a prototype Chinese Llama. The drawback is just as obvious: it can only elicit Chinese ability the base model already has, and since Llama2 saw little Chinese data, that ability is limited; it treats the symptom, not the cause.

- Pre-training on a large Chinese corpus is expensive, requiring both large-scale high-quality Chinese data and large-scale compute. But the advantage is clear: it improves Chinese ability at the model's core, truly treating the cause and injecting strong Chinese capability from the inside out.

We provide the community with pre-training code for the Llama models, plus [Chinese test corpora](https://github.com/LlamaFamily/Llama-Chinese/tree/main/data); for more data, see the Chinese data sources above. Code and configs (a data-packing sketch follows this list):
- Pre-training script: [train/pretrain/pretrain.sh](https://github.com/LlamaFamily/Llama-Chinese/blob/main/train/pretrain/pretrain.sh)
- Pre-training implementation: [train/pretrain/pretrain_clm.py](https://github.com/LlamaFamily/Llama-Chinese/blob/main/train/pretrain/pretrain_clm.py)
- [DeepSpeed](https://github.com/microsoft/DeepSpeed) acceleration:
  - For single-GPU training, use ZeRO-2; see the config at [train/pretrain/ds_config_zero2.json](https://github.com/LlamaFamily/Llama-Chinese/blob/main/train/pretrain/ds_config_zero2.json)
  - For multi-GPU training, use ZeRO-3; see the config at [train/pretrain/ds_config_zero3.json](https://github.com/LlamaFamily/Llama-Chinese/blob/main/train/pretrain/ds_config_zero3.json)
- Training metric: [train/pretrain/accuracy.py](https://github.com/LlamaFamily/Llama-Chinese/blob/main/train/pretrain/accuracy.py)
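
For orientation, the core data step in this kind of causal-LM pre-training is packing tokenized documents into fixed-length blocks. The sketch below is illustrative only; it assumes pretrain_clm.py follows the usual Hugging Face run_clm-style grouping and is not a copy of the repo's code.

```python
# Illustrative sketch: pack tokenized documents into fixed-length blocks for
# causal-LM pre-training (the usual run_clm-style "group_texts" step).
# Assumption: this mirrors what pretrain_clm.py does; consult the repo for specifics.
from itertools import chain

def group_texts(examples, block_size=4096):
    # Concatenate all token id lists, then split into block_size chunks.
    concatenated = list(chain(*examples["input_ids"]))
    total_length = (len(concatenated) // block_size) * block_size
    input_ids = [concatenated[i:i + block_size]
                 for i in range(0, total_length, block_size)]
    # For causal LM training the labels are the inputs themselves;
    # the next-token shift happens inside the model.
    return {"input_ids": input_ids, "labels": [ids[:] for ids in input_ids]}

# Example: two short "documents" packed into blocks of 8 tokens.
print(group_texts({"input_ids": [[1, 2, 3, 4, 5], [6, 7, 8, 9, 10, 11, 12]]}, block_size=8))
```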
## 💡 Model Fine-tuning

This repo provides both LoRA fine-tuning and full-parameter fine-tuning code. For details on LoRA, see the paper "[LoRA: Low-Rank Adaptation of Large Language Models](https://arxiv.org/abs/2106.09685)" and Microsoft's GitHub repo [LoRA](https://github.com/microsoft/LoRA).

### Step 1: Environment Setup

Install the dependencies listed in [requirements.txt](https://github.com/LlamaFamily/Llama-Chinese/blob/main/requirements.txt).

### Step 2: Data Preparation
The data directory contains sample SFT data:
- Training data: [data/train_sft.csv](https://github.com/LlamaFamily/Llama-Chinese/blob/main/data/train_sft.csv)
- Validation data: [data/dev_sft.csv](https://github.com/LlamaFamily/Llama-Chinese/blob/main/data/dev_sft.csv)

Each csv file has a single "text" column with one training example per row. Each example packs the question and answer into the model input in the following format; you can build your own training and validation sets the same way:
```
"<s>Human: "+question+"\n</s><s>Assistant: "+answer+"\n</s>"
```
For example,
```
<s>Human: 用一句话描述地球为什么是独一无二的。</s><s>Assistant: 因为地球是目前为止唯一已知存在生命的行星。</s>
```
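
A small sketch of building such a csv from raw question/answer pairs follows; it is illustrative, not a repo script, and uses only the format shown above.

```python
# Illustrative sketch: turn (question, answer) pairs into a train_sft.csv
# with the single "text" column in the format described above.
import csv

pairs = [
    ("用一句话描述地球为什么是独一无二的。", "因为地球是目前为止唯一已知存在生命的行星。"),
]

with open("train_sft.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["text"])  # the single column the fine-tuning code expects
    for question, answer in pairs:
        writer.writerow([f"<s>Human: {question}\n</s><s>Assistant: {answer}\n</s>"])
```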

### Step 3: Fine-tuning Scripts

#### LoRA Fine-tuning
The LoRA fine-tuning script is [train/sft/finetune_lora.sh](https://github.com/LlamaFamily/Llama-Chinese/blob/main/train/sft/finetune_lora.sh); the implementation is in [train/sft/finetune_clm_lora.py](https://github.com/LlamaFamily/Llama-Chinese/blob/main/train/sft/finetune_clm_lora.py). For single-node multi-GPU fine-tuning, adjust `--include localhost:0` in the script.
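
For context, LoRA freezes the base weights and trains low-rank adapter matrices on selected projection layers. A minimal PEFT configuration sketch is below; the rank, alpha, and target module names are common Llama-style defaults for illustration, not necessarily the values used by finetune_clm_lora.py.

```python
# Illustrative sketch: attach LoRA adapters to a causal LM with PEFT.
# r, lora_alpha, and target_modules are common Llama-style choices,
# not necessarily the values used in this repo's script.
from peft import LoraConfig, get_peft_model
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
lora_config = LoraConfig(
    r=8,                     # rank of the low-rank update matrices
    lora_alpha=32,           # scaling factor for the update
    lora_dropout=0.05,
    target_modules=["q_proj", "v_proj"],  # attention projections to adapt
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()  # only the adapters are trainable
```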

#### Full-parameter Fine-tuning
The full-parameter fine-tuning script is [train/sft/finetune.sh](https://github.com/LlamaFamily/Llama-Chinese/blob/main/train/sft/finetune.sh); the implementation is in [train/sft/finetune_clm.py](https://github.com/LlamaFamily/Llama-Chinese/blob/main/train/sft/finetune_clm.py).


### Step 4: Loading the Fine-tuned Model

#### LoRA Fine-tuning
The LoRA fine-tuned weights are listed under [Llama2 Chinese Fine-tuned Models](#llama2-chinese-fine-tuned-models); LoRA weights must be used together with the base model weights.

Load the pre-trained and fine-tuned weights with [PEFT](https://github.com/huggingface/peft). In the example below, base_model_name_or_path is the path of the pre-trained weights and finetune_model_path is the path of the fine-tuned weights.

```python
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel, PeftConfig

# e.g. finetune_model_path='FlagAlpha/Llama2-Chinese-7b-Chat-LoRA'
finetune_model_path = ''
config = PeftConfig.from_pretrained(finetune_model_path)
# e.g. base_model_name_or_path='meta-llama/Llama-2-7b-chat'
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path, use_fast=False)
tokenizer.pad_token = tokenizer.eos_token
device_map = "cuda:0" if torch.cuda.is_available() else "auto"
model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    device_map=device_map,
    torch_dtype=torch.float16,
    load_in_8bit=True,
    trust_remote_code=True,
    use_flash_attention_2=True,
)
# Attach the LoRA adapter weights on top of the base model
model = PeftModel.from_pretrained(model, finetune_model_path, device_map={"": 0})
model = model.eval()
input_ids = tokenizer(['<s>Human: 介绍一下北京\n</s><s>Assistant: '], return_tensors="pt", add_special_tokens=False).input_ids
if torch.cuda.is_available():
    input_ids = input_ids.to('cuda')
generate_input = {
    "input_ids": input_ids,
    "max_new_tokens": 512,
    "do_sample": True,
    "top_k": 50,
    "top_p": 0.95,
    "temperature": 0.3,
    "repetition_penalty": 1.3,
    "eos_token_id": tokenizer.eos_token_id,
    "bos_token_id": tokenizer.bos_token_id,
    "pad_token_id": tokenizer.pad_token_id,
}
generate_ids = model.generate(**generate_input)
text = tokenizer.decode(generate_ids[0])
print(text)
```

#### Full-parameter Fine-tuning
For fully fine-tuned models, loading works the same as the inference example in [Quick Start with Anaconda](#quick-start-with-anaconda); just change the model name or checkpoint path.
## 🍄 Model Quantization
We quantized the Chinese fine-tuned weights so they can run with less compute. A 4-bit version of the 13B Chinese fine-tuned model [FlagAlpha/Llama2-Chinese-13b-Chat](https://huggingface.co/FlagAlpha/Llama2-Chinese-13b-Chat) is available on [Hugging Face](https://huggingface.co/FlagAlpha) as [FlagAlpha/Llama2-Chinese-13b-Chat-4bit](https://huggingface.co/FlagAlpha/Llama2-Chinese-13b-Chat-4bit). Use it as follows:

Environment setup:
```bash
pip install git+https://github.com/PanQiWei/AutoGPTQ.git
```

```python
from transformers import AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM

# Load the GPTQ 4-bit quantized model
model = AutoGPTQForCausalLM.from_quantized('FlagAlpha/Llama2-Chinese-13b-Chat-4bit', device="cuda:0")
tokenizer = AutoTokenizer.from_pretrained('FlagAlpha/Llama2-Chinese-13b-Chat-4bit', use_fast=False)
input_ids = tokenizer(['<s>Human: 怎么登上火星\n</s><s>Assistant: '], return_tensors="pt", add_special_tokens=False).input_ids.to('cuda')
generate_input = {
    "input_ids": input_ids,
    "max_new_tokens": 512,
    "do_sample": True,
    "top_k": 50,
    "top_p": 0.95,
    "temperature": 0.3,
    "repetition_penalty": 1.3,
    "eos_token_id": tokenizer.eos_token_id,
    "bos_token_id": tokenizer.bos_token_id,
    "pad_token_id": tokenizer.pad_token_id,
}
generate_ids = model.generate(**generate_input)
text = tokenizer.decode(generate_ids[0])
print(text)
```
## 🚀 Inference Acceleration
As model sizes keep growing, speeding up inference under limited compute has become an important research direction. Common inference acceleration frameworks include lmdeploy, TensorRT-LLM, vLLM, and JittorLLMs.

### TensorRT-LLM
[TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM/tree/main) is a high-performance inference framework developed by NVIDIA.

Detailed inference docs: [inference-speed/GPU/TensorRT-LLM_example](https://github.com/LlamaFamily/Llama-Chinese/tree/main/inference-speed/GPU/TensorRT-LLM_example)

### vLLM
[vLLM](https://github.com/vllm-project/vllm) was developed at UC Berkeley. Its core technique is PagedAttention, and it reports up to 24x the throughput of HuggingFace Transformers. Compared with FasterTransformer, vLLM is simpler to use: no extra model conversion is needed, and fp16 inference is supported.

Detailed inference docs: [inference-speed/GPU/vllm_example](https://github.com/LlamaFamily/Llama-Chinese/blob/main/inference-speed/GPU/vllm_example/README.md)
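
As a taste of the API, a minimal offline-inference sketch with vLLM follows; the model name is illustrative, and the repo's vllm_example covers the full server setup.

```python
# Illustrative sketch: minimal offline batch inference with vLLM.
# The model name is an example; any HF-format Llama checkpoint works.
from vllm import LLM, SamplingParams

llm = LLM(model="FlagAlpha/Atom-7B-Chat", dtype="float16")
params = SamplingParams(temperature=0.3, top_p=0.95, max_tokens=512)

outputs = llm.generate(["<s>Human: 介绍一下北京\n</s><s>Assistant: "], params)
for out in outputs:
    print(out.outputs[0].text)
```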

### JittorLLMs
[JittorLLMs](https://github.com/Jittor/JittorLLMs) is led by Fitten Tech (非十科技) in collaboration with Tsinghua University's Visual Media Research Center. A dynamic swap mechanism cuts hardware requirements sharply (by 80%); with the Jittor framework's zero-copy technique, model-loading overhead drops 40% versus PyTorch; and meta-operator auto-compilation improves compute performance by over 20%.

Detailed inference docs: [inference-speed/GPU/JittorLLMs](https://github.com/LlamaFamily/Llama-Chinese/blob/main/inference-speed/GPU/JittorLLMs_example/README.md)

### lmdeploy
[lmdeploy](https://github.com/InternLM/lmdeploy/) was developed by the Shanghai AI Laboratory. Inference runs in C++/CUDA, with Python/gRPC/HTTP interfaces and a WebUI; it supports tensor-parallel distributed inference as well as fp16, weight-only int4, and kv-cache int8 quantization.

Detailed inference docs: [inference-speed/GPU/lmdeploy_example](https://github.com/LlamaFamily/Llama-Chinese/tree/main/inference-speed/GPU/lmdeploy_example)
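
For a flavor of the Python side, recent lmdeploy releases expose a high-level `pipeline` API; a minimal sketch is below. The model name is illustrative, and the exact API can differ by version; treat the lmdeploy_example docs as authoritative.

```python
# Illustrative sketch: lmdeploy's high-level pipeline API (recent versions).
# The model name is an example; see the repo's lmdeploy_example for details.
from lmdeploy import pipeline

pipe = pipeline("FlagAlpha/Atom-7B-Chat")
responses = pipe(["介绍一下北京"])
print(responses[0].text)
```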
## 💪 Extension Capabilities

Beyond continually strengthening the model's internal knowledge, general understanding, logical reasoning, and imagination, we will also keep extending its external capabilities, such as knowledge-base retrieval, computation tools, WolframAlpha, and software control.
We start by integrating the LangChain framework, which makes it easier to build document retrieval, Q&A bots, and agent applications on top of Llama2; for more on LangChain, see [LangChain](https://github.com/langchain-ai/langchain).

### LangChain
The Llama2 LLM class wrapping LangChain is in [examples/llama2_for_langchain.py](https://github.com/LlamaFamily/Llama-Chinese/blob/main/examples/llama2_for_langchain.py). A simple usage example:
```python
from llama2_for_langchain import Llama2

# Using FlagAlpha/Atom-7B-Chat as an example
llm = Llama2(model_name_or_path='FlagAlpha/Atom-7B-Chat')

while True:
    human_input = input("Human: ")
    response = llm(human_input)
    print(f"Llama2: {response}")
```
## 🥇 Model Evaluation

### Llama2 vs. Llama3 Comparison
Base model comparison
<p align="center" width="100%">
<img src="./assets/base_eval.png" style="width: 100%; display: block; margin: auto;">
</p>
Fine-tuned model comparison
<p align="center" width="100%">
<img src="./assets/tuned_eval.png" style="width: 100%; display: block; margin: auto;">
</p>

### Llama3 Evaluation
<p align="center" width="100%">
<img src="./assets/llama3_eval.png" style="width: 100%; display: block; margin: auto;">
</p>

### Llama2 Evaluation
<p align="center" width="100%">
<img src="./assets/llama_eval.jpeg" style="width: 100%; display: block; margin: auto;">
</p>

To get a clearer picture of Llama2's Chinese Q&A ability, we selected a set of representative Chinese questions and put them to the models. We tested Meta's public Llama2-7B-Chat and Llama2-13B-Chat, with no fine-tuning or extra training. The 95 test questions were drawn from [AtomBulb](https://github.com/AtomEcho/AtomBulb) and span eight categories: general knowledge, language understanding, creative writing, logical reasoning, coding, job skills, tool use, and personality.

The prompt used in testing is shown below (kept verbatim), here for the question "列出5种可以改善睡眠质量的方法":
```
[INST]
<<SYS>>
You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. The answer always been translate into Chinese language.

If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.

The answer always been translate into Chinese language.
<</SYS>>

列出5种可以改善睡眠质量的方法
[/INST]
```
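
A helper that wraps a question into exactly this template might look like the sketch below (illustrative, not a repo script; the system text is the one shown above, abbreviated here).

```python
# Illustrative sketch: wrap a user question in the Llama2 [INST]<<SYS>> template
# used for the evaluation above.
SYSTEM = (
    "You are a helpful, respectful and honest assistant. Always answer as "
    "helpfully as possible, while being safe. ... "  # full system text as shown above
    "The answer always been translate into Chinese language."
)

def build_prompt(question: str) -> str:
    return f"[INST]\n<<SYS>>\n{SYSTEM}\n<</SYS>>\n\n{question}\n[/INST]"

print(build_prompt("列出5种可以改善睡眠质量的方法"))
```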
The Llama2-7B-Chat results are in [meta_eval_7B.md](assets/meta_eval_7B.md) and the Llama2-13B-Chat results in [meta_eval_13B.md](assets/meta_eval_13B.md).

Testing shows that Meta's original Llama2 Chat models align poorly for Chinese Q&A: most of the time they cannot answer in Chinese, or answer in a mix of Chinese and English. Training and fine-tuning Llama2 on Chinese data is therefore essential.
## 📖 Learning Center

### Official Documentation
Official docs for the whole Meta Llama family: https://llama.meta.com/docs/get-started

### Llama3
[Complete Llama3 study materials](https://chinesellama.feishu.cn/wiki/XBKPwbhWriWCfrkmJhfcrS9Rnqc?fromScene=spaceOverview)

Official Llama3 link: https://llama.meta.com/llama3

### Llama2

#### Meta's Official Introduction to [Llama2](https://ai.meta.com/llama)
Since Meta released the first-generation LLaMA, the llama model family has flourished. Meta's recent Llama2 release is open for commercial use and brings major updates to both the models and their quality. Llama2 ships in 7B, 13B, and 70B sizes. Compared with LLaMA, Llama2 was trained on 2 trillion tokens, and the context length doubled from 2048 to 4096, so it can understand and generate longer text. The Llama2 Chat models were fine-tuned on 1 million human-labeled examples and approach ChatGPT-level quality for English dialogue.

### Llama-related Papers
* [LLaMA: Open and Efficient Foundation Language Models](https://arxiv.org/abs/2302.13971)
* [Llama 2: Open Foundation and Fine-Tuned Chat Models](https://arxiv.org/abs/2307.09288)
* [Code Llama: Open Foundation Models for Code](https://ai.meta.com/research/publications/code-llama-open-foundation-models-for-code/)
## 📌 Others

### 🎉 Acknowledgments

Thanks to the [AtomEcho](https://github.com/AtomEcho) (原子回声) team for their technical and resource support!

Thanks to the [Coremesh](https://coremesh.net) (芯格) team for their technical and resource support!

Thanks to [Fuzhou Liantian Education Technology](http://www.3class.cc) for contributing to the Llama Chinese Community!

Thanks to the @Z Potentials community for supporting the Llama Chinese Community!

### 🤔 Feedback

For questions, please file a GitHub Issue; before filing, check whether an existing issue already answers your question.

Please ask questions politely and help build a friendly discussion community.

Join the [Feishu knowledge base](https://chinesellama.feishu.cn/wiki/space/7257824476874768388?ccm_open_type=lark_wiki_spaceLink) to co-author the community docs.

Join the WeChat group discussion 😍😍

<p align="center" width="100%">
<img src="./assets/wechat.jpeg" alt="Wechat" style="width: 100%; display: block; margin: auto;">
</p>

<p align="center" width="100%">
<img src="https://api.star-history.com/svg?repos=LlamaFamily/Llama-Chinese&type=Date" alt="Star" style="width: 100%; display: block; margin: auto;">
</p>
Llama-Chinese/README_EN.md
ADDED
@@ -0,0 +1,736 @@
<p align="left">
|
| 2 |
+
English | <a href="README.md">中文</a>
|
| 3 |
+
</p>
|
| 4 |
+
<br>
|
| 5 |
+
|
| 6 |
+
<h1 align="center">
|
| 7 |
+
Llama-Chinese
|
| 8 |
+
</h1>
|
| 9 |
+
<p align="center" width="100%">
|
| 10 |
+
<img src="assets/llama.png" alt="Llama" style="width: 20%; display: block; margin: auto;"></a>
|
| 11 |
+
</p>
|
| 12 |
+
<p align="center">
|
| 13 |
+
<font face="黑体" color=orange size="6"> The Best Chinese Llama Large Language Model </font>
|
| 14 |
+
</p>
|
| 15 |
+
|
| 16 |
+
<p align="center">
|
| 17 |
+
🤗 <a href="https://huggingface.co/FlagAlpha" target="_blank">Hugging Face</a> • 🤖 <a href="https://www.modelscope.cn/organization/FlagAlpha/" target="_blank">ModelScope</a> • ✡️ <a href="https://wisemodel.cn/models/FlagAlpha/Atom-7B-Chat" target="_blank">WiseModel</a>
|
| 18 |
+
</p>
|
| 19 |
+
|
| 20 |
+
<p align="center">
|
| 21 |
+
<a href="https://llama.family">Online(Including Llama2, Llama3): llama.family</a>
|
| 22 |
+
</p>
|
| 23 |
+
<p align="center">
|
| 24 |
+
<a href="https://huggingface.co/FlagAlpha/Atom-7B-Chat">Open-source Chinese Pre-trained LLM Atom based on Llama2</a>
|
| 25 |
+
</p>
|
| 26 |
+
|
| 27 |
+
</br></br>
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
## 🗂️ Content Guide
|
| 31 |
+
- [📌 Chinese Llama Community](#-chinese-llama-community)
|
| 32 |
+
* [🔥 Community Introduction: Chinese Llama Community](#-community-introduction-chinese-llama-community)
|
| 33 |
+
* [📢 Community Announcements](#-community-announcements)
|
| 34 |
+
* [🤗 LLM Model](#-models-downloads)
|
| 35 |
+
+ [🤗 Pre-trained Chinese Model Atom based on Llama2](#-atom-models)
|
| 36 |
+
+ [🤗 Meta Official Llama2 Model](#meta-official-llama2-models)
|
| 37 |
+
+ [🤗 Fine-tuned Chinese Models based on Llama2](#fine-tuned-chinese-models-based-on-llama2)
|
| 38 |
+
* [🌟 Community Source](#-community-source)
|
| 39 |
+
+ [GPU Source](#-gpu-source)
|
| 40 |
+
+ [Data Source](#-data-source)
|
| 41 |
+
+ [Discussion](#-discussion)
|
| 42 |
+
+ [Product](#-product)
|
| 43 |
+
|
| 44 |
+
- [📌 How to use Llama Model?](#-how-to-use-llama-model)
|
| 45 |
+
* [Setup of Llama3](#setup-of-llama3)
|
| 46 |
+
* [Setup of Llama2](#setup-of-llama2)
|
| 47 |
+
+ [Simple Setup](#simple-setup)
|
| 48 |
+
- [Simple Setup-Anaconda](#simple-setup-anaconda)
|
| 49 |
+
- [Simple Setup-Docker](#simple-setup-docker)
|
| 50 |
+
- [Simple Setup-llama.cpp](#simple-setup-llamacpp)
|
| 51 |
+
- [Simple Setup-gradio](#simple-setup-gradio)
|
| 52 |
+
- [Simple Setup-API](#simple-setup-api)
|
| 53 |
+
+ [🤖 Model Pretraining](#-model-pretraining)
|
| 54 |
+
+ [💡 Model Fine-tuning](#-model-fine-tuning)
|
| 55 |
+
- [Step1: Environment Setup](#step1-environment-setup)
|
| 56 |
+
- [Step2: Data Preparation](#step2-data-preparation)
|
| 57 |
+
- [Step3: Fine-tuning Scripts](#step3-fine-tuning-script)
|
| 58 |
+
- [LoRA Fine-tuning](#lora-fine-tuning)
|
| 59 |
+
- [Full-parameter Fine-tuning](#full-parameter-fine-tuning)
|
| 60 |
+
- [Step4: Load Fine-tuned Model](#step4-load-fine-tuned-model)
|
| 61 |
+
- [LoRA Fine-tuning](#lora-fine-tuning-1)
|
| 62 |
+
- [Full-parameter Fine-tuning](#full-parameter-fine-tuning-1)
|
| 63 |
+
+ [🍄 Model Quantization](#-model-quantization)
|
| 64 |
+
+ [🚀 Inference Acceleration](#-inference-acceleration)
|
| 65 |
+
- [TensorRT-LLM](#tensorrt-llm)
|
| 66 |
+
- [vLLM](#vllm)
|
| 67 |
+
- [JittorLLMs](#jittorllms)
|
| 68 |
+
- [lmdeploy](#lmdeploy)
|
| 69 |
+
+ [💪 Extension Capabilities](#-extension-capabilities)
|
| 70 |
+
- [LangChain](#langchain)
|
| 71 |
+
* [🥇 Model Evaluation](#-model-evaluation)
|
| 72 |
+
* [📖 Learning Resources](#-learning-resources)
|
| 73 |
+
+ [Llama3](#llama3)
|
| 74 |
+
+ [Llama2](#llama2)
|
| 75 |
+
- [Meta Official Introduction to Llama2](#meta-official-introduction-to-llama2)
|
| 76 |
+
- [Llama-related Papers](#llama-related-papers)
|
| 77 |
+
- [Llama2 Evaluation Results](#llama2-evaluation-results)
|
| 78 |
+
|
| 79 |
+
- [📌 Others](#-others)
|
| 80 |
+
* [🎉 Acknowledgments](#-acknowledgments)
|
| 81 |
+
* [🤔 Issue Feedback](#-issue-feedback)
|
| 82 |
+
|
| 83 |
+
## 📌 Chinese Llama Community
|
| 84 |
+
|
| 85 |
+
### 🔥 Community Introduction: Chinese Llama Community
|
| 86 |
+
|
| 87 |
+
Welcome to the Chinese Llama Community! We are a technical community dedicated to optimizing and building on top of the Llama model for Chinese applications.
|
| 88 |
+
**\*Based on large-scale Chinese data, we pre-train from the Llama2 base and continuously upgrade its Chinese capabilities\***.
|
| 89 |
+
We warmly welcome developers and researchers passionate about LLM models to join our community.
|
| 90 |
+
|
| 91 |
+
<details lang="en">
|
| 92 |
+
|
| 93 |
+
#### Why Choose the Chinese Llama Community?
|
| 94 |
+
🚀 **Support from a Team of Senior Engineers**: The community has a dedicated team of senior NLP engineers who provide strong technical support and draw on rich experience to guide and assist you.
|
| 95 |
+
|
| 96 |
+
🎯 **Chinese Optimization**: We focus on optimizing Llama2 for Chinese processing, exploring the best practices for Chinese to enhance its performance and adaptability.
|
| 97 |
+
|
| 98 |
+
💡 **Innovative Exchange**: Our community includes a creative and experienced team of members who organize regular online events, technical discussions, and experience sharing to promote innovative exchanges.
|
| 99 |
+
|
| 100 |
+
🌐 **Global Connectivity**: We welcome developers from around the world to join the community, creating an open and diverse platform for learning and communication.
|
| 101 |
+
|
| 102 |
+
🤝 **Open Sharing**: We encourage community members to open-source and share code and models, promoting collaborative win-win efforts and advancing the development of Chinese NLP technology.
|
| 103 |
+
|
| 104 |
+
#### Community Activities
|
| 105 |
+
🗓️ **Online Lectures**: Inviting industry experts to conduct online lectures, sharing the latest technology and applications of Llama2 in the Chinese NLP field, and discussing cutting-edge research results.
|
| 106 |
+
|
| 107 |
+
💻 **Project Showcase**: Members can showcase their project achievements in Llama2 Chinese optimization, receive feedback and suggestions, and promote project collaboration.
|
| 108 |
+
|
| 109 |
+
📚 **Learning Resources**: The community maintains a rich library of learning materials, including tutorials, documentation, and paper interpretations, providing comprehensive learning support to members.
|
| 110 |
+
|
| 111 |
+
📝 **Paper Interpretation**: Community members collectively interpret the latest research papers related to Llama2, delving into advanced algorithms and methods.
|
| 112 |
+
|
| 113 |
+
🎉 **Themed Events**: Regularly organize various themed events, including challenges, hackathons, and technical salons, allowing community members to exchange and learn in a relaxed and enjoyable atmosphere.
|
| 114 |
+
|
| 115 |
+
🌟 **Reward Program**: We have established a reward program to honor and reward members who actively participate and contribute outstanding work to the community, motivating more outstanding talents to join.
|
| 116 |
+
|
| 117 |
+
📈 **Technical Consultation**: We provide technical consulting services to answer your questions and help you overcome challenges in the development and optimization of Llama2.
|
| 118 |
+
|
| 119 |
+
🚀 **Project Collaboration**: Encourage collaboration between members on projects to explore the potential of Llama2 in practical applications and create innovative solutions.
|
| 120 |
+
|
| 121 |
+
#### Join Us Now!
|
| 122 |
+
📚 **Vision**: Whether you are a professional developer or researcher with experience in Llama2 or a newcomer interested in optimizing Llama2 for Chinese, we eagerly look forward to your joining. In the Chinese Llama Community, you will have the opportunity to exchange ideas with top talents in the industry, work together to advance Chinese NLP technology, and create a brighter technological future!
|
| 123 |
+
|
| 124 |
+
🔗 **Friendly Reminder**: This community is a platform for professional technical exchange. We earnestly hope that like-minded developers and researchers join us. Please adhere to the community guidelines, maintain a positive learning atmosphere, and any content and advertisements unrelated to Llama2 will be removed. Thank you for your understanding and support!
|
| 125 |
+
|
| 126 |
+
</details>
|
| 127 |
+
|
| 128 |
+
### 📢 Community Announcements
|
| 129 |
+
|
| 130 |
+
【Latest】October 8, 2023: Added inference acceleration support for Tsinghua University's [JittorLLMs](#jittorllms)!
|
| 131 |
+
|
| 132 |
+
【Latest】September 12, 2023: Updated the model parameters of the pre-trained [Atom-7B](https://huggingface.co/FlagAlpha/Atom-7B) and the dialogue model [Atom-7B-Chat](https://huggingface.co/FlagAlpha/Atom-7B-Chat). The Chinese pre-training corpus now totals 100 billion tokens; training progress can be tracked at [llama.family](https://llama.family/)!
|
| 133 |
+
|
| 134 |
+
【Latest】September 2, 2023: Added [pre-training code](#-model-pretraining) and [full-parameter fine-tuning code](#-model-fine-tuning)!
|
| 135 |
+
|
| 136 |
+
【Latest】August 28, 2023: Released the open-source large model [Atom-7B](https://huggingface.co/FlagAlpha/Atom-7B) based on Llama2 for Chinese pre-training and will continue to be updated. Details can be found in the [community article](https://mp.weixin.qq.com/s/Bdx0JTVh1kgPn5ydYxIkEw)!
|
| 137 |
+
|
| 138 |
+
【Latest】August 26, 2023: Provided [FastAPI](#fastapi-interface-setup) interface setup script!
|
| 139 |
+
|
| 140 |
+
【Latest】August 26, 2023: Provided a script to convert Meta official model parameters to a format compatible with Hugging Face [Format Conversion Script](https://github.com/LlamaFamily/Llama-Chinese/blob/main/scripts/convert2hf/README.md)!
|
| 141 |
+
|
| 142 |
+
【Latest】August 26, 2023: Added [Code Llama](#-code-model) model!
|
| 143 |
+
|
| 144 |
+
<details lang="en">
|
| 145 |
+
|
| 146 |
+
- August 15, 2023: Added [PEFT load fine-tuning model parameters](#load-fine-tuned-model) code example!
|
| 147 |
+
|
| 148 |
+
- August 14, 2023: Launched the [large model data sharing training platform](https://llama.family), allowing everyone to contribute to large model training, even without computing resources. The data contributed by each community member will determine the future capabilities of the model!
|
| 149 |
+
|
| 150 |
+
- August 3, 2023: Added GPU [inference acceleration](#-inference-acceleration) support for FasterTransformer and vLLM!
|
| 151 |
+
|
| 152 |
+
- July 31, 2023: 【Major】The first truly meaningful Llama2 Chinese large model is released! Details can be found in the [community article](https://mp.weixin.qq.com/s/lExUU7z_MvgJ7tzQPF8tUQ)
|
| 153 |
+
|
| 154 |
+
- July 28, 2023: Deployed a Q&A interface through [Docker](#docker-deployment-of-qa-interface)!
|
| 155 |
+
|
| 156 |
+
- July 27, 2023: Added [LangChain](#langchain) support!
|
| 157 |
+
|
| 158 |
+
- July 26, 2023: Released a [4-bit quantized compressed version](#-model-quantization) of the Llama2-13B Chinese fine-tuning parameters!
|
| 159 |
+
|
| 160 |
+
- July 25, 2023: The community's WeChat public account "Llama Chinese Community" is now live. Feel free to follow for the latest updates and dynamics!
|
| 161 |
+
|
| 162 |
+
- July 24, 2023: [FlagAlpha](https://huggingface.co/FlagAlpha) added Llama2-13B Chinese fine-tuned parameters!
|
| 163 |
+
|
| 164 |
+
- July 24, 2023: [llama.family](https://llama.family/) added Llama2-70B online experience!
|
| 165 |
+
|
| 166 |
+
- July 23, 2023: Released Llama2-13B Chinese fine-tuned parameters to the Hugging Face repository [FlagAlpha](https://huggingface.co/FlagAlpha)!
|
| 167 |
+
|
| 168 |
+
- July 22, 2023: Llama2 online experience link [llama.family](https://llama.family/) is live, including both Meta original and Chinese fine-tuned versions!
|
| 169 |
+
|
| 170 |
+
- July 21, 2023: Evaluated the Chinese Q&A capability of the Meta original Llama2 Chat model [Model Evaluation](#-model-evaluation)!
|
| 171 |
+
|
| 172 |
+
- July 21, 2023: Added the Hugging Face version download link for Llama2 models in China!
|
| 173 |
+
|
| 174 |
+
- July 20, 2023: Added [Feishu Knowledge Base Documentation](https://chinesellama.feishu.cn/wiki/space/7257824476874768388?ccm_open_type=lark_wiki_spaceLink), welcome everyone to contribute!
|
| 175 |
+
|
| 176 |
+
- July 20, 2023: Chinese Llama2 latest download links are live!
|
| 177 |
+
|
| 178 |
+
- July 19, 2023: Officially launched the Llama2 Chinese community, stay tuned for real-time updates!
|
| 179 |
+
|
| 180 |
+
- July 19, 2023: Chinese Llama2 latest download links are in progress, stay tuned!
|
| 181 |
+
|
| 182 |
+
- July 19, 2023: Launched the Llama2 Chinese community, welcome everyone to join!
|
| 183 |
+
|
| 184 |
+
</details>
|
| 185 |
+
|
| 186 |
+
### 🤗 Models Downloads
|
| 187 |
+
|
| 188 |
+
#### 🔵 Atom Models
|
| 189 |
+
|
| 190 |
+
The Atom models, built jointly by the Chinese Llama Community and AtomEcho, ranked in the top ten of the C-Eval Chinese large language model leaderboard (August 21 submission).
|
| 191 |
+
<p align="center" width="100%">
|
| 192 |
+
<img src="./assets/ceval.jpg" alt="ceval" style="width: 100%; display: block; margin: auto;">
|
| 193 |
+
</p>
|
| 194 |
+
|
| 195 |
+
|
| 196 |
+
| Category | Model Name | 🤗Model Loading Name | Download Link |
|
| 197 |
+
| --------------- | --------------- | ------------------------------ | ------------------------------------------------------------ |
|
| 198 |
+
| Pretrained | Atom-7B | FlagAlpha/Atom-7B | [HuggingFace](https://huggingface.co/FlagAlpha/Atom-7B) \| [ModelScope](https://modelscope.cn/models/FlagAlpha/Atom-7B) \| [WiseModel](https://wisemodel.cn/models/FlagAlpha/Atom-7B) |
|
| 199 |
+
| Chat | Atom-7B-Chat | FlagAlpha/Atom-7B-Chat | [HuggingFace](https://huggingface.co/FlagAlpha/Atom-7B-Chat) \| [ModelScope](https://modelscope.cn/models/FlagAlpha/Atom-7B-Chat) \| [WiseModel](https://wisemodel.cn/models/FlagAlpha/Atom-7B-Chat) |
|
| 200 |
+
|
| 201 |
+
|
| 202 |
+
The Atom series includes Atom-1B, Atom-7B, and Atom-13B, continuously optimized for Chinese on top of Llama2. Atom-7B and Atom-7B-Chat are fully open source and available for commercial use; the models can be obtained from the [Hugging Face](https://huggingface.co/FlagAlpha) repository. See [Atom Models](#-atom-models) for download details.
|
| 203 |
+
|
| 204 |
+
Atom models have the following optimizations for Chinese:
|
| 205 |
+
|
| 206 |
+
###### Large-scale Chinese Data Pretraining
|
| 207 |
+
|
| 208 |
+
Atom models are continually pretrained on a large amount of Chinese data, including encyclopedias, books, blogs, news, announcements, novels, financial data, legal data, medical data, code, professional papers, and Chinese natural language processing competition datasets. See [📝 Chinese Data](#-chinese-data) for details.
|
| 209 |
+
|
| 210 |
+
The massive data is filtered, scored, and deduplicated, resulting in high-quality Chinese data exceeding 1T tokens, continuously added to the training iterations.
|
| 211 |
+
|
| 212 |
+
###### More Efficient Chinese Vocabulary
|
| 213 |
+
|
| 214 |
+
To improve the efficiency of Chinese text processing, we optimized the Llama2 tokenizer. Starting from the model's original vocabulary and several hundred gigabytes of Chinese text, we expanded the vocabulary to 65,000 tokens. In our tests, this improved Chinese encoding/decoding speed by roughly 350%. We also broadened the coverage of the Chinese character set, including all emoji symbols 😊, which makes generating text containing emoji more efficient.
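As a rough illustration (a minimal sketch, assuming you have local access to both tokenizers; the Llama2 tokenizer is gated on Hugging Face), you can compare how many tokens each tokenizer needs for the same Chinese sentence:

```python
# Hedged sketch: compare token counts of the original Llama2 tokenizer and the
# expanded Atom tokenizer on the same Chinese sentence. Fewer tokens for the
# same text means faster, cheaper encoding and decoding.
from transformers import AutoTokenizer

text = "大语言模型正在改变自然语言处理的研究和应用。"
for name in ["meta-llama/Llama-2-7b-hf", "FlagAlpha/Atom-7B"]:
    tok = AutoTokenizer.from_pretrained(name, use_fast=False)
    n = len(tok(text, add_special_tokens=False).input_ids)
    print(f"{name}: {n} tokens")
```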
|
| 215 |
+
|
| 216 |
+
###### Adaptive Context Expansion
|
| 217 |
+
|
| 218 |
+
Atom models support a 4K context by default. Using position interpolation (PI) and NTK-aware (Neural Tangent Kernel) scaling, the context length can be extended to 32K after fine-tuning.
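For a sense of what NTK-style extension looks like in practice, here is a hedged sketch using the `rope_scaling` argument that `transformers` exposes for Llama-family models; the factor of 8 (stretching the 4K base context toward 32K) is an illustrative assumption, not the repository's official configuration:

```python
# Hedged sketch: load the model with dynamic NTK RoPE scaling to extend the
# usable context window. factor=8.0 is illustrative (4K base * 8 -> ~32K).
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "FlagAlpha/Atom-7B-Chat",
    rope_scaling={"type": "dynamic", "factor": 8.0},  # illustrative values
    trust_remote_code=True,
)
```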
|
| 219 |
+
|
| 220 |
+
###### 📝 Chinese Data
|
| 221 |
+
|
| 222 |
+
We optimized the Chinese capabilities of Llama2 using the following data:
|
| 223 |
+
|
| 224 |
+
| Type | Description |
|
| 225 |
+
| ---------------------------------------------------------- | ------------------------------------------------------------ |
|
| 226 |
+
| Web Data | Publicly available web data on the Internet, selecting deduplicated high-quality Chinese data involving encyclopedias, books, blogs, news, announcements, novels, etc. |
|
| 227 |
+
| [Wikipedia](https://github.com/goldsmith/Wikipedia) | Chinese Wikipedia data |
|
| 228 |
+
| [Wudao](https://github.com/BAAI-WuDao/Model) | 200G of Chinese Wudao open-source data |
|
| 229 |
+
| [Clue](https://github.com/CLUEbenchmark/CLUEDatasetSearch) | High-quality Chinese long-text data cleaned from Clue's open Chinese pretraining data |
|
| 230 |
+
| Competition Datasets | About 150 Chinese natural language processing multi-task competition datasets in recent years |
|
| 231 |
+
| [MNBVC](https://github.com/esbatmop/MNBVC) | Some datasets cleaned from MNBVC |
|
| 232 |
+
|
| 233 |
+
**If you have high-quality datasets, we would greatly appreciate it if you could provide them to us! 💕💕**
|
| 234 |
+
|
| 235 |
+
|
| 236 |
+
#### Meta Official Llama2 Models
|
| 237 |
+
|
| 238 |
+
<details lang="en">
|
| 239 |
+
|
| 240 |
+
The Llama2 pretrained models include 7B, 13B, and 70B versions. The Llama2-Chat model is fine-tuned based on the pretrained models and has enhanced conversational capabilities.
|
| 241 |
+
|
| 242 |
+
| Category | Model Name | 🤗Model Loading Name | Download Link |
|
| 243 |
+
| ---------- | ---------- | ------------------------- | --------------------- |
|
| 244 |
+
| Pretrained | Llama2-7B | meta-llama/Llama-2-7b-hf | [HuggingFace](https://huggingface.co/meta-llama/Llama-2-7b-hf) \| [XunLei](https://pan.xunlei.com/s/VN_t0dUikZqOwt-5DZWHuMvqA1?pwd=66ep) |
|
| 245 |
+
| Pretrained | Llama2-13B | meta-llama/Llama-2-13b-hf | [HuggingFace](https://huggingface.co/meta-llama/Llama-2-13b-hf) \| [XunLei](https://pan.xunlei.com/s/VN_yT_9G8xNOz0SDWQ7Mb_GZA1?pwd=yvgf) |
|
| 246 |
+
| Pretrained | Llama2-70B | meta-llama/Llama-2-70b-hf | [HuggingFace](https://huggingface.co/meta-llama/Llama-2-70b-hf) |
|
| 247 |
+
| Chat | Llama2-7B-Chat | meta-llama/Llama-2-7b-chat-hf | [HuggingFace](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) \| [XunLei](https://pan.xunlei.com/s/VN_oaV4BpKFgKLto4KgOhBcaA1?pwd=ufir) |
|
| 248 |
+
| Chat | Llama2-13B-Chat | meta-llama/Llama-2-13b-chat-hf | [HuggingFace](https://huggingface.co/meta-llama/Llama-2-13b-chat-hf) \| [XunLei](https://pan.xunlei.com/s/VN_yA-9G34NGL9B79b3OQZZGA1?pwd=xqrg) |
|
| 249 |
+
| Chat | Llama2-70B-Chat | meta-llama/Llama-2-70b-chat-hf | [HuggingFace](https://huggingface.co/meta-llama/Llama-2-70b-chat-hf) \| [XunLei](https://pan.xunlei.com/s/VNa_vCGzCy3h3N7oeFXs2W1hA1?pwd=uhxh#) |
|
| 250 |
+
| Code | CodeLlama-7b | codellama/CodeLlama-7b-hf | [Baidu](https://pan.baidu.com/s/1cIPzdNywWLvQI7_2QanOEQ?pwd=zfwi) |
|
| 251 |
+
| Code | CodeLlama-7b-Python | codellama/CodeLlama-7b-Python-hf | [Baidu](https://pan.baidu.com/s/1liY8klGoDagYbpw-g-oFag?pwd=i952) |
|
| 252 |
+
| Code | CodeLlama-7b-Instruct | codellama/CodeLlama-7b-Instruct-hf | [Baidu](https://pan.baidu.com/s/108o9_DT2E_vfSGtOnDCQVw?pwd=zkt9) |
|
| 253 |
+
| Code | CodeLlama-13b | codellama/CodeLlama-13b-hf | [Baidu](https://pan.baidu.com/s/1lLaeHv0XEBv0iiZzI1dpnw?pwd=qn99) |
|
| 254 |
+
| Code | CodeLlama-13b-Python | codellama/CodeLlama-13b-Python-hf | [Baidu](https://pan.baidu.com/s/1OLVfvZS_oqL3oqMKwsI87w?pwd=a78k) |
|
| 255 |
+
| Code | CodeLlama-13b-Instruct | codellama/CodeLlama-13b-Instruct-hf | [Baidu](https://pan.baidu.com/s/1HyxJl4w8wElgkZRh2ATrXQ?pwd=seg6) |
|
| 256 |
+
| Code | CodeLlama-34b | codellama/CodeLlama-34b-hf | [Baidu](https://pan.baidu.com/s/1vEw0pFgIkctPUN4_5_6pIQ?pwd=q8eu) |
|
| 257 |
+
|
| 258 |
+
On August 24, 2023, Meta officially released Code Llama, a version of Llama2 fine-tuned on code data. It comes in three variants: the base model (Code Llama), a Python-specialized model (Code Llama - Python), and an instruction-following model (Code Llama - Instruct), each available in 7B, 13B, and 34B parameter sizes. Their capabilities are summarized in the following table:
|
| 259 |
+
|
| 260 |
+
| Model Category | Model Name | Code Completion | Code Infilling | Instruction Following |
|
| 261 |
+
|-----------------------|------------------------|------|------|------|
|
| 262 |
+
| Code Llama | CodeLlama-7b | ✅ | ✅ | ❌ |
|
| 263 |
+
| | CodeLlama-13b | ✅ | ✅ | ❌ |
|
| 264 |
+
| | CodeLlama-34b | ✅ | ❌ | ❌ |
|
| 265 |
+
| Code Llama - Python | CodeLlama-7b-Python | ✅ | ❌ | ❌ |
|
| 266 |
+
| | CodeLlama-13b-Python | ✅ | ❌ | ❌ |
|
| 267 |
+
| | CodeLlama-34b-Python | ✅ | ❌ | ❌ |
|
| 268 |
+
| Code Llama - Instruct | CodeLlama-7b-Instruct | ❌ | ✅ | ✅ |
|
| 269 |
+
| | CodeLlama-13b-Instruct | ❌ | ✅ | ✅ |
|
| 270 |
+
| | CodeLlama-34b-Instruct | ❌ | ❌ | ✅ |
|
| 271 |
+
|
| 272 |
+
We provide [Code Llama download links hosted within China](#-latest-downloads-of-llama2-in-china) and an online playground at [llama.family](https://llama.family/). For detailed information on Code Llama, refer to the official GitHub repository [codellama](https://github.com/facebookresearch/codellama).
|
| 273 |
+
|
| 274 |
+
</details>
|
| 275 |
+
|
| 276 |
+
#### Fine-tuned Chinese Models Based on Llama2
|
| 277 |
+
|
| 278 |
+
We fine-tuned the Llama2-Chat model based on a Chinese instruction dataset, enhancing its Chinese conversational abilities. LoRA parameters and merged parameters with the base model have been uploaded to [Hugging Face](https://huggingface.co/FlagAlpha) and currently include models for 7B and 13B.
|
| 279 |
+
|
| 280 |
+
| Category | Model Name | 🤗Model Loading Name | Base Model Version | Download Link |
|
| 281 |
+
| ---------- | ---------- | ------------- | ----------------- | ------------------- |
|
| 282 |
+
| Merged Parameters | Llama2-Chinese-7b-Chat | FlagAlpha/Llama2-Chinese-7b-Chat | meta-llama/Llama-2-7b-chat-hf |[HuggingFace](https://huggingface.co/FlagAlpha/Llama2-Chinese-7b-Chat) |
|
| 283 |
+
| Merged Parameters | Llama2-Chinese-13b-Chat | FlagAlpha/Llama2-Chinese-13b-Chat| meta-llama/Llama-2-13b-chat-hf |[HuggingFace](https://huggingface.co/FlagAlpha/Llama2-Chinese-13b-Chat) |
|
| 284 |
+
| LoRA Parameters | Llama2-Chinese-7b-Chat-LoRA | FlagAlpha/Llama2-Chinese-7b-Chat-LoRA | meta-llama/Llama-2-7b-chat-hf |[HuggingFace](https://huggingface.co/FlagAlpha/Llama2-Chinese-7b-Chat-LoRA) |
|
| 285 |
+
| LoRA Parameters | Llama2-Chinese-13b-Chat-LoRA | FlagAlpha/Llama2-Chinese-13b-Chat-LoRA | meta-llama/Llama-2-13b-chat-hf |[HuggingFace](https://huggingface.co/FlagAlpha/Llama2-Chinese-13b-Chat-LoRA) |
|
| 286 |
+
|
| 287 |
+
|
| 288 |
+
### 🌟 Community Resources
|
| 289 |
+
Abundant resources are a crucial guarantee of the community's development. They cover four areas, among others: computing power, data, forums, and applications. Developing and fully utilizing these resources gives community members more opportunities and support, driving the whole community toward prosperity. More details at [llama.family](https://llama.family/).
|
| 290 |
+
|
| 291 |
+
<details lang="en">
|
| 292 |
+
#### 💻 GPU Resources
|
| 293 |
+
|
| 294 |
+
- Provide computing power resources at below-market prices, usable for various computing tasks such as training and inference of deep learning models.
|
| 295 |
+
- Offer exclusive online inference services for community members, enabling users to quickly and effectively perform inference operations on models.
|
| 296 |
+
- Provide one-click online fine-tuning services, allowing users to conveniently fine-tune models to adapt to different tasks and data.
|
| 297 |
+
|
| 298 |
+
#### 📊 Data Resources
|
| 299 |
+
- Open up abundant training data resources covering multiple domains and industries, providing ample data support for model training.
|
| 300 |
+
- Provide high-quality, diverse datasets to meet the needs of different users, supporting data sharing and exchange to facilitate the full utilization of data resources.
|
| 301 |
+
|
| 302 |
+
#### 💬 Discussion
|
| 303 |
+
- The community forum provides a platform for community members to engage in online discussions and exchange technical issues.
|
| 304 |
+
- On the forum, users can share experiences, ask questions, and provide answers, fostering technical exchange and collaboration.
|
| 305 |
+
- The forum can also host regular online events, seminars, etc., to enhance connections and understanding among community members.
|
| 306 |
+
|
| 307 |
+
#### 📱 Product
|
| 308 |
+
- Offer free promotion spaces for showcasing applications, allowing developers to fully present their apps to community members.
|
| 309 |
+
- Provide promotional assistance, including but not limited to publicity campaigns, user guidance, etc., to help apps gain more exposure and users.
|
| 310 |
+
- Through the community platform, provide collaboration opportunities for outstanding apps, promoting cooperation and communication among app developers, collectively driving the development and growth of applications.
|
| 311 |
+
|
| 312 |
+
</details>
|
| 313 |
+
|
| 314 |
+
## 📌 How to use Llama Model?
|
| 315 |
+
|
| 316 |
+
### Setup of Llama3
|
| 317 |
+
|
| 318 |
+
### Setup of Llama2
|
| 319 |
+
|
| 320 |
+
### Simple Setup
|
| 321 |
+
Choose one of the paths below to start using the Llama-series models. For the best Chinese results, we recommend the Chinese pre-trained dialogue model.
|
| 322 |
+
|
| 323 |
+
#### Simple Setup: Anaconda
|
| 324 |
+
|
| 325 |
+
##### Step 0: Prerequisites
|
| 326 |
+
- Make sure that Python version 3.10 or higher is installed.
|
| 327 |
+
|
| 328 |
+
##### Step 1: Prepare the environment
|
| 329 |
+
Run the following commands to set up the environment and install the required packages.
|
| 330 |
+
```bash
|
| 331 |
+
git clone https://github.com/LlamaFamily/Llama-Chinese.git
|
| 332 |
+
cd Llama-Chinese
|
| 333 |
+
pip install -r requirements.txt
|
| 334 |
+
```
|
| 335 |
+
|
| 336 |
+
##### Step 2: Download Model
|
| 337 |
+
You can download the Atom-7B-Chat model from any of the following sources.
|
| 338 |
+
- [HuggingFace](https://huggingface.co/FlagAlpha)
|
| 339 |
+
- [ModelScope](https://modelscope.cn/organization/FlagAlpha)
|
| 340 |
+
- [WiseModel](https://wisemodel.cn/models/FlagAlpha/Atom-7B-Chat)
|
| 341 |
+
|
| 342 |
+
##### Step 3: Inference
|
| 343 |
+
1. Create a file named quick_start.py and copy the following content into the file.
|
| 344 |
+
```python
|
| 345 |
+
import torch
|
| 346 |
+
from transformers import AutoTokenizer, AutoModelForCausalLM
|
| 347 |
+
device_map = "cuda:0" if torch.cuda.is_available() else "auto"
|
| 348 |
+
model = AutoModelForCausalLM.from_pretrained('FlagAlpha/Atom-7B-Chat',device_map=device_map,torch_dtype=torch.float16,load_in_8bit=True,trust_remote_code=True,use_flash_attention_2=True)
|
| 349 |
+
model = model.eval()
|
| 350 |
+
tokenizer = AutoTokenizer.from_pretrained('FlagAlpha/Atom-7B-Chat',use_fast=False)
|
| 351 |
+
tokenizer.pad_token = tokenizer.eos_token
|
| 352 |
+
input_ids = tokenizer(['<s>Human: 介绍一下中国\n</s><s>Assistant: '], return_tensors="pt",add_special_tokens=False).input_ids
|
| 353 |
+
if torch.cuda.is_available():
|
| 354 |
+
input_ids = input_ids.to('cuda')
|
| 355 |
+
generate_input = {
|
| 356 |
+
"input_ids":input_ids,
|
| 357 |
+
"max_new_tokens":512,
|
| 358 |
+
"do_sample":True,
|
| 359 |
+
"top_k":50,
|
| 360 |
+
"top_p":0.95,
|
| 361 |
+
"temperature":0.3,
|
| 362 |
+
"repetition_penalty":1.3,
|
| 363 |
+
"eos_token_id":tokenizer.eos_token_id,
|
| 364 |
+
"bos_token_id":tokenizer.bos_token_id,
|
| 365 |
+
"pad_token_id":tokenizer.pad_token_id
|
| 366 |
+
}
|
| 367 |
+
generate_ids = model.generate(**generate_input)
|
| 368 |
+
text = tokenizer.decode(generate_ids[0])
|
| 369 |
+
print(text)
|
| 370 |
+
```
|
| 371 |
+
2. Run quick_start.py:
|
| 372 |
+
```bash
|
| 373 |
+
python quick_start.py
|
| 374 |
+
```
|
| 375 |
+
|
| 376 |
+
|
| 377 |
+
#### Simple Setup: Docker
|
| 378 |
+
For details, refer to: [Docker Deployment](https://github.com/LlamaFamily/Llama-Chinese/blob/main/docs/chat_gradio_guide.md)
|
| 379 |
+
|
| 380 |
+
Step 1: Prepare the Docker image and launch [chat_gradio.py](../examples/chat_gradio.py) through a Docker container.
|
| 381 |
+
```bash
|
| 382 |
+
git clone https://github.com/LlamaFamily/Llama-Chinese.git
|
| 383 |
+
|
| 384 |
+
cd Llama-Chinese
|
| 385 |
+
|
| 386 |
+
docker build -f docker/Dockerfile -t flagalpha/llama2-chinese:gradio .
|
| 387 |
+
```
|
| 388 |
+
|
| 389 |
+
Step 2: Start chat_gradio via docker-compose.
|
| 390 |
+
```bash
|
| 391 |
+
cd Llama-Chinese/docker
|
| 392 |
+
docker-compose up -d --build
|
| 393 |
+
```
|
| 394 |
+
|
| 395 |
+
#### Simple Setup: llama.cpp
|
| 396 |
+
Details: [llama.cpp](https://github.com/LlamaFamily/Llama-Chinese/blob/main/inference-speed/CPU/ggml/README.md)
|
| 397 |
+
|
| 398 |
+
|
| 399 |
+
#### Simple Setup: gradio
|
| 400 |
+
The Gradio-based Q&A interface supports streaming output. Run the following command in a console. The example uses the Atom-7B-Chat model; <font color="#006600">simply change the model name to run a different model 😊</font><br/>
|
| 401 |
+
|
| 402 |
+
```bash
|
| 403 |
+
python examples/chat_gradio.py --model_name_or_path FlagAlpha/Atom-7B-Chat
|
| 404 |
+
```
|
| 405 |
+
|
| 406 |
+
#### Simple Setup: API
|
| 407 |
+
Use FastChat to build an inference service with an OpenAI-compatible API.
|
| 408 |
+
|
| 409 |
+
<details lang="en">
|
| 410 |
+
|
| 411 |
+
###### Step 0: Prerequisites
|
| 412 |
+
Install FastChat:
|
| 413 |
+
```bash
|
| 414 |
+
pip3 install "fschat[model_worker,webui]"
|
| 415 |
+
```
|
| 416 |
+
###### Step 1: Run the RESTful API
|
| 417 |
+
Open three terminals and execute the following three commands respectively.
|
| 418 |
+
|
| 419 |
+
- First, start the controller:
|
| 420 |
+
```bash
|
| 421 |
+
python3 -m fastchat.serve.controller \
|
| 422 |
+
--host localhost \
|
| 423 |
+
--port 21001
|
| 424 |
+
```
|
| 425 |
+
|
| 426 |
+
- Start the model worker:
|
| 427 |
+
```bash
|
| 428 |
+
CUDA_VISIBLE_DEVICES="0" python3 -m fastchat.serve.model_worker --model-path /path/Atom-7B-Chat \
|
| 429 |
+
--host localhost \
|
| 430 |
+
--port 21002 \
|
| 431 |
+
--worker-address "http://localhost:21002" \
|
| 432 |
+
--limit-worker-concurrency 5 \
|
| 433 |
+
--stream-interval 2 \
|
| 434 |
+
--gpus "1" \
|
| 435 |
+
--load-8bit
|
| 436 |
+
```
|
| 437 |
+
- Start the RESTful API service:
|
| 438 |
+
```bash
|
| 439 |
+
python3 -m fastchat.serve.openai_api_server \
|
| 440 |
+
--host localhost \
|
| 441 |
+
--port 21003 \
|
| 442 |
+
--controller-address http://localhost:21001
|
| 443 |
+
```
|
| 444 |
+
|
| 445 |
+
###### Step 2: Test the API service
|
| 446 |
+
Execute the Python code below to test the API service deployed above.
|
| 447 |
+
```python
|
| 448 |
+
# coding=utf-8
|
| 449 |
+
import json
|
| 450 |
+
import time
|
| 451 |
+
import urllib.request
|
| 452 |
+
import sys
|
| 453 |
+
import requests
|
| 454 |
+
|
| 455 |
+
def test_api_server(input_text):
|
| 456 |
+
header = {'Content-Type': 'application/json'}
|
| 457 |
+
|
| 458 |
+
data = {
|
| 459 |
+
"messages": [{"role": "system", "content": ""}, {"role": "user", "content": input_text}],
|
| 460 |
+
"temperature": 0.3,
|
| 461 |
+
"top_p" : 0.95,
|
| 462 |
+
"max_tokens": 512,
|
| 463 |
+
"model": "LLama2-Chinese-13B",
|
| 464 |
+
"stream" : False,
|
| 465 |
+
"n" : 1,
|
| 466 |
+
"best_of": 1,
|
| 467 |
+
"presence_penalty": 1.2,
|
| 468 |
+
"frequency_penalty": 0.2,
|
| 469 |
+
"top_k": 50,
|
| 470 |
+
"use_beam_search": False,
|
| 471 |
+
"stop": [],
|
| 472 |
+
"ignore_eos" :False,
|
| 473 |
+
"logprobs": None
|
| 474 |
+
}
|
| 475 |
+
response = requests.post(
|
| 476 |
+
url='http://127.0.0.1:21003/v1/chat/completions',
|
| 477 |
+
headers=header,
|
| 478 |
+
data=json.dumps(data).encode('utf-8')
|
| 479 |
+
)
|
| 480 |
+
|
| 481 |
+
result = None
|
| 482 |
+
try:
|
| 483 |
+
result = json.loads(response.content)
|
| 484 |
+
print(json.dumps(data, ensure_ascii=False, indent=2))
|
| 485 |
+
print(json.dumps(result, ensure_ascii=False, indent=2))
|
| 486 |
+
|
| 487 |
+
except Exception as e:
|
| 488 |
+
print(e)
|
| 489 |
+
|
| 490 |
+
return result
|
| 491 |
+
|
| 492 |
+
if __name__ == "__main__":
|
| 493 |
+
test_api_server("如何去北京?")
|
| 494 |
+
```
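Since the service is OpenAI-compatible, you can also call it with the `openai` Python package. A hedged sketch in the pre-1.0 (`openai` 0.x) client style; the model name must match the name FastChat registered for the worker:

```python
# Hedged sketch: call the FastChat endpoint with the openai 0.x client.
import openai

openai.api_key = "EMPTY"  # FastChat does not check the key
openai.api_base = "http://127.0.0.1:21003/v1"

resp = openai.ChatCompletion.create(
    model="Atom-7B-Chat",  # must match the worker's registered model name
    messages=[{"role": "user", "content": "如何去北京?"}],
    temperature=0.3,
)
print(resp.choices[0].message.content)
```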
|
| 495 |
+
</details>
|
| 496 |
+
|
| 497 |
+
|
| 498 |
+
#### 🤖 Model Pretraining
|
| 499 |
+
While the pretraining data for Llama2 has doubled compared to the first generation LLaMA, the proportion of Chinese pretraining data is still very low, accounting for only 0.13%. This results in a relatively weak Chinese capability for the original Llama2. To enhance the model's Chinese capability, two approaches can be adopted: fine-tuning and pretraining.
|
| 500 |
+
|
| 501 |
+
- Fine-tuning requires fewer computational resources and can quickly create a prototype of a Chinese Llama. However, its drawback is evident – it can only leverage the existing Chinese capabilities of the base model. Due to the limited amount of Chinese training data for Llama2, the potential improvement is also restricted, addressing the symptoms rather than the root cause.
|
| 502 |
+
|
| 503 |
+
- Pretraining based on large-scale Chinese corpora involves high costs, requiring not only large-scale high-quality Chinese data but also substantial computational resources. However, the advantage is clear – it optimizes the Chinese capability from the model's foundational layers, achieving a fundamental improvement, injecting robust Chinese capabilities into the core of the large model.
|
| 504 |
+
|
| 505 |
+
We provide the pretraining code for the Llama model to the community, along with [Chinese test data](https://github.com/LlamaFamily/Llama-Chinese/tree/main/data). More data can be found in [Chinese Data](#-chinese-data). The specific code and configurations are as follows:
|
| 506 |
+
|
| 507 |
+
- Model pretraining script: [train/pretrain/pretrain.sh](https://github.com/LlamaFamily/Llama-Chinese/blob/main/train/pretrain/pretrain.sh)
|
| 508 |
+
- Pretraining implementation code: [train/pretrain/pretrain_clm.py](https://github.com/LlamaFamily/Llama-Chinese/blob/main/train/pretrain/pretrain_clm.py)
|
| 509 |
+
- [DeepSpeed](https://github.com/microsoft/DeepSpeed) acceleration:
|
| 510 |
+
- For single-card training, ZeRO-2 can be used. See parameters in [train/pretrain/ds_config_zero2.json](https://github.com/LlamaFamily/Llama-Chinese/blob/main/train/pretrain/ds_config_zero2.json).
|
| 511 |
+
- For multi-card training, ZeRO-3 can be used. See parameters in [train/pretrain/ds_config_zero3.json](https://github.com/LlamaFamily/Llama-Chinese/blob/main/train/pretrain/ds_config_zero3.json).
|
| 512 |
+
- Training effectiveness metrics: [train/pretrain/accuracy.py](https://github.com/LlamaFamily/Llama-Chinese/blob/main/train/pretrain/accuracy.py)
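The core idea behind the metric is next-token accuracy: shift the logits against the labels and count exact matches. A simplified sketch of that idea (the repository's implementation in `accuracy.py` may differ in detail):

```python
# Hedged sketch of next-token accuracy for causal language modeling.
import torch

def next_token_accuracy(logits: torch.Tensor, labels: torch.Tensor) -> float:
    # logits: (batch, seq_len, vocab_size); labels: (batch, seq_len)
    preds = logits[:, :-1, :].argmax(dim=-1)   # model's guess for token t+1
    targets = labels[:, 1:]
    mask = targets != -100                     # ignore padding/masked positions
    return (preds[mask] == targets[mask]).float().mean().item()
```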
|
| 513 |
+
|
| 514 |
+
#### 💡 Model Fine-Tuning
|
| 515 |
+
|
| 516 |
+
This repository provides both LoRA fine-tuning and full-parameter fine-tuning code. Detailed information about LoRA can be found in the paper "[LoRA: Low-Rank Adaptation of Large Language Models](https://arxiv.org/abs/2106.09685)" and the Microsoft GitHub repository [LoRA](https://github.com/microsoft/LoRA).
|
| 517 |
+
|
| 518 |
+
##### Step1: Environment Setup
|
| 519 |
+
|
| 520 |
+
Install the necessary environment dependencies according to [requirements.txt](https://github.com/LlamaFamily/Llama-Chinese/blob/main/requirements.txt).
|
| 521 |
+
|
| 522 |
+
##### Step2: Data Preparation
|
| 523 |
+
|
| 524 |
+
The data directory contains sample data for the model's SFT (supervised fine-tuning):
|
| 525 |
+
- Training data: [data/train_sft.csv](https://github.com/LlamaFamily/Llama-Chinese/blob/main/data/train_sft.csv)
|
| 526 |
+
- Validation data: [data/dev_sft.csv](https://github.com/LlamaFamily/Llama-Chinese/blob/main/data/dev_sft.csv)
|
| 527 |
+
|
| 528 |
+
Each CSV file contains a "text" column, with each row representing a training example. Organize questions and answers in the model's input format, as shown below:
|
| 529 |
+
```
|
| 530 |
+
"<s>Human: "+question+"\n</s><s>Assistant: "+answer
|
| 531 |
+
```
|
| 532 |
+
For example,
|
| 533 |
+
```
|
| 534 |
+
<s>Human: Describe why the Earth is unique in one sentence.</s><s>Assistant: Because the Earth is currently the only known planet with existing life.</s>
|
| 535 |
+
```
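A minimal sketch for producing a CSV in this format programmatically (the question/answer pair and output filename below are made-up placeholders):

```python
# Hedged sketch: write (question, answer) pairs into the single-"text"-column
# CSV layout expected by the fine-tuning scripts.
import csv

pairs = [("什么是机器学习?", "机器学习是让计算机从数据中自动学习规律的方法。")]
with open("data/my_train_sft.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["text"])
    for question, answer in pairs:
        writer.writerow([f"<s>Human: {question}\n</s><s>Assistant: {answer}\n</s>"])
```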
|
| 536 |
+
|
| 537 |
+
##### Step3: Fine-tuning Scripts
|
| 538 |
+
|
| 539 |
+
###### LoRA Fine-tuning
|
| 540 |
+
LoRA fine-tuning script: [train/sft/finetune_lora.sh](https://github.com/LlamaFamily/Llama-Chinese/blob/main/train/sft/finetune_lora.sh). For the LoRA fine-tuning implementation, see [train/sft/finetune_clm_lora.py](https://github.com/LlamaFamily/Llama-Chinese/blob/main/train/sft/finetune_clm_lora.py). For single-machine multi-GPU fine-tuning, modify the `--include localhost:0` setting in the script.
|
| 541 |
+
|
| 542 |
+
###### Full-parameter Fine-tuning
|
| 543 |
+
Full-parameter fine-tuning script: [train/sft/finetune.sh](https://github.com/LlamaFamily/Llama-Chinese/blob/main/train/sft/finetune.sh). For details on full-parameter fine-tuning implementation, refer to [train/sft/finetune_clm.py](https://github.com/LlamaFamily/Llama-Chinese/blob/main/train/sft/finetune_clm.py).
|
| 544 |
+
|
| 545 |
+
##### Step4: Load Fine-tuned Model
|
| 546 |
+
|
| 547 |
+
###### LoRA Fine-tuning
|
| 548 |
+
For LoRA fine-tuned model parameters, see [Chinese Fine-Tuned Model based on Llama2](#chinese-fine-tuned-model-based-on-llama2). LoRA parameters need to be combined with base model parameters.
|
| 549 |
+
|
| 550 |
+
Use [PEFT](https://github.com/huggingface/peft) to load both pretraining and fine-tuned model parameters. In the example code below, set "base_model_name_or_path" to the pretraining model's save path and "finetune_model_path" to the fine-tuned model's save path.
|
| 551 |
+
|
| 552 |
+
```python
|
| 553 |
+
import torch
|
| 554 |
+
from transformers import AutoTokenizer, AutoModelForCausalLM
|
| 555 |
+
from peft import PeftModel, PeftConfig
|
| 556 |
+
|
| 557 |
+
finetune_model_path = '' # For example: 'FlagAlpha/Llama2-Chinese-7b-Chat-LoRA'
|
| 558 |
+
config = PeftConfig.from_pretrained(finetune_model_path)
|
| 559 |
+
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path, use_fast=False)
|
| 560 |
+
tokenizer.pad_token = tokenizer.eos_token
|
| 561 |
+
device_map = "cuda:0" if torch.cuda.is_available() else "auto"
|
| 562 |
+
model = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path, device_map=device_map, torch_dtype=torch.float16, load_in_8bit=True)
|
| 563 |
+
model = PeftModel.from_pretrained(model, finetune_model_path, device_map={"": 0})
|
| 564 |
+
model = model.eval()
|
| 565 |
+
input_ids = tokenizer(['<s>Human: Introduce Beijing\n</s><s>Assistant: '], return_tensors="pt", add_special_tokens=False).input_ids
|
| 566 |
+
if torch.cuda.is_available():
|
| 567 |
+
input_ids = input_ids.to('cuda')
|
| 568 |
+
generate_input = {
|
| 569 |
+
"input_ids": input_ids,
|
| 570 |
+
"max_new_tokens": 512,
|
| 571 |
+
"do_sample": True,
|
| 572 |
+
"top_k": 50,
|
| 573 |
+
"top_p": 0.95,
|
| 574 |
+
"temperature": 0.3,
|
| 575 |
+
"repetition_penalty": 1.3,
|
| 576 |
+
"eos_token_id": tokenizer.eos_token_id,
|
| 577 |
+
"bos_token_id": tokenizer.bos_token_id,
|
| 578 |
+
"pad_token_id": tokenizer.pad_token_id
|
| 579 |
+
}
|
| 580 |
+
generate_ids = model.generate(**generate_input)
|
| 581 |
+
text = tokenizer.decode(generate_ids[0])
|
| 582 |
+
print(text)
|
| 583 |
+
```
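If you want to serve the model without PEFT at inference time, the LoRA weights can be folded into the base model once and saved. A hedged sketch, continuing from the variables above; note that the base model must be loaded in full or half precision (not 8-bit) for the merge, and the output path is a placeholder:

```python
# Hedged sketch: permanently merge the LoRA weights into the base model.
base = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path, torch_dtype=torch.float16)
merged = PeftModel.from_pretrained(base, finetune_model_path).merge_and_unload()
merged.save_pretrained("Llama2-Chinese-7b-Chat-merged")    # placeholder path
tokenizer.save_pretrained("Llama2-Chinese-7b-Chat-merged")
```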
|
| 584 |
+
###### Full-parameter Fine-tuning
|
| 585 |
+
For a full-parameter fine-tuned model, use the same loading code as in the quick-start example above; simply change the model name or save path to your fine-tuned checkpoint.
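For reference, a minimal sketch (the checkpoint path is a placeholder for wherever your training run saved its output):

```python
# Hedged sketch: a fully fine-tuned checkpoint loads like any causal LM.
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

ckpt = "output/finetune-full"  # placeholder save path from your training run
tokenizer = AutoTokenizer.from_pretrained(ckpt, use_fast=False)
model = AutoModelForCausalLM.from_pretrained(ckpt, torch_dtype=torch.float16, device_map="auto").eval()
```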
|
| 586 |
+
|
| 587 |
+
|
| 588 |
+
|
| 589 |
+
#### 🍄 Model Quantization
|
| 590 |
+
We have quantized the parameters of the Chinese fine-tuned model to facilitate running with fewer computational resources. Currently, we have uploaded a 4-bit compressed version of the 13B Chinese fine-tuned model [FlagAlpha/Llama2-Chinese-13b-Chat](https://huggingface.co/FlagAlpha/Llama2-Chinese-13b-Chat) as [FlagAlpha/Llama2-Chinese-13b-Chat-4bit](https://huggingface.co/FlagAlpha/Llama2-Chinese-13b-Chat-4bit) on [Hugging Face](https://huggingface.co/FlagAlpha). The specific calling method is as follows:
|
| 591 |
+
|
| 592 |
+
Environment requirements:
|
| 593 |
+
```bash
|
| 594 |
+
pip install git+https://github.com/PanQiWei/AutoGPTQ.git
|
| 595 |
+
```
|
| 596 |
+
|
| 597 |
+
```python
|
| 598 |
+
from transformers import AutoTokenizer
|
| 599 |
+
from auto_gptq import AutoGPTQForCausalLM
|
| 600 |
+
|
| 601 |
+
model = AutoGPTQForCausalLM.from_quantized('FlagAlpha/Llama2-Chinese-13b-Chat-4bit', device="cuda:0")
|
| 602 |
+
tokenizer = AutoTokenizer.from_pretrained('FlagAlpha/Llama2-Chinese-13b-Chat-4bit', use_fast=False)
|
| 603 |
+
input_ids = tokenizer(['<s>Human: How to land on Mars\n</s><s>Assistant: '], return_tensors="pt", add_special_tokens=False).input_ids.to('cuda')
|
| 604 |
+
generate_input = {
|
| 605 |
+
"input_ids": input_ids,
|
| 606 |
+
"max_new_tokens": 512,
|
| 607 |
+
"do_sample": True,
|
| 608 |
+
"top_k": 50,
|
| 609 |
+
"top_p": 0.95,
|
| 610 |
+
"temperature": 0.3,
|
| 611 |
+
"repetition_penalty": 1.3,
|
| 612 |
+
"eos_token_id": tokenizer.eos_token_id,
|
| 613 |
+
"bos_token_id": tokenizer.bos_token_id,
|
| 614 |
+
"pad_token_id": tokenizer.pad_token_id
|
| 615 |
+
}
|
| 616 |
+
generate_ids = model.generate(**generate_input)
|
| 617 |
+
text = tokenizer.decode(generate_ids[0])
|
| 618 |
+
print(text)
|
| 619 |
+
```
|
| 620 |
+
|
| 621 |
+
#### 🚀 Inference Acceleration
|
| 622 |
+
As the parameter counts of large models keep growing, improving inference speed under limited computational resources has become an important research direction. Common inference acceleration frameworks include lmdeploy, TensorRT-LLM, vLLM, and JittorLLMs.
|
| 623 |
+
|
| 624 |
+
##### TensorRT-LLM
|
| 625 |
+
[TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM/tree/main) is developed by NVIDIA, written in C++/CUDA, and supports distributed inference.
|
| 626 |
+
|
| 627 |
+
For detailed inference documentation, visit: [inference-speed/GPU/TensorRT-LLM_example](https://github.com/LlamaFamily/Llama-Chinese/tree/main/inference-speed/GPU/TensorRT-LLM_example)
|
| 628 |
+
|
| 629 |
+
##### vLLM
|
| 630 |
+
[vLLM](https://github.com/vllm-project/vllm) is developed at the University of California, Berkeley, with PagedAttention as its core technology. It achieves up to 24x higher throughput than HuggingFace Transformers. Unlike FasterTransformer, vLLM is easy to use and requires no separate model conversion step. It supports FP16 inference.
|
| 631 |
+
|
| 632 |
+
For detailed inference documentation, visit: [inference-speed/GPU/vllm_example](https://github.com/LlamaFamily/Llama-Chinese/blob/main/inference-speed/GPU/vllm_example/README.md)
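For a flavor of the API, a hedged sketch of offline batch inference with vLLM's Python interface (requires `pip install vllm`; the sampling values mirror the quick-start example and are illustrative):

```python
# Hedged sketch: offline inference with vLLM's Python API.
from vllm import LLM, SamplingParams

llm = LLM(model="FlagAlpha/Atom-7B-Chat", trust_remote_code=True)
params = SamplingParams(temperature=0.3, top_p=0.95, max_tokens=256)
outputs = llm.generate(["<s>Human: 介绍一下中国\n</s><s>Assistant: "], params)
print(outputs[0].outputs[0].text)
```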
|
| 633 |
+
|
| 634 |
+
##### JittorLLMs
|
| 635 |
+
[JittorLLMs](https://github.com/Jittor/JittorLLMs) is led by Fitten Tech (非十科技) in collaboration with the Visual Media Research Center at Tsinghua University. Its dynamic swap mechanism cuts hardware requirements by 80%, the Jittor framework's zero-copy technology reduces model loading overhead by 40% compared to PyTorch, and automatic compilation optimization via meta-operators improves computational performance by more than 20%.
|
| 636 |
+
|
| 637 |
+
For detailed inference documentation, visit: [inference-speed/GPU/JittorLLMs](https://github.com/LlamaFamily/Llama-Chinese/blob/main/inference-speed/GPU/JittorLLMs_example/README.md)
|
| 638 |
+
|
| 639 |
+
##### lmdeploy
|
| 640 |
+
[lmdeploy](https://github.com/InternLM/lmdeploy/) is developed by the Shanghai AI Lab, using C++/CUDA for inference. It provides Python/gRPC/HTTP interfaces and a WebUI for inference, supporting tensor parallel distributed inference and FP16/weight int4/kv cache int8 quantization.
|
| 641 |
+
|
| 642 |
+
For detailed inference documentation, visit: [inference-speed/GPU/lmdeploy_example](https://github.com/LlamaFamily/Llama-Chinese/tree/main/inference-speed/GPU/lmdeploy_example)
|
| 643 |
+
|
| 644 |
+
#### 💪 Extension Capabilities
|
| 645 |
+
|
| 646 |
+
In addition to continually enhancing the intrinsic qualities of large models, such as knowledge base, general understanding, logical reasoning, and imaginative capabilities, we are also actively expanding the extension capabilities of the large models. This includes features like knowledge base retrieval, computational tools, WolframAlpha integration, and software manipulation.
|
| 647 |
+
|
| 648 |
+
We have initially integrated the LangChain framework to facilitate the development of applications like document retrieval, question-answering bots, and intelligent agents based on the Llama2 model. For more information on LangChain, please refer to [LangChain](https://github.com/langchain-ai/langchain).
|
| 649 |
+
|
| 650 |
+
##### LangChain
|
| 651 |
+
For a simplified implementation using the LangChain framework with the Llama2 LLM class, refer to [examples/llama2_for_langchain.py](https://github.com/LlamaFamily/Llama-Chinese/blob/main/examples/llama2_for_langchain.py). Here is a basic code snippet:
|
| 652 |
+
|
| 653 |
+
```python
|
| 654 |
+
from llama2_for_langchain import Llama2
|
| 655 |
+
|
| 656 |
+
# Example using FlagAlpha/Atom-7B-Chat
|
| 657 |
+
llm = Llama2(model_name_or_path='FlagAlpha/Atom-7B-Chat')
|
| 658 |
+
|
| 659 |
+
while True:
|
| 660 |
+
human_input = input("Human: ")
|
| 661 |
+
response = llm(human_input)
|
| 662 |
+
print(f"Llama2: {response}")
|
| 663 |
+
```
|
| 664 |
+
|
| 665 |
+
|
| 666 |
+
#### 🥇 Model Evaluation
|
| 667 |
+
|
| 668 |
+
##### Llama3 Evaluation
|
| 669 |
+
|
| 670 |
+
##### Llama2 Evaluation
|
| 671 |
+
|
| 672 |
+
<p align="center" width="100%">
|
| 673 |
+
<img src="./assets/llama_eval.jpeg" style="width: 100%; display: block; margin: auto;">
|
| 674 |
+
</p>
|
| 675 |
+
|
| 676 |
+
To gain a clearer understanding of the Chinese question-answering capabilities of the Llama2 model, we selected a set of representative Chinese questions for testing. The tested models include Meta's publicly available versions, namely, Llama2-7B-Chat and Llama2-13B-Chat, without any fine-tuning or training. The test questions were curated from [AtomBulb](https://github.com/AtomEcho/AtomBulb), totaling 95 questions covering eight major categories: general knowledge, language understanding, creative ability, logical reasoning, code programming, work skills, tool usage, and personality traits.
|
| 677 |
+
|
| 678 |
+
The prompt used during testing is as follows, for example, for the question "List 5 methods to improve sleep quality":
|
| 679 |
+
|
| 680 |
+
```plaintext
|
| 681 |
+
[INST]
|
| 682 |
+
<<SYS>>
|
| 683 |
+
You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. The answer always been translate into Chinese language.
|
| 684 |
+
|
| 685 |
+
If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.
|
| 686 |
+
|
| 687 |
+
The answer always been translate into Chinese language.
|
| 688 |
+
<</SYS>>
|
| 689 |
+
|
| 690 |
+
List 5 methods to improve sleep quality
|
| 691 |
+
[/INST]
|
| 692 |
+
```
|
| 693 |
+
The test results for Llama2-7B-Chat can be found in [meta_eval_7B.md](assets/meta_eval_7B.md), and those for Llama2-13B-Chat in [meta_eval_13B.md](assets/meta_eval_13B.md).
|
| 694 |
+
|
| 695 |
+
Through our testing, we observed that Meta's original Llama2 Chat model generally has mediocre alignment with Chinese questions. In most cases, it fails to provide Chinese answers, or the responses are a mixture of Chinese and English. Therefore, it is crucial to train and fine-tune the Llama2 model on Chinese data. Our Chinese version of the Llama2 model is currently undergoing training and will be made available to the community in the near future.
|
| 696 |
+
|
| 697 |
+
|
| 698 |
+
#### 📖 Learning Resources
|
| 699 |
+
|
| 700 |
+
##### Llama3
|
| 701 |
+
|
| 702 |
+
##### Llama2
|
| 703 |
+
###### Meta Official Introduction to [Llama2](https://ai.meta.com/llama)
|
| 704 |
+
Since the release of Meta's first-generation LLaMA model, the Llama model family has thrived. Meta recently released Llama2, an open-source version licensed for commercial use, with significant updates to the model and its performance. Llama2 comes in 7B, 13B, and 70B parameter sizes. Compared to LLaMA, Llama2's training corpus grew to 2 trillion tokens, and the context length doubled from 2048 to 4096, allowing it to understand and generate longer text. The Llama2 Chat model, fine-tuned on 1 million human-labeled examples, achieves results close to ChatGPT in English conversations.
|
| 705 |
+
|
| 706 |
+
###### Llama-related Papers
|
| 707 |
+
* [LLaMA: Open and Efficient Foundation Language Models](https://arxiv.org/abs/2302.13971)
|
| 708 |
+
* [Llama 2: Open Foundation and Fine-Tuned Chat Models](https://arxiv.org/abs/2307.09288)
|
| 709 |
+
* [Code Llama: Open Foundation Models for Code](https://ai.meta.com/research/publications/code-llama-open-foundation-models-for-code/)
|
| 710 |
+
|
| 711 |
+
## 📌 Others
|
| 712 |
+
|
| 713 |
+
## 🎉 Acknowledgments
|
| 714 |
+
Special thanks to the AtomEcho team for their technical and resource support!
|
| 715 |
+
|
| 716 |
+
Thanks to @xzsGenius for contributions to the Llama2 Chinese community!
|
| 717 |
+
|
| 718 |
+
Thanks to the Z-Potentials community for supporting the Llama2 Chinese community!
|
| 719 |
+
|
| 720 |
+
## 🤔 Issue Feedback
|
| 721 |
+
If you have any issues, please submit them in the GitHub Issues. Before submitting a new issue, please check existing issues to see if your problem has already been addressed.
|
| 722 |
+
|
| 723 |
+
Please be polite when raising issues and contribute to building a harmonious discussion community.
|
| 724 |
+
|
| 725 |
+
Join the [Feishu Knowledge Base](https://chinesellama.feishu.cn/wiki/space/7257824476874768388?ccm_open_type=lark_wiki_spaceLink) to collaboratively build community documentation.
|
| 726 |
+
|
| 727 |
+
Join the WeChat group for discussions 😍😍
|
| 728 |
+
|
| 729 |
+
|
| 730 |
+
<p align="center" width="100%">
|
| 731 |
+
<img src="./assets/wechat.jpeg" alt="Wechat" style="width: 100%; display: block; margin: auto;">
|
| 732 |
+
</p>
|
| 733 |
+
|
| 734 |
+
<p align="center" width="100%">
|
| 735 |
+
<img src="https://api.star-history.com/svg?repos=LlamaFamily/Llama-Chinese&type=Date" alt="Wechat" style="width: 100%; display: block; margin: auto;">
|
| 736 |
+
</p>
|
Llama-Chinese/assets/base_eval.png
ADDED
|
Llama-Chinese/assets/ceval.jpg
ADDED
|
Llama-Chinese/assets/llama.jpg
ADDED
|
Llama-Chinese/assets/llama.png
ADDED
|
Llama-Chinese/assets/llama2-chinese-webui.jpg
ADDED
|
Llama-Chinese/assets/llama3_eval.png
ADDED
|
Llama-Chinese/assets/llama_eval.jpeg
ADDED
|
Llama-Chinese/assets/meta_eval_13B.md
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
Llama-Chinese/assets/meta_eval_7B.md
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
Llama-Chinese/assets/tuned_eval.png
ADDED
|
Llama-Chinese/assets/wechat.jpeg
ADDED
|
Llama-Chinese/data/dev_sft.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
Llama-Chinese/data/dev_sft_sharegpt.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
Llama-Chinese/data/train_sft.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
Llama-Chinese/docker/Dockerfile
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Use pytorch/pytorch:2.1.2-cuda12.1-cudnn8-devel as the base image
|
| 2 |
+
FROM pytorch/pytorch:2.1.2-cuda12.1-cudnn8-devel
|
| 3 |
+
|
| 4 |
+
RUN apt-get update -y --allow-unauthenticated
|
| 5 |
+
RUN apt install -y git vim git-lfs
|
| 6 |
+
|
| 7 |
+
# Set the working directory
|
| 8 |
+
WORKDIR /root/Llama-Chinese
|
| 9 |
+
|
| 10 |
+
# Clone the Llama-Chinese repository from GitHub
|
| 11 |
+
RUN git clone https://github.com/LlamaFamily/Llama-Chinese.git /root/Llama-Chinese
|
| 12 |
+
|
| 13 |
+
# Use the Tsinghua PyPI mirror
|
| 14 |
+
RUN mkdir -p ~/.pip
|
| 15 |
+
RUN echo "[global]\nindex-url = https://pypi.tuna.tsinghua.edu.cn/simple" > ~/.pip/pip.conf
|
| 16 |
+
|
| 17 |
+
# Install requirements.txt with pip
|
| 18 |
+
RUN pip install -i https://pypi.tuna.tsinghua.edu.cn/simple --trusted-host pypi.tuna.tsinghua.edu.cn -r requirements.txt
|
| 19 |
+
|
| 20 |
+
# Clone the Hugging Face model repository
|
| 21 |
+
RUN git clone https://huggingface.co/FlagAlpha/Atom-7B-Chat
|
| 22 |
+
|
| 23 |
+
# Expose port 7860
|
| 24 |
+
EXPOSE 7860
|
| 25 |
+
|
| 26 |
+
# Set the startup command
|
| 27 |
+
ENTRYPOINT ["python", "examples/chat_gradio.py", "--model_name_or_path", "/root/Llama-Chinese/Atom-7B-Chat/"]
|
Llama-Chinese/docker/Dockerfile_train
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM pytorch/pytorch:2.1.2-cuda12.1-cudnn8-devel as builder
|
| 2 |
+
RUN apt-get update -y --allow-unauthenticated
|
| 3 |
+
RUN apt install git tmux htop vim -y
|
| 4 |
+
RUN pip install bitsandbytes -i https://pypi.tuna.tsinghua.edu.cn/simple --trusted-host pypi.tuna.tsinghua.edu.cn
|
| 5 |
+
RUN pip install transformers -i https://pypi.tuna.tsinghua.edu.cn/simple --trusted-host pypi.tuna.tsinghua.edu.cn
|
| 6 |
+
RUN pip install peft -i https://pypi.tuna.tsinghua.edu.cn/simple --trusted-host pypi.tuna.tsinghua.edu.cn
|
| 7 |
+
RUN pip install accelerate -i https://pypi.tuna.tsinghua.edu.cn/simple --trusted-host pypi.tuna.tsinghua.edu.cn
|
| 8 |
+
RUN pip install deepspeed -i https://pypi.tuna.tsinghua.edu.cn/simple --trusted-host pypi.tuna.tsinghua.edu.cn
|
| 9 |
+
RUN pip install scipy sentencepiece datasets joblib sentence_transformers cn2an evaluate tensorboard wandb -i https://pypi.tuna.tsinghua.edu.cn/simple --trusted-host pypi.tuna.tsinghua.edu.cn
|
Llama-Chinese/docker/docker-compose.yml
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version: '3.7'
|
| 2 |
+
services:
|
| 3 |
+
app:
|
| 4 |
+
image: flagalpha/llama2-chinese:gradio # replace with your actual image name
|
| 5 |
+
volumes:
|
| 6 |
+
- /usr/local/nvidia:/usr/local/nvidia # give the container access to the host's NVIDIA driver
|
| 7 |
+
environment:
|
| 8 |
+
- NVIDIA_VISIBLE_DEVICES=all # allow the container to access all NVIDIA GPUs
|
| 9 |
+
ports:
|
| 10 |
+
- 7860:7860 # map the port between container and host
|
| 11 |
+
deploy:
|
| 12 |
+
resources:
|
| 13 |
+
reservations:
|
| 14 |
+
devices:
|
| 15 |
+
- driver: nvidia
|
| 16 |
+
capabilities: [gpu] # use Docker device requests to give the container GPU access
|
Llama-Chinese/docs/chat_gradio_guide.md
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Running chat_gradio.py in a Docker Environment
|
| 2 |
+
|
| 3 |
+
Required system environment:
|
| 4 |
+
|
| 5 |
+
+ docker: 24.0.2
|
| 6 |
+
+ docker-compose
|
| 7 |
+
|
| 8 |
+
## Step 1. Prepare the Docker image
|
| 9 |
+
|
| 10 |
+
A Docker image makes it easier to manage the required environment dependencies, so [chat_gradio](../examples/chat_gradio.py) can be launched directly from a Docker container. The first step is to prepare the image.
|
| 11 |
+
|
| 12 |
+
```bash
|
| 13 |
+
git clone https://github.com/LlamaFamily/Llama-Chinese.git
|
| 14 |
+
|
| 15 |
+
cd Llama-Chinese
|
| 16 |
+
|
| 17 |
+
docker build -f docker/Dockerfile -t flagalpha/llama2-chinese:gradio .
|
| 18 |
+
```
|
| 19 |
+
|
| 20 |
+
## Step 2. Start chat_gradio via docker-compose
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
```bash
|
| 24 |
+
cd Llama-Chinese/docker
|
| 25 |
+
docker-compose up -d --build
|
| 26 |
+
```
|
Llama-Chinese/docs/inference_speed_guide.md
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Inference Deployment
|
| 2 |
+
|
| 3 |
+
> Whether a model has just finished training, has been fine-tuned, or was downloaded directly from [huggingface](https://huggingface.co/FlagAlpha), it needs to be deployed before use. Deployment here means model inference; serving directly with vanilla transformers is relatively slow, and several acceleration techniques deliver much faster inference.
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
## 1. GPU Inference Options
|
| 8 |
+
|
| 9 |
+
### Option 1: vLLM
|
| 10 |
+
|
| 11 |
+
[Usage guide](../inference-speed/GPU/vllm_example/README.md)
|
| 12 |
+
|
| 13 |
+
### Option 2: TensorRT-LLM
|
| 14 |
+
|
| 15 |
+
[Usage guide](../inference-speed/GPU/TensorRT-LLM_example/README.md)
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
## 2. CPU Inference Options
|
| 19 |
+
|
| 20 |
+
### Option 1: ggml
|
| 21 |
+
[Usage guide](../inference-speed/CPU/ggml/README.md)
|
Llama-Chinese/examples/chat_gradio.py
ADDED
@@ -0,0 +1,99 @@
import gradio as gr
import time
from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
from threading import Thread
import torch, sys, os
import json
import pandas
import argparse

with gr.Blocks() as demo:
    gr.Markdown("""<h1><center>智能助手</center></h1>""")
    chatbot = gr.Chatbot()
    msg = gr.Textbox()
    state = gr.State()
    with gr.Row():
        clear = gr.Button("新话题")          # "new topic"
        re_generate = gr.Button("重新回答")  # "regenerate"
        sent_bt = gr.Button("发送")          # "send"
    with gr.Accordion("生成参数", open=False):  # "generation parameters"
        slider_temp = gr.Slider(minimum=0, maximum=1, label="temperature", value=0.3)
        slider_top_p = gr.Slider(minimum=0.5, maximum=1, label="top_p", value=0.95)
        slider_context_times = gr.Slider(minimum=0, maximum=5, label="上文轮次", value=0, step=2.0)  # context turns

    def user(user_message, history):
        return "", history + [[user_message, None]]

    def bot(history, temperature, top_p, slider_context_times):
        # On "regenerate", clear the previous answer before producing a new one.
        if pandas.isnull(history[-1][1]) == False:
            history[-1][1] = None
            yield history
        slider_context_times = int(slider_context_times)
        history_true = history[1:-1]
        prompt = ''
        # Prepend up to `slider_context_times` earlier turns in the <s>Human/<s>Assistant format.
        if slider_context_times > 0:
            prompt += '\n'.join(
                [("<s>Human: "+one_chat[0].replace('<br>', '\n')+'\n</s>' if one_chat[0] else '')
                 + "<s>Assistant: "+one_chat[1].replace('<br>', '\n')+'\n</s>'
                 for one_chat in history_true[-slider_context_times:]])
        prompt += "<s>Human: "+history[-1][0].replace('<br>', '\n')+"\n</s><s>Assistant:"
        # Keep only the last 512 prompt tokens.
        input_ids = tokenizer([prompt], return_tensors="pt", add_special_tokens=False).input_ids[:, -512:].to('cuda')
        generate_input = {
            "input_ids": input_ids,
            "max_new_tokens": 512,
            "do_sample": True,
            "top_k": 50,
            "top_p": top_p,
            "temperature": temperature,
            "repetition_penalty": 1.3,
            "streamer": streamer,
            "eos_token_id": tokenizer.eos_token_id,
            "bos_token_id": tokenizer.bos_token_id,
            "pad_token_id": tokenizer.pad_token_id
        }
        # Generate in a background thread; the streamer yields text as it arrives.
        thread = Thread(target=model.generate, kwargs=generate_input)
        thread.start()
        start_time = time.time()
        bot_message = ''
        print('Human:', history[-1][0])
        print('Assistant: ', end='', flush=True)
        for new_text in streamer:
            print(new_text, end='', flush=True)
            if len(new_text) == 0:
                continue
            if new_text != '</s>':
                bot_message += new_text
            # Stop accumulating if the model starts inventing the next Human turn.
            if 'Human:' in bot_message:
                bot_message = bot_message.split('Human:')[0]
            history[-1][1] = bot_message
            yield history
        end_time = time.time()
        print()
        # Generation time, output length, and seconds per character.
        print('生成耗时:', end_time-start_time, '文字长度:', len(bot_message), '字耗时:', (end_time-start_time)/len(bot_message))

    msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then(
        bot, [chatbot, slider_temp, slider_top_p, slider_context_times], chatbot
    )
    sent_bt.click(user, [msg, chatbot], [msg, chatbot], queue=False).then(
        bot, [chatbot, slider_temp, slider_top_p, slider_context_times], chatbot
    )
    re_generate.click(bot, [chatbot, slider_temp, slider_top_p, slider_context_times], chatbot)
    clear.click(lambda: [], None, chatbot, queue=False)

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_name_or_path", type=str, help='model name or path')
    parser.add_argument("--is_4bit", action='store_true', help='use 4bit model')
    args = parser.parse_args()
    tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path, use_fast=False)
    tokenizer.pad_token = tokenizer.eos_token
    if args.is_4bit == False:
        model = AutoModelForCausalLM.from_pretrained(
            args.model_name_or_path,
            device_map='cuda:0' if torch.cuda.is_available() else "auto",
            torch_dtype=torch.float16,
            load_in_8bit=True,
            trust_remote_code=True,
            use_flash_attention_2=True)
        model.eval()
    else:
        from auto_gptq import AutoGPTQForCausalLM
        model = AutoGPTQForCausalLM.from_quantized(
            args.model_name_or_path, low_cpu_mem_usage=True, device="cuda:0",
            use_triton=False, inject_fused_attention=False, inject_fused_mlp=False)
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True)
    if torch.__version__ >= "2" and sys.platform != "win32":
        model = torch.compile(model)
    demo.queue().launch(share=False, debug=True, server_name="0.0.0.0")
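A typical launch of the demo above, assuming the merged model has been downloaded from huggingface (the model path is illustrative):

```bash
python examples/chat_gradio.py --model_name_or_path FlagAlpha/Atom-7B-Chat
# gradio then serves the UI on all interfaces, on its default port 7860
```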
Llama-Chinese/examples/chat_gradio_no_merge.py
ADDED
@@ -0,0 +1,105 @@
import gradio as gr
import time
from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
from threading import Thread
from peft import PeftModel, PeftConfig
import torch, sys, os
import json
import pandas
import argparse

with gr.Blocks() as demo:
    gr.Markdown("""<h1><center>智能助手</center></h1>""")
    chatbot = gr.Chatbot()
    msg = gr.Textbox()
    state = gr.State()
    with gr.Row():
        clear = gr.Button("新话题")          # "new topic"
        re_generate = gr.Button("重新回答")  # "regenerate"
        sent_bt = gr.Button("发送")          # "send"
    with gr.Accordion("生成参数", open=False):  # "generation parameters"
        slider_temp = gr.Slider(minimum=0, maximum=1, label="temperature", value=0.3)
        slider_top_p = gr.Slider(minimum=0.5, maximum=1, label="top_p", value=0.95)
        slider_context_times = gr.Slider(minimum=0, maximum=5, label="上文轮次", value=0, step=2.0)  # context turns

    def user(user_message, history):
        return "", history + [[user_message, None]]

    def bot(history, temperature, top_p, slider_context_times):
        # On "regenerate", clear the previous answer before producing a new one.
        if pandas.isnull(history[-1][1]) == False:
            history[-1][1] = None
            yield history
        slider_context_times = int(slider_context_times)
        history_true = history[1:-1]
        prompt = ''
        # Prepend up to `slider_context_times` earlier turns in the <s>Human/<s>Assistant format.
        if slider_context_times > 0:
            prompt += '\n'.join(
                [("<s>Human: "+one_chat[0].replace('<br>', '\n')+'\n</s>' if one_chat[0] else '')
                 + "<s>Assistant: "+one_chat[1].replace('<br>', '\n')+'\n</s>'
                 for one_chat in history_true[-slider_context_times:]])
        prompt += "<s>Human: "+history[-1][0].replace('<br>', '\n')+"\n</s><s>Assistant:"
        # Keep only the last 512 prompt tokens.
        input_ids = tokenizer([prompt], return_tensors="pt", add_special_tokens=False).input_ids[:, -512:].to('cuda')
        generate_input = {
            "input_ids": input_ids,
            "max_new_tokens": 512,
            "do_sample": True,
            "top_k": 50,
            "top_p": top_p,
            "temperature": temperature,
            "repetition_penalty": 1.3,
            "streamer": streamer,
            "eos_token_id": tokenizer.eos_token_id,
            "bos_token_id": tokenizer.bos_token_id,
            "pad_token_id": tokenizer.pad_token_id
        }
        # Generate in a background thread; the streamer yields text as it arrives.
        thread = Thread(target=model.generate, kwargs=generate_input)
        thread.start()
        start_time = time.time()
        bot_message = ''
        print('Human:', history[-1][0])
        print('Assistant: ', end='', flush=True)
        for new_text in streamer:
            print(new_text, end='', flush=True)
            if len(new_text) == 0:
                continue
            if new_text != '</s>':
                bot_message += new_text
            # Stop accumulating if the model starts inventing the next Human turn.
            if 'Human:' in bot_message:
                bot_message = bot_message.split('Human:')[0]
            history[-1][1] = bot_message
            yield history
        end_time = time.time()
        print()
        # Generation time, output length, and seconds per character.
        print('生成耗时:', end_time-start_time, '文字长度:', len(bot_message), '字耗时:', (end_time-start_time)/len(bot_message))

    msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then(
        bot, [chatbot, slider_temp, slider_top_p, slider_context_times], chatbot
    )
    sent_bt.click(user, [msg, chatbot], [msg, chatbot], queue=False).then(
        bot, [chatbot, slider_temp, slider_top_p, slider_context_times], chatbot
    )
    re_generate.click(bot, [chatbot, slider_temp, slider_top_p, slider_context_times], chatbot)
    clear.click(lambda: [], None, chatbot, queue=False)

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_name_or_path", type=str, help='model name or path')
    parser.add_argument("--is_4bit", action='store_true', help='use 4bit model')
    args = parser.parse_args()
    # tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path, use_fast=False)
    # tokenizer.pad_token = tokenizer.eos_token
    if args.is_4bit == False:
        # Load the base model named in the adapter config, then attach the
        # LoRA adapter without merging the weights.
        config = PeftConfig.from_pretrained(args.model_name_or_path)
        tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path, use_fast=False)
        tokenizer.pad_token = tokenizer.eos_token
        model = AutoModelForCausalLM.from_pretrained(
            config.base_model_name_or_path,
            device_map='cuda:0' if torch.cuda.is_available() else "auto",
            torch_dtype=torch.float16,
            load_in_8bit=True,
            low_cpu_mem_usage=True,
            trust_remote_code=True,
            use_flash_attention_2=True)
        model = PeftModel.from_pretrained(model, args.model_name_or_path, device_map={"": 0})
        model.eval()
    else:
        from auto_gptq import AutoGPTQForCausalLM
        # Load the tokenizer from the quantized checkpoint directory so the
        # streamer below has one to work with.
        tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path, use_fast=False)
        tokenizer.pad_token = tokenizer.eos_token
        model = AutoGPTQForCausalLM.from_quantized(
            args.model_name_or_path, low_cpu_mem_usage=True, device="cuda:0",
            use_triton=False, inject_fused_attention=False, inject_fused_mlp=False)
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True)
    if torch.__version__ >= "2" and sys.platform != "win32":
        model = torch.compile(model)
    demo.queue().launch(share=False, debug=True, server_name="0.0.0.0")
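Unlike chat_gradio.py, this variant loads an unmerged PEFT adapter and resolves the base model from the adapter config, so the argument should point at the adapter directory. An illustrative launch (the adapter path is an assumption, matching the one used in finetune_test.py):

```bash
python examples/chat_gradio_no_merge.py --model_name_or_path train/sft/output
```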
Llama-Chinese/examples/llama2_for_langchain.py
ADDED
@@ -0,0 +1,52 @@
from langchain.llms.base import LLM
from typing import Dict, List, Any, Optional
import torch, sys, os
from transformers import AutoTokenizer


class Llama2(LLM):
    max_token: int = 2048
    temperature: float = 0.1
    top_p: float = 0.95
    tokenizer: Any
    model: Any

    def __init__(self, model_name_or_path, bit4=False):
        super().__init__()
        self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=False)
        self.tokenizer.pad_token = self.tokenizer.eos_token
        if bit4 == False:
            from transformers import AutoModelForCausalLM
            device_map = "cuda:0" if torch.cuda.is_available() else "auto"
            self.model = AutoModelForCausalLM.from_pretrained(
                model_name_or_path, device_map=device_map, torch_dtype=torch.float16,
                load_in_8bit=True, trust_remote_code=True, use_flash_attention_2=True)
            self.model.eval()
        else:
            from auto_gptq import AutoGPTQForCausalLM
            self.model = AutoGPTQForCausalLM.from_quantized(
                model_name_or_path, low_cpu_mem_usage=True, device="cuda:0",
                use_triton=False, inject_fused_attention=False, inject_fused_mlp=False)

        if torch.__version__ >= "2" and sys.platform != "win32":
            self.model = torch.compile(self.model)

    @property
    def _llm_type(self) -> str:
        return "Llama2"

    def _call(self, prompt: str, stop: Optional[List[str]] = None) -> str:
        print('prompt:', prompt)
        input_ids = self.tokenizer(prompt, return_tensors="pt", add_special_tokens=False).input_ids.to('cuda')
        generate_input = {
            "input_ids": input_ids,
            "max_new_tokens": 1024,
            "do_sample": True,
            "top_k": 50,
            "top_p": self.top_p,
            "temperature": self.temperature,
            "repetition_penalty": 1.2,
            "eos_token_id": self.tokenizer.eos_token_id,
            "bos_token_id": self.tokenizer.bos_token_id,
            "pad_token_id": self.tokenizer.pad_token_id
        }
        generate_ids = self.model.generate(**generate_input)
        # Strip the prompt tokens and the trailing EOS from each returned sequence.
        generate_ids = [item[len(input_ids[0]):-1] for item in generate_ids]
        result_message = self.tokenizer.batch_decode(
            generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        return result_message
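A minimal sketch of how the wrapper above plugs into LangChain (the model path and prompt are illustrative):

```python
from llama2_for_langchain import Llama2

# LangChain LLM objects are callable; the prompt must already follow the
# <s>Human/<s>Assistant template, since _call() passes it through verbatim.
llm = Llama2('FlagAlpha/Atom-7B-Chat')
print(llm("<s>Human: 介绍一下北京\n</s><s>Assistant:"))
```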
Llama-Chinese/finetune_test.py
ADDED
@@ -0,0 +1,32 @@
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel, PeftConfig

# e.g. finetune_model_path='train/sft/output/checkpoint-6160'
finetune_model_path = 'train/sft/output/'
config = PeftConfig.from_pretrained(finetune_model_path)
# e.g. base_model_name_or_path='meta-llama/Llama-2-7b-chat'
print(config.base_model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path, use_fast=False)
tokenizer.pad_token = tokenizer.eos_token
device_map = "cuda:0" if torch.cuda.is_available() else "auto"
model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path, device_map=device_map, torch_dtype=torch.float16,
    load_in_8bit=True, trust_remote_code=True, use_flash_attention_2=True)
# Attach the fine-tuned LoRA adapter on top of the base model.
model = PeftModel.from_pretrained(model, finetune_model_path, device_map={"": 0})
model = model.eval()
input_ids = tokenizer(['<s>Human: 介绍一下北京\n</s><s>Assistant: '], return_tensors="pt", add_special_tokens=False).input_ids
if torch.cuda.is_available():
    input_ids = input_ids.to('cuda')
generate_input = {
    "input_ids": input_ids,
    "max_new_tokens": 512,
    "do_sample": True,
    "top_k": 50,
    "top_p": 0.95,
    "temperature": 0.3,
    "repetition_penalty": 1.3,
    "eos_token_id": tokenizer.eos_token_id,
    "bos_token_id": tokenizer.bos_token_id,
    "pad_token_id": tokenizer.pad_token_id
}
generate_ids = model.generate(**generate_input)
text = tokenizer.decode(generate_ids[0])
print(text)
Llama-Chinese/important-change.txt
ADDED
@@ -0,0 +1,2 @@
root/miniconda3/envs/new-Llama/lib/python3.10/site-packages/deepspeed/ops/op_builder/builder.py Line 88
add return True
Llama-Chinese/inference-speed/CPU/ggml/README.md
ADDED
@@ -0,0 +1,66 @@
## Quantized deployment with llama.cpp

Using the [llama.cpp tool](https://github.com/Rayrtfr/llama.cpp) as an example, this guide covers the detailed steps for quantizing the model and deploying it locally. On Windows you may additionally need to install build tools such as cmake. **For a quick local deployment we recommend the instruction-tuned [Atom-7B-Chat](https://github.com/LlamaFamily/Llama-Chinese?tab=readme-ov-file#%E5%9F%BA%E4%BA%8Ellama2%E7%9A%84%E4%B8%AD%E6%96%87%E9%A2%84%E8%AE%AD%E7%BB%83%E6%A8%A1%E5%9E%8Batom) model; if your hardware allows, the 6-bit or 8-bit quantized models give better quality.** Before running, make sure that:

1. The system provides a `make` (bundled with macOS/Linux) or `cmake` (installed separately on Windows) build tool
2. Python 3.10 or later is recommended for building and running the tool


### Step 1: Clone and build llama.cpp

1. (Optional) If you downloaded the repository earlier, run `git pull` to fetch the latest code, **and run `make clean` to clean up old build artifacts**
2. Clone the latest llama.cpp repository adapted for the Atom model

```bash
$ git clone https://github.com/Rayrtfr/llama.cpp
```

3. Build the llama.cpp project, producing the `./main` (inference) and `./quantize` (quantization) binaries.

```bash
$ make
```

**Windows/Linux users** who want GPU inference should [build with BLAS (or cuBLAS if a GPU is available)](https://github.com/Rayrtfr/llama.cpp#blas-build), which speeds up prompt processing. The following builds with cuBLAS, for NVIDIA GPUs. Reference: [llama.cpp#blas-build](https://github.com/Rayrtfr/llama.cpp#blas-build)

```bash
$ make LLAMA_CUBLAS=1
```

**macOS users** need no extra steps: llama.cpp is already optimized for ARM NEON and BLAS is enabled automatically. On M-series chips, Metal-based GPU inference is recommended for a significant speedup; just change the build command to `LLAMA_METAL=1 make`, see [llama.cpp#metal-build](https://github.com/Rayrtfr/llama.cpp#metal-build)

```bash
$ LLAMA_METAL=1 make
```

### Step 2: Generate a quantized model

llama.cpp can convert both `.safetensors` files and huggingface-format `.bin` files into FP16 GGUF.

`/path/Atom-7B-Chat` is the directory the model was downloaded to.
```bash
$ python convert.py --outfile ./atom-7B-cpp.gguf /path/Atom-7B-Chat

$ ./quantize ./atom-7B-cpp.gguf ./ggml-atom-7B-q4_0.gguf q4_0
```

### Step 3: Load and run the model


- For GPU inference with a cuBLAS/Metal build, tell `./main` how many layers to offload, e.g. `-ngl 40` offloads 40 layers of model parameters to the GPU


Start a chat with the following command.
```bash
text="<s>Human: 介绍一下北京\n</s><s>Assistant:"
./main -m \
./ggml-atom-7B-q4_0.gguf \
-p "${text}" \
--logdir ./logtxt
```
To carry chat context, adjust the text above to something like this:
```bash
text="<s>Human: 介绍一下北京\n</s><s>Assistant:北京是一个美丽的城市</s>\n<s>Human: 再介绍一下合肥\n</s><s>Assistant:"
```

For more detail see the official documentation: [https://github.com/ggerganov/llama.cpp/tree/master/examples/main](https://github.com/ggerganov/llama.cpp/tree/master/examples/main)
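If you prefer calling the quantized model from Python rather than the `./main` CLI, a minimal sketch using the third-party llama-cpp-python bindings (an assumption; the bindings are not part of this repository) could look like this:

```python
# pip install llama-cpp-python
# The GGUF path matches the file produced in Step 2; n_gpu_layers mirrors -ngl.
from llama_cpp import Llama

llm = Llama(model_path="./ggml-atom-7B-q4_0.gguf", n_gpu_layers=40)
out = llm("<s>Human: 介绍一下北京\n</s><s>Assistant:", max_tokens=256, stop=["</s>"])
print(out["choices"][0]["text"])
```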
Llama-Chinese/inference-speed/GPU/FasterTransformer_example/README.md
ADDED
@@ -0,0 +1,165 @@
# FasterTransformer && Triton installation and usage

FasterTransformer & Triton accelerate LLama2 model inference. fp16 and Int8 inference are currently supported; Int4 is not yet supported.

## 0. Prepare environment variables

```bash
export BUILD_DICTIONARY="/workspace/build"
export TRITON_VERSION=23.04
```


## 1. Build the image


1. Build the image

```bash
cd $BUILD_DICTIONARY
git clone https://github.com/Rayrtfr/fastertransformer_backend.git

cd $BUILD_DICTIONARY/fastertransformer_backend

export TRITON_VERSION=23.04

# If you would rather not build with the command below, you can pull our prebuilt image directly: docker pull xiangtao1994/atom_triton_ft:23.04
docker build --build-arg TRITON_VERSION=${TRITON_VERSION} -t triton_ft_backend:${TRITON_VERSION} -f docker/Dockerfile .

```
The TRITON_VERSION=23.04 image requires GPU Driver Version 535.54.03. If your GPU driver differs, consult [https://docs.nvidia.com/deeplearning/triton-inference-server/release-notes/rel-22-12.html#rel-22-12](https://docs.nvidia.com/deeplearning/triton-inference-server/release-notes/rel-22-12.html#rel-22-12)
to find the triton-inference-server release matching your cuda driver version.


2. Start the container

```
# start the container
export TRITON_VERSION=23.04

# note: BUILD_DICTIONARY must be mounted into the container
docker run -idt --gpus=all --net=host --shm-size=4G --name triton_ft_backend_pure \
  -v $BUILD_DICTIONARY:$BUILD_DICTIONARY \
  -p18888:8888 -p18000:8000 -p18001:8001 -p18002:8002 triton_ft_backend:${TRITON_VERSION} bash

```

## 2. Working inside the container

The following converts the [Atom-7B-Chat](https://huggingface.co/FlagAlpha/Atom-7B-Chat) model weights into the FasterTransformer format; [Llama2-Chinese-13b-Chat](https://huggingface.co/FlagAlpha/Llama2-Chinese-13b-Chat) works the same way.

1. Convert the weights into the FasterTransformer format

```
cd $BUILD_DICTIONARY && git clone https://github.com/Rayrtfr/FasterTransformer.git

cd $BUILD_DICTIONARY/FasterTransformer

mkdir models && chmod -R 777 ./*

python3 ./examples/cpp/llama/huggingface_llama_convert.py \
-saved_dir=./models/llama \
-in_file=/path/FlagAlpha/Atom-7B-Chat \
-infer_gpu_num=1 \
-weight_data_type=fp16 \
-model_name=llama
```

2. Adjust the model configuration

- Edit config.pbtxt

``` bash
mkdir $BUILD_DICTIONARY/triton-model-store/

cd $BUILD_DICTIONARY/triton-model-store/

cp -r $BUILD_DICTIONARY/fastertransformer_backend/all_models/llama $BUILD_DICTIONARY/triton-model-store/

# edit triton-model-store/llama/fastertransformer/config.pbtxt

parameters {
  key: "tensor_para_size"
  value: {
    string_value: "1"
  }
}

## set model_checkpoint_path to the path produced by the conversion above
parameters {
  key: "model_checkpoint_path"
  value: {
    string_value: "/workspace/build/FasterTransformer/models/llama/1-gpu/"
  }
}

## for int8 inference, additionally add the following
parameters {
  key: "int8_mode"
  value: {
    string_value: "1"
  }
}
```


Edit model.py

```
# edit these two files
triton-model-store/llama/preprocessing/1/model.py
triton-model-store/llama/postprocessing/1/model.py

# check that this path points to the tokenizer
self.tokenizer = LlamaTokenizer.from_pretrained("/path/FlagAlpha/Atom-7B-Chat")
```


3. Build the FasterTransformer library

(For a given model type, a single build is enough.)
Before building, check FasterTransformer/examples/cpp/llama/llama_config.ini

```bash
# 1 for single-GPU inference; for multi-GPU set this to the number of cards
tensor_para_size=1

model_dir=/workspace/build/FasterTransformer/models/llama/1-gpu/
```

Build FasterTransformer
```bash
cd $BUILD_DICTIONARY/FasterTransformer

git submodule init && git submodule update
pip3 install fire jax jaxlib transformers

mkdir build && cd build
cmake -DSM=86 -DCMAKE_BUILD_TYPE=Release -DBUILD_PYT=ON -DBUILD_MULTI_GPU=ON -D PYTHON_PATH=/usr/bin/python3 ..
make -j12
make install
```


## 3. Start the triton server

Run the following inside the same container.
```
CUDA_VISIBLE_DEVICES=0 /opt/tritonserver/bin/tritonserver --model-repository=$BUILD_DICTIONARY/triton-model-store/llama/
```
Output
```
I0717 17:17:14.670037 70681 grpc_server.cc:2450] Started GRPCInferenceService at 0.0.0.0:8001
I0717 17:17:14.670495 70681 http_server.cc:3555] Started HTTPService at 0.0.0.0:8000
I0717 17:17:14.713000 70681 http_server.cc:185] Started Metrics Service at 0.0.0.0:8002
```


Still inside the container, start the test client (when running outside the container, adjust the port in the url argument accordingly)

```
python3 $BUILD_DICTIONARY/fastertransformer_backend/inference_example/llama/llama_grpc_stream_client.py \
--url 127.0.0.1:8001 \
--hf_model_location /path/FlagAlpha/Atom-7B-Chat \
-topp 0.95
```
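Before running the client you can verify that Triton is up via its standard HTTP health endpoint (a generic Triton feature, not specific to this repo):

```bash
# inside the container; from the host, use the mapped port 18000 instead of 8000
curl -v http://127.0.0.1:8000/v2/health/ready
# returns HTTP 200 once the model repository has loaded
```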
Llama-Chinese/inference-speed/GPU/JittorLLMs_example/README.md
ADDED
@@ -0,0 +1,96 @@
# JittorLLMs inference deployment

## Requirements

* Memory: at least 2G, 32G recommended
* GPU memory: optional, 16G recommended
* Operating system: Windows, Mac and Linux are all supported.
* Disk space: at least 40GB free, for downloading parameters and storing swap files.
* Python version: at least `3.9`.

If disk space is tight, the cache location can be changed via the `JITTOR_HOME` environment variable.
If you run out of memory or GPU memory and the process gets killed, see [running with limited memory](#running-with-limited-memory) below.

## Deployment

Install the dependencies with the commands below. (Note: this script installs the Jittor build of torch; running it in a fresh environment is recommended.)

```
# inside China, clone via gitlink
git clone https://gitlink.org.cn/jittor/JittorLLMs.git --depth 1
# github: git clone https://github.com/Jittor/JittorLLMs.git --depth 1
cd JittorLLMs
# -i selects the jittor package index, -I forces a reinstall of the Jittor build of torch
pip install -r requirements.txt -i https://pypi.jittor.org/simple -I
```

If you hit a "jittor version not found" error, the mirror you are using may be stale; update to the latest release with: `pip install jittor -U -i https://pypi.org/simple`

Deployment then takes a single command:

```
python cli_demo.py atom7b
```

On first run the model files are downloaded from the server automatically, which uses some disk space under the root directory.
The very first run also compiles some CUDA operators, so loading takes a while.

If you run out of memory or GPU memory and the process gets killed, see [running with limited memory](#running-with-limited-memory) below.

### WebDemo

Through the gradio library, JittorLLM lets users chat with the model directly in the browser.

~~~bash
python web_demo.py atom7b
~~~

### Backend service deployment

JittorLLM ships an example backend service in api.py.

~~~bash
python api.py atom7b
~~~

It can then be called directly with code like this:

~~~python
import json
import requests

post_data = json.dumps({'prompt': 'Hello, solve 5x=13'})
print(json.loads(requests.post("http://0.0.0.0:8000", post_data).text)['response'])
~~~

## Running with limited memory

To address the pain point of large GPU-memory consumption, the Jittor team developed dynamic swapping. Jittor is the first framework to support automatic swapping of dynamic-graph variables: unlike earlier static-graph swapping techniques, users need not modify any code, native dynamic-graph code supports tensor swapping directly, and tensor data moves automatically between GPU memory, RAM and disk, lowering the development burden.

Jittor's LLM inference library also has the lowest hardware requirements of its kind: with only the disk space for the parameters and 2G of RAM, and no GPU at all, large models can still be deployed. Comparing resource consumption and speed across hardware configurations shows that with ample GPU memory JittorLLMs outperforms comparable frameworks, while with scarce GPU memory, or no GPU, it still runs at a usable speed.

To save memory, install Jittor version 1.3.7.8 or later and set the following environment variables:
```bash
export JT_SAVE_MEM=1
# limit cpu memory usage to at most 16G
export cpu_mem_limit=16000000000
# limit device memory (gpu, tpu, ...) usage to at most 8G
export device_mem_limit=8000000000
# Windows users: use powershell
# $env:JT_SAVE_MEM="1"
# $env:cpu_mem_limit="16000000000"
# $env:device_mem_limit="8000000000"
```
You are free to choose the cpu and device memory budgets; set a limit to `-1` to leave that memory unlimited.
```bash
# no limit on cpu memory usage
export cpu_mem_limit=-1
# no limit on device memory (gpu, tpu, ...) usage
export device_mem_limit=-1
# Windows users: use powershell
# $env:JT_SAVE_MEM="1"
# $env:cpu_mem_limit="-1"
# $env:device_mem_limit="-1"
```

To clean up the disk swap files, run
```bash
python -m jittor_utils.clean_cache swap
```
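For example, to move the Jittor cache (model downloads and swap files) onto a larger volume before running the demos (the target path is illustrative):

```bash
export JITTOR_HOME=/data/jittor_cache
python cli_demo.py atom7b
```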
Llama-Chinese/inference-speed/GPU/TensorRT-LLM_example/README.md
ADDED
@@ -0,0 +1,72 @@
# Deploying LLama2 or Atom with NVIDIA TensorRT-LLM

[TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM/tree/main) is a high-performance inference framework developed by NVIDIA; you can follow the steps below to deploy a LLama2 or Atom model with it.

The deployment flow below follows [TensorRT-LLM/example/llama](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/llama) and requires an Nvidia GPU driver of version 535 or above

## Support Matrix
* FP16
* FP8
* INT8 & INT4 Weight-Only
* SmoothQuant
* Groupwise quantization (AWQ/GPTQ)
* FP8 KV CACHE
* INT8 KV CACHE (+ AWQ/per-channel weight-only)
* Tensor Parallel
* STRONGLY TYPED

## 1. Install TensorRT-LLM
#### Fetch the TensorRT-LLM code:

```bash
# the TensorRT-LLM code must be pulled with git-lfs
apt-get update && apt-get -y install git git-lfs

git clone https://github.com/NVIDIA/TensorRT-LLM.git
cd TensorRT-LLM

# this walkthrough uses the v0.7.0 release
git checkout tags/v0.7.0 -b release/0.7.0
git submodule update --init --recursive
git lfs install
git lfs pull
```
#### Build the docker image and install TensorRT-LLM
```bash
make -C docker release_build
```

#### Run the docker image:
```bash
make -C docker release_run
```

## 2. Build the TensorRT-LLM inference engine for the LLama2 model:

#### Enter the build folder:
```bash
cd ./examples/llama
```

#### Download the Atom or LLama2 model from Huggingface:
```
# download whichever model you want to deploy
git clone https://huggingface.co/FlagAlpha/Atom-7B-Chat Atom-7B-Chat
mv Atom-7B-Chat /origin_model
```

#### Build the inference engine with build.py:
The following is a common example; see [TensorRT-LLM/example/llama](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/llama) for more parameters
```bash
python build.py --max_batch_size 1 --max_num_tokens 8192 --model_dir /origin_model --dtype float16 --remove_input_padding --use_inflight_batching --paged_kv_cache --use_weight_only --enable_context_fmha --use_gpt_attention_plugin float16 --use_gemm_plugin float16 --output_dir /model/tensorrt_llm/1 --world_size 1 --tp_size 1 --pp_size 1 --max_input_len 7168 --max_output_len 1024 --multi_block_mode --rotary_scaling dynamic 8.0 --rotary_base 500000
```

## 3. Run inference with the TensorRT-LLM Python Runtime

#### Start a single-node, single-GPU run with the python class we provide
```bash
# arg 1: the output_dir passed to build.py
# arg 2: the path of the model tokenizer
# arg 3: the question to ask
python atom_inference.py /model/tensorrt_llm/1 /origin_model 如何成为一个更加优秀的人
```
Llama-Chinese/inference-speed/GPU/TensorRT-LLM_example/atom_inference.py
ADDED
@@ -0,0 +1,184 @@
import csv
import sys
from pathlib import Path

import numpy as np
import torch
from utils import (DEFAULT_HF_MODEL_DIRS, DEFAULT_PROMPT_TEMPLATES,
                   load_tokenizer, read_model_name, throttle_generator)

import tensorrt_llm
from tensorrt_llm.logger import logger
from tensorrt_llm.runtime import PYTHON_BINDINGS, ModelRunner

if PYTHON_BINDINGS:
    from tensorrt_llm.runtime import ModelRunnerCpp

class AtomTRTApi:
    def __init__(self, engine_dir, tokenizer_dir, max_input_length=4096):
        self.runtime_rank = tensorrt_llm.mpi_rank()
        self.model_name = read_model_name(engine_dir)

        self.tokenizer, self.pad_id, self.end_id = load_tokenizer(
            tokenizer_dir=tokenizer_dir,
            tokenizer_type='llama',
        )
        self.use_py_session = False
        if not PYTHON_BINDINGS:
            logger.warning(
                "Python bindings of C++ session is unavailable, fallback to Python session."
            )
            self.use_py_session = True
        runner_cls = ModelRunner if self.use_py_session else ModelRunnerCpp
        runner_kwargs = dict(engine_dir=engine_dir,
                             lora_dir=None,
                             rank=self.runtime_rank,
                             debug_mode=False,
                             lora_ckpt_source='hf')

        if not self.use_py_session:
            runner_kwargs.update(
                max_batch_size=1,
                max_input_len=max_input_length,
                max_output_len=2048,
                max_beam_width=1,
                max_attention_window_size=None)
        self.runner = runner_cls.from_dir(**runner_kwargs)


    def ask(self, input_text, temperature=0.4, top_p=0.95, max_new_tokens=1024, repetition_penalty=1.2, system_prefix='', merge_lambda=None, max_input_length=4096, append_next_role=True):
        with torch.no_grad():
            prompt = ''
            print('max_input_length', max_input_length)
            # A list input is treated as chat history; walk it backwards and keep
            # as many turns as fit within max_input_length.
            if type(input_text) == list:
                for input_text_one in input_text[::-1]:
                    if len(prompt) + len("<s>"+input_text_one['role']+": "+input_text_one['content'].strip()+"\n</s>") < max_input_length:
                        prompt = "<s>"+input_text_one['role']+": "+input_text_one['content'].strip()+"\n</s>" + prompt
                if append_next_role:
                    if input_text[-1]['role'] == 'Human':
                        prompt += "<s>Assistant:"
                    else:
                        prompt += "<s>Human:"
            else:
                if merge_lambda is None:
                    if append_next_role:
                        prompt += "<s>Human: "+input_text.strip()+"\n</s><s>Assistant:"
                    else:
                        prompt += "<s>Human: "+input_text.strip()+"\n</s>"
                else:
                    prompt += merge_lambda(input_text)
            if len(system_prefix) > 0:
                prompt = '<s>System: '+system_prefix.strip()+'\n</s>'+prompt
            print('输入模型的完整输入:', prompt)  # full prompt fed to the model
            input_ids = [self.tokenizer(prompt, add_special_tokens=False).input_ids]
            print(input_ids)
            input_ids = [
                torch.tensor(x, dtype=torch.int32).unsqueeze(0) for x in input_ids
            ]
            print('输入模型的token数量', input_ids[0].shape)  # number of input tokens
            generate_input = {
                "batch_input_ids": input_ids,
                "max_new_tokens": max_new_tokens,
                "max_attention_window_size": None,
                "do_sample": True,
                "top_k": 50,
                "top_p": top_p,
                "num_beams": 1,
                "length_penalty": 1.0,
                "stop_words_list": None,
                "bad_words_list": None,
                "streaming": False,
                "temperature": temperature,
                "output_sequence_lengths": True,
                "return_dict": False,
                "repetition_penalty": repetition_penalty,
                "end_id": self.tokenizer.eos_token_id,
                "bos_token_id": self.tokenizer.bos_token_id,
                "pad_id": self.tokenizer.pad_token_id
            }
            generate_ids = self.runner.generate(**generate_input)
            torch.cuda.synchronize()
            print(generate_ids)
            generate_ids = generate_ids.cpu().tolist()
            # Drop the prompt tokens, then truncate at the first EOS if present.
            generate_ids = [item[0][len(input_ids[0][0]):] for item in generate_ids]
            try:
                generate_ids = [item[:item.index(self.tokenizer.eos_token_id)] for item in generate_ids]
            except:
                pass
            print(generate_ids)
            # output = ''.join(tokenizer.convert_ids_to_tokens(generate_ids[0]))
            # print('生成的token长度', len(generate_ids[0]))
            bot_message = self.tokenizer.batch_decode(generate_ids, skip_special_tokens=False, clean_up_tokenization_spaces=False)[0]
            if 'Human:' in bot_message:
                bot_message = bot_message.split('Human:')[0]
            print(bot_message)
            return bot_message.strip()

    def ask_streaming(self, input_text, temperature=0.8, top_p=0.95, max_new_tokens=1024, repetition_penalty=1.2, system_prefix='', max_input_length=4096, append_next_role=True):
        with torch.no_grad():
            prompt = ''
            print('max_input_length', max_input_length)
            if type(input_text) == list:
                for input_text_one in input_text[::-1]:
                    if len(prompt) + len("<s>"+input_text_one['role']+": "+input_text_one['content'].strip()+"\n</s>") < max_input_length:
                        prompt = "<s>"+input_text_one['role']+": "+input_text_one['content'].strip()+"\n</s>" + prompt
                if append_next_role:
                    if input_text[-1]['role'] == 'Human':
                        prompt += "<s>Assistant:"
                    else:
                        prompt += "<s>Human:"
            else:
                if append_next_role:
                    prompt += "<s>Human: "+input_text.strip()+"\n</s><s>Assistant:"
                else:
                    prompt += "<s>Human: "+input_text.strip()+"\n</s>"
            if len(system_prefix) > 0:
                prompt = '<s>System: '+system_prefix.strip()+'\n</s>'+prompt
            print('输入模型的完整输入:', prompt)  # full prompt fed to the model
            input_ids = [self.tokenizer(prompt, add_special_tokens=False).input_ids]
            print(input_ids)
            input_ids = [
                torch.tensor(x, dtype=torch.int32).unsqueeze(0) for x in input_ids
            ]
            print('输入模型的token数量', input_ids[0].shape)  # number of input tokens
            generate_input = {
                "batch_input_ids": input_ids,
                "max_new_tokens": max_new_tokens,
                "max_attention_window_size": None,
                "do_sample": True,
                "top_k": 50,
                "top_p": top_p,
                "num_beams": 1,
                "length_penalty": 1.0,
                "stop_words_list": None,
                "bad_words_list": None,
                "streaming": True,
                "temperature": temperature,
                "output_sequence_lengths": True,
                "return_dict": True,
                "repetition_penalty": repetition_penalty,
                "end_id": self.tokenizer.eos_token_id,
                "bos_token_id": self.tokenizer.bos_token_id,
                "pad_id": self.tokenizer.pad_token_id
            }
            generate_ids = self.runner.generate(**generate_input)
            torch.cuda.synchronize()

            input_token_num = len(input_ids[0][0])
            answer_message = ''
            # Re-decode the full answer every 2 streaming steps and yield it.
            for curr_outputs in throttle_generator(generate_ids, 2):
                output_ids = curr_outputs['output_ids']
                sequence_lengths = curr_outputs['sequence_lengths']
                # print(sequence_lengths)
                output_ids = output_ids.cpu().tolist()
                output_ids = [item[0][input_token_num:sequence_lengths[0][0]] for item in output_ids]
                answer_message = self.tokenizer.batch_decode(output_ids, skip_special_tokens=False, clean_up_tokenization_spaces=False)[0]
                if 'Human:' in answer_message:
                    answer_message = answer_message.split('Human:')[0]
                yield answer_message.strip()
            return answer_message.strip()


if __name__ == '__main__':
    model = AtomTRTApi(engine_dir=sys.argv[1], tokenizer_dir=sys.argv[2])
    # Take the question from the third CLI argument (as documented in the
    # README), falling back to a default question.
    question = sys.argv[3] if len(sys.argv) > 3 else '如何成为一个更优秀的人'
    model.ask(question)
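The streaming variant can be consumed incrementally; a minimal sketch using the same illustrative paths as the README:

```python
model = AtomTRTApi(engine_dir='/model/tensorrt_llm/1', tokenizer_dir='/origin_model')
# each yielded value is the decoded answer so far, refreshed every 2 generation steps
for partial in model.ask_streaming('如何成为一个更优秀的人'):
    print(partial)
```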
Llama-Chinese/inference-speed/GPU/TensorRT-LLM_example/utils.py
ADDED
@@ -0,0 +1,130 @@
# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json
from pathlib import Path
from typing import Optional
from typing import Union

from transformers import AutoTokenizer, T5Tokenizer

import tensorrt_llm

DEFAULT_HF_MODEL_DIRS = {
    'baichuan': 'baichuan-inc/Baichuan-13B-Chat',
    'bloom': 'bigscience/bloom-560m',
    'chatglm_6b': 'THUDM/chatglm-6b',
    'chatglm2_6b': 'THUDM/chatglm2-6b',
    'chatglm2_6b_32k': 'THUDM/chatglm2-6b-32k',
    'chatglm3_6b': 'THUDM/chatglm3-6b',
    'chatglm3_6b_base': 'THUDM/chatglm3-6b-base',
    'chatglm3_6b_32k': 'THUDM/chatglm3-6b-32k',
    'falcon': 'tiiuae/falcon-rw-1b',
    'glm_10b': 'THUDM/glm-10b',
    'gpt': 'gpt2-medium',
    'gptj': 'EleutherAI/gpt-j-6b',
    'gptneox': 'EleutherAI/gpt-neox-20b',
    'internlm': 'internlm/internlm-chat-7b',
    'llama': 'meta-llama/Llama-2-7b-hf',
    'mpt': 'mosaicml/mpt-7b',
    'phi': 'microsoft/phi-2',
    'opt': 'facebook/opt-350m',
    'qwen': 'Qwen/Qwen-7B',
}

DEFAULT_PROMPT_TEMPLATES = {
    'internlm':
    "<|User|>:{input_text}<eoh>\n<|Bot|>:",
    'qwen':
    "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{input_text}<|im_end|>\n<|im_start|>assistant\n",
}

def get_engine_version(engine_dir: str) -> Union[None, str]:
    engine_dir = Path(engine_dir)
    config_path = engine_dir / "config.json"
    with open(config_path, 'r') as f:
        config = json.load(f)

    if 'version' not in config:
        return None

    return config['version']

def read_model_name(engine_dir: str):
    engine_version = get_engine_version(engine_dir)

    with open(Path(engine_dir) / "config.json", 'r') as f:
        config = json.load(f)

    if engine_version is None:
        return config['builder_config']['name']

    return config['pretrained_config']['architecture']


def throttle_generator(generator, stream_interval):
    # Pass through only every `stream_interval`-th item of the generator ...
    for i, out in enumerate(generator):
        if not i % stream_interval:
            yield out

    # ... and always yield the final item if the loop skipped it.
    if i % stream_interval:
        yield out


def load_tokenizer(tokenizer_dir: Optional[str] = None,
                   vocab_file: Optional[str] = None,
                   model_name: str = 'gpt',
                   tokenizer_type: Optional[str] = None):
    if vocab_file is None:
        use_fast = True
        if tokenizer_type is not None and tokenizer_type == "llama":
            use_fast = False
        # Should set both padding_side and truncation_side to be 'left'
        tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir,
                                                  legacy=False,
                                                  padding_side='left',
                                                  truncation_side='left',
                                                  trust_remote_code=True,
                                                  tokenizer_type=tokenizer_type,
                                                  use_fast=use_fast)
    else:
        # For gpt-next, directly load from tokenizer.model
        assert model_name == 'gpt'
        tokenizer = T5Tokenizer(vocab_file=vocab_file,
                                padding_side='left',
                                truncation_side='left')

    if model_name == 'qwen':
        with open(Path(tokenizer_dir) / "generation_config.json") as f:
            gen_config = json.load(f)
        chat_format = gen_config['chat_format']
        if chat_format == 'raw':
            pad_id = gen_config['pad_token_id']
            end_id = gen_config['eos_token_id']
        elif chat_format == 'chatml':
            pad_id = tokenizer.im_end_id
            end_id = tokenizer.im_end_id
        else:
            raise Exception(f"unknown chat format: {chat_format}")
    elif model_name == 'glm_10b':
        pad_id = tokenizer.pad_token_id
        end_id = tokenizer.eop_token_id
    else:
        if tokenizer.pad_token_id is None:
            tokenizer.pad_token_id = tokenizer.eos_token_id
        pad_id = tokenizer.pad_token_id
        end_id = tokenizer.eos_token_id

    return tokenizer, pad_id, end_id
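For reference, this is how atom_inference.py consumes the helper above (the tokenizer path is illustrative):

```python
tokenizer, pad_id, end_id = load_tokenizer(tokenizer_dir='/origin_model',
                                           tokenizer_type='llama')
```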
Llama-Chinese/inference-speed/GPU/lmdeploy_example/README.md
ADDED
@@ -0,0 +1,126 @@
# lmdeploy installation and usage

lmdeploy supports transformer architectures (e.g. Atom, LLaMA, LLaMa2, InternLM, Vicuna), currently with fp16, int8 and int4.

## 1. Installation

Install the prebuilt python package
```
python3 -m pip install lmdeploy==0.2.1
```

## 2. Convert the huggingface model to the lmdeploy format

Convert the model into lmdeploy's inference format. Assuming the huggingface [Atom-7B-Chat](https://huggingface.co/FlagAlpha/Atom-7B-Chat) model has been downloaded to `/models/Atom-7B-Chat`, the result is written to a `workspace` folder under the current working directory

```shell
lmdeploy convert llama2 /models/Atom-7B-Chat
```
Patch one bug in lmdeploy
```
sed -i 's/from .utils import get_logger/from transformers.utils.logging import get_logger/g' ./workspace/model_repository/preprocessing/1/tokenizer/tokenizer.py
sed -i 's/from .utils import get_logger/from transformers.utils.logging import get_logger/g' ./workspace/model_repository/postprocessing/1/tokenizer/tokenizer.py
```


## 3. kv cache int8 quantization
For an Atom-7B fp16 model with a maximum length of 2048, every concurrent session on the server needs roughly 1030MB of GPU memory for kv_cache, so even an A100 80G can serve only a limited number of users.
To reduce runtime GPU memory, lmdeploy implements kv cache PTQ quantization, letting the same amount of memory serve more concurrent users.
First run calibration on the model and save the statistics to a temporary directory atom
```shell
mkdir atom
# arguments:
#   /models/Atom-7B-Chat : the huggingface Atom model; llama/vicuna/internlm/baichuan etc. also work
#   --calib-dataset      : calibration dataset; c4, ptb, wikitext2 and pileval are supported
#   --calib-samples      : number of calibration samples; reduce it if GPU memory is tight
#   --device             : device to run calibration on
#   --work-dir           : folder for the pth-format quantization statistics and quantized weights
lmdeploy lite calibrate \
  /models/Atom-7B-Chat \
  --calib-dataset 'ptb' \
  --calib-samples 128 \
  --device 'cuda' \
  --work-dir atom
```
Note: flash_attn may need to be installed
```shell
conda install -c nvidia cuda-nvcc # so flash_attn builds against the conda cuda environment
pip install flash_attn
```


Then use the statistics in the atom directory to compute the quantization parameters and write them to `workspace/triton_models/weights`

```shell
# arguments:
#   ./atom                             : the atom results computed in the previous step
#   ./workspace/triton_models/weights  : output directory
#   --num-tp                           : number of tensor-parallel GPUs
lmdeploy lite kv_qparams \
  ./atom \
  ./workspace/triton_models/weights \
  --num-tp 1
```

Adjust the inference configuration to enable kv cache int8. Edit `workspace/triton_models/weights/config.ini`
* set `use_context_fmha` to 0, disabling flashattention
* set `quant_policy` to 4, enabling kv cache quantization

Finally, run a chat test
```shell
lmdeploy chat turbomind ./workspace
```

[Click here](https://github.com/InternLM/lmdeploy/blob/main/docs/zh_cn/kv_int8.md) for the kv cache int8 quantization formulas and the accuracy and GPU-memory test reports.

## 4. weight int4 quantization

lmdeploy implements weight int4 quantization based on the [AWQ algorithm](https://arxiv.org/abs/2306.00978); throughput is more than 2.4x that of FP16, and GPU memory drops from 16G to 6.3G.

Your own model can be optimized with the `auto_awq` tool
```shell
# path to export the quantized model to
WORK_DIR="./atom-7b-chat-w4"

# arguments:
#   $HF_MODEL        : huggingface model location
#   --calib-dataset  : calibration dataset; c4, ptb, wikitext2 and pileval are supported
#   --calib-samples  : number of calibration samples; reduce it if GPU memory is tight
#   --calib-seqlen   : length of a single text; reduce it if GPU memory is tight
#   --w-bits         : number of weight-quantization bits
#   --w-group-size   : weight-quantization group size for the statistics
lmdeploy lite auto_awq \
  $HF_MODEL \
  --calib-dataset 'ptb' \
  --calib-samples 128 \
  --calib-seqlen 2048 \
  --w-bits 4 \
  --w-group-size 128 \
  --work-dir $WORK_DIR
```

Start the service with:
```shell
# this path is the output of the model-layout conversion step above
FasterTransformer_PATH="/path/workspace"

TP=1
# GPUs to use
DEVICES="0"
for ((i = 1; i < ${TP}; ++i)); do
    DEVICES="${DEVICES},$i"
done
DEVICES="\"device=${DEVICES}\""

# start the service inside a container
docker run -idt \
  --gpus $DEVICES \
  -v $FasterTransformer_PATH:/workspace/models \
  --shm-size 16g \
  -p 33336:22 \
  -p 33337-33400:33337-33400 \
  --cap-add=SYS_PTRACE \
  --cap-add=SYS_ADMIN \
  --security-opt seccomp=unconfined \
  --name lmdeploy \
  --env NCCL_LAUNCH_MODE=GROUP openmmlab/lmdeploy:latest \
  tritonserver \
  --model-repository=/workspace/models/model_repository \
  --allow-http=0 \
  --allow-grpc=1 \
  --grpc-port=33337 \
  --log-verbose=0 \
  --allow-metrics=1
```

Client test:
```shell
python test_api_server.py --tritonserver_addr 127.0.0.1:33337
```

[Click here](https://github.com/InternLM/lmdeploy/blob/main/docs/zh_cn/w4a16.md) for the GPU-memory and speed measurements of weight int4 quantization.

One extra note: weight int4 and kv cache int8 do not conflict and can be enabled together to save even more GPU memory.
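After both `lmdeploy lite` steps, the two config.ini edits from section 3 amount to plain key/value lines; the surrounding file is generated by lmdeploy, so only the two touched keys are sketched here:

```ini
; workspace/triton_models/weights/config.ini
use_context_fmha = 0
quant_policy = 4
```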
Llama-Chinese/inference-speed/GPU/lmdeploy_example/test_api_server.py
ADDED
@@ -0,0 +1,73 @@
import os
import time
from lmdeploy.serve.turbomind.chatbot import Chatbot

def input_prompt(chat_history, system_prompt: str):
    """Build the Atom-style prompt string from the chat history."""
    prompt = ''
    for input_text_one in chat_history:
        prompt += "<s>"+input_text_one['role']+": "+input_text_one['content'].strip()+"\n</s>"
    # Append the tag of the role expected to speak next.
    if chat_history[-1]['role'] == 'Human':
        prompt += "<s>Assistant: "
    else:
        prompt += "<s>Human: "
    # Keep only the last 2048 characters.
    prompt = prompt[-2048:]
    if len(system_prompt) > 0:
        prompt = '<s>System: '+system_prompt.strip()+'\n</s>'+prompt

    return prompt

def main(tritonserver_addr: str,
         session_id: int = 1,
         cap: str = 'chat',
         stream_output: bool = True,
         **kwargs):
    """An example to communicate with the inference server through the command
    line interface.

    Args:
        tritonserver_addr (str): the address in format "ip:port" of
            triton inference server
        session_id (int): the identical id of a session
        cap (str): the capability of a model. For example, codellama has
            the ability among ['completion', 'infill', 'instruct', 'python']
        stream_output (bool): indicator for streaming output or not
        **kwargs (dict): other arguments for initializing model's chat template
    """
    log_level = os.environ.get('SERVICE_LOG_LEVEL', 'WARNING')
    kwargs.update(capability=cap)
    chatbot = Chatbot(tritonserver_addr,
                      log_level=log_level,
                      display=stream_output,
                      **kwargs)
    nth_round = 1
    prompt = input_prompt([{"role": "Human", "content": "心情不好怎么办"}], "")

    request_id = f'{session_id}-{nth_round}'
    begin = time.time()
    if stream_output:
        for status, res, n_token in chatbot.stream_infer(
                session_id,
                prompt,
                request_id=request_id,
                request_output_len=512):
            # print("n_token:", n_token)
            continue

    else:
        status, res, n_token = chatbot.infer(session_id,
                                             prompt,
                                             request_id=request_id,
                                             request_output_len=512)
        print(res)
        # print("n_token:", n_token)
    nth_round += 1
    end = time.time()
    speed = n_token/(end-begin)
    print("speed {} tokens/s".format(speed))


if __name__ == '__main__':
    import fire

    fire.Fire(main)
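`input_prompt` also handles multi-turn history; a quick illustration (the dialogue content is made up):

```python
history = [
    {"role": "Human", "content": "心情不好怎么办"},
    {"role": "Assistant", "content": "可以试着出门走走,和朋友聊聊天"},
    {"role": "Human", "content": "还有别的建议吗"},
]
# Produces the concatenated <s>Role: ...\n</s> turns, truncated to the last
# 2048 characters and ending with "<s>Assistant: ", ready for the model.
print(input_prompt(history, ""))
```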
Llama-Chinese/inference-speed/GPU/vllm_example/README.md
ADDED
@@ -0,0 +1,57 @@
# vLLM Inference Deployment

[vllm](https://github.com/vllm-project/vllm) is another option for GPU inference. Compared with FasterTransformer, vLLM is much simpler to use: it requires no extra model conversion and supports fp16 inference.

Features:

+ Fast inference speed
+ Efficient KV cache management
+ Continuous batching of incoming requests
+ Optimized CUDA kernels
+ Distributed inference support

## Step 1: Install vLLM

```bash
pip install vllm
```

## Step 2: Start the test server

Download an Atom or Llama3 model from Hugging Face:
```
# Download whichever model you want to deploy
git clone https://huggingface.co/FlagAlpha/Atom-7B-Chat Atom-7B-Chat

# Or download Meta's official Llama3 model:
git clone https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct Meta-Llama-3-8B-Instruct
```

1. Single-GPU inference

Edit single_gpu_api_server.sh and set model to the download path of the model above.

Start the test server:
```bash
# CUDA_VISIBLE_DEVICES in single_gpu_api_server.sh selects the GPU to use
bash single_gpu_api_server.sh
```

2. Multi-GPU inference

Multi-GPU inference is recommended for 13B and 70B models. Edit multi_gpus_api_server.sh and set model to the download path of the larger model above.

Start the test server:
```bash
# CUDA_VISIBLE_DEVICES in multi_gpus_api_server.sh selects the GPUs to use
# tensor-parallel-size sets the number of GPUs
bash multi_gpus_api_server.sh
```

## Step 3: Run the client test

Note the model_source argument below: it can be llama_chinese, llama2_meta, or llama3_meta, depending on which model you downloaded. If the model was downloaded from [FlagAlpha](https://huggingface.co/FlagAlpha), use llama_chinese.

```
python client_test.py --model_source llama_chinese
```
Llama-Chinese/inference-speed/GPU/vllm_example/api_server.py
ADDED
@@ -0,0 +1,85 @@
import argparse
import json
from typing import AsyncGenerator

from fastapi import BackgroundTasks, FastAPI, Request
from fastapi.responses import JSONResponse, Response, StreamingResponse
import uvicorn

from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.sampling_params import SamplingParams
from vllm.utils import random_uuid

TIMEOUT_KEEP_ALIVE = 5  # seconds.
TIMEOUT_TO_PREVENT_DEADLOCK = 1  # seconds.
app = FastAPI()


@app.post("/generate")
async def generate(request: Request) -> Response:
    """Generate completion for the request.

    The request should be a JSON object with the following fields:
    - prompt: the prompt to use for the generation.
    - stream: whether to stream the results or not.
    - other fields: the sampling parameters (See `SamplingParams` for details).
    """
    request_dict = await request.json()
    prompt = request_dict.pop("prompt")
    stream = request_dict.pop("stream", False)
    sampling_params = SamplingParams(**request_dict)
    request_id = random_uuid()
    results_generator = engine.generate(prompt, sampling_params, request_id)

    # Streaming case
    async def stream_results() -> AsyncGenerator[bytes, None]:
        async for request_output in results_generator:
            prompt = request_output.prompt
            text_outputs = [
                prompt + output.text for output in request_output.outputs
            ]
            ret = {"text": text_outputs}
            yield (json.dumps(ret) + "\0").encode("utf-8")

    async def abort_request() -> None:
        await engine.abort(request_id)

    if stream:
        background_tasks = BackgroundTasks()
        # Abort the request if the client disconnects.
        background_tasks.add_task(abort_request)
        return StreamingResponse(stream_results(), background=background_tasks)

    # Non-streaming case
    final_output = None
    async for request_output in results_generator:
        if await request.is_disconnected():
            # Abort the request if the client disconnects.
            await engine.abort(request_id)
            return Response(status_code=499)
        final_output = request_output

    assert final_output is not None
    prompt = final_output.prompt
    text_outputs = [prompt + output.text for output in final_output.outputs]
    ret = {"text": text_outputs}
    return JSONResponse(ret)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--host", type=str, default="0.0.0.0")
    parser.add_argument("--port", type=int, default=8090)
    parser.add_argument("--trust_remote_code", type=bool, default=True)
    parser = AsyncEngineArgs.add_cli_args(parser)
    args = parser.parse_args()

    engine_args = AsyncEngineArgs.from_cli_args(args)
    engine = AsyncLLMEngine.from_engine_args(engine_args)

    uvicorn.run(app,
                host=args.host,
                port=args.port,
                log_level="debug",
                timeout_keep_alive=TIMEOUT_KEEP_ALIVE)
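
Note: the streaming branch above delimits each JSON chunk with a trailing "\0" byte, so a consumer must split on that byte. A minimal consumption sketch (assuming the `requests` package and a server on 127.0.0.1:8090):

```python
import json
import requests

resp = requests.post(
    "http://127.0.0.1:8090/generate",
    json={"prompt": "<s>Human: 你好\n</s><s>Assistant: ", "stream": True},
    stream=True,
)
# Each chunk is a JSON object terminated by "\0"; split on that delimiter.
for chunk in resp.iter_lines(delimiter=b"\0"):
    if chunk:  # skip the empty trailing split
        print(json.loads(chunk.decode("utf-8"))["text"][0])
```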
Llama-Chinese/inference-speed/GPU/vllm_example/client_test.py
ADDED
@@ -0,0 +1,137 @@
# coding=utf-8
import json
import time
import argparse

import urllib.request

import sys

parser = argparse.ArgumentParser()
parser.add_argument('--model_source', default="llama_chinese", choices=["llama_chinese", "llama2_meta", "llama3_meta"], required=False, type=str)
args = parser.parse_args()

def get_prompt_llama_chinese(
        chat_history, system_prompt=""
) -> str:
    prompt = ''
    for input_text_one in chat_history:
        prompt += "<s>"+input_text_one['role']+": "+input_text_one['content'].strip()+"\n</s>"
    if chat_history[-1]['role'] == 'Human':
        prompt += "<s>Assistant: "
    else:
        prompt += "<s>Human: "
    prompt = prompt[-2048:]
    if len(system_prompt) > 0:
        prompt = '<s>System: '+system_prompt.strip()+'\n</s>'+prompt

    return prompt

def get_prompt_llama2_meta(chat_history, system_prompt=""):
    B_INST, E_INST = "[INST]", "[/INST]"
    B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"

    sep = " "
    sep2 = " </s><s>"
    stop_token_ids = [2]
    system_template = f"[INST] <<SYS>>\n{system_prompt}\n<</SYS>>\n\n"
    roles = ("[INST]", "[/INST]")
    seps = [sep, sep2]
    if system_prompt.strip() != "":
        ret = system_template
    else:
        ret = "[INST] "
    for i, chat in enumerate(chat_history):
        message = chat["content"]
        role = chat["role"]
        if message:
            if i == 0:
                ret += message + " "
            else:
                if role == "Human":
                    ret += "[INST]" + " " + message + seps[i % 2]
                else:
                    ret += "[/INST]" + " " + message + seps[i % 2]
        else:
            if role == "Human":
                ret += "[INST]"
            else:
                ret += "[/INST]"
    print("prompt:{}".format(ret))
    return ret

def get_prompt_llama3_meta(chat_history, system_prompt=""):
    system_format = '<|start_header_id|>system<|end_header_id|>\n\n{content}<|eot_id|>'
    user_format = '<|start_header_id|>user<|end_header_id|>\n\n{content}<|eot_id|>'
    assistant_format = '<|start_header_id|>assistant<|end_header_id|>\n\n{content}<|eot_id|>\n'
    prompt_str = ''
    # Concatenate the dialogue history
    for item in chat_history:
        if item['role'] == 'Human':
            prompt_str += user_format.format(content=item['content'])
        else:
            prompt_str += assistant_format.format(content=item['content'])
    if len(system_prompt) > 0:
        prompt_str = system_format.format(content=system_prompt) + prompt_str
    prompt_str = "<|begin_of_text|>" + prompt_str
    return prompt_str


def test_api_server(chat_history=[], system_prompt=""):
    header = {'Content-Type': 'application/json'}

    if args.model_source == "llama2_meta":
        prompt = get_prompt_llama2_meta(chat_history, system_prompt)
    elif args.model_source == "llama3_meta":
        prompt = get_prompt_llama3_meta(chat_history, system_prompt)
    else:
        prompt = get_prompt_llama_chinese(chat_history, system_prompt)

    data = {
        "prompt": prompt,
        "stream": False,
        "n": 1,
        "best_of": 1,
        "presence_penalty": 0.0,
        "frequency_penalty": 0.2,
        "temperature": 0.3,
        "top_p": 0.95,
        "top_k": 50,
        "use_beam_search": False,
        "stop": [],
        "ignore_eos": False,
        "max_tokens": 2048,
        "logprobs": None
    }
    request = urllib.request.Request(
        url='http://127.0.0.1:8090/generate',
        headers=header,
        data=json.dumps(data).encode('utf-8')
    )

    result = None
    try:
        response = urllib.request.urlopen(request, timeout=300)
        res = response.read().decode('utf-8')
        result = json.loads(res)
        print(json.dumps(data, ensure_ascii=False, indent=2))
        print(json.dumps(result, ensure_ascii=False, indent=2))

    except Exception as e:
        print(e)

    return result

if __name__ == "__main__":
    # Multi-turn dialogue test
    """ Multi-turn dialogue test
    last_question = "怎么回来呢"
    inputs = [{"role": "Human", "content": "如何去北京"},
              {"role": "Assistant", "content": "乘坐飞机或者轮船"},
              {"role": "Human", "content": last_question}]
    """
    # Single-turn dialogue
    last_question = "怎么去北京"
    chat_history = [{"role": "Human", "content": last_question}]
    test_api_server(chat_history)
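
For reference, here is a self-contained sketch that mirrors get_prompt_llama_chinese above and prints the exact string sent to the server (the history is a made-up example):

```python
def build_llama_chinese_prompt(chat_history, system_prompt=""):
    # Mirrors get_prompt_llama_chinese: one <s>Role: ...\n</s> block per turn,
    # then an opening tag for the role expected to answer next.
    prompt = ""
    for turn in chat_history:
        prompt += "<s>" + turn["role"] + ": " + turn["content"].strip() + "\n</s>"
    prompt += "<s>Assistant: " if chat_history[-1]["role"] == "Human" else "<s>Human: "
    prompt = prompt[-2048:]  # keep only the most recent context
    if system_prompt:
        prompt = "<s>System: " + system_prompt.strip() + "\n</s>" + prompt
    return prompt

history = [{"role": "Human", "content": "如何去北京"},
           {"role": "Assistant", "content": "乘坐飞机或者轮船"},
           {"role": "Human", "content": "怎么回来呢"}]
print(build_llama_chinese_prompt(history))
```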
Llama-Chinese/inference-speed/GPU/vllm_example/multi_gpus_api_server.sh
ADDED
@@ -0,0 +1,4 @@
CUDA_VISIBLE_DEVICES=0,1 python api_server.py \
    --model "./Atom-7B-Chat" \
    --port 8090 \
    --tensor-parallel-size 2
Llama-Chinese/inference-speed/GPU/vllm_example/single_gpu_api_server.sh
ADDED
@@ -0,0 +1,4 @@

CUDA_VISIBLE_DEVICES=0 python api_server.py \
    --model "./Atom-7B-Chat" \
    --port 8090
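
A common pitfall with these scripts is a mismatch between CUDA_VISIBLE_DEVICES and --tensor-parallel-size. A quick check (assuming PyTorch is installed), run with the same CUDA_VISIBLE_DEVICES value the scripts use:

```python
import torch

# Should print the number of GPUs listed in CUDA_VISIBLE_DEVICES;
# for multi_gpus_api_server.sh it must equal --tensor-parallel-size.
print(torch.cuda.device_count(), "visible GPU(s)")
```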
Llama-Chinese/model/Atom-7B-Chat-pre/.gitattributes
ADDED
@@ -0,0 +1,2 @@
model-00001-of-00002.safetensors filter=lfs diff=lfs merge=lfs -text
model-00002-of-00002.safetensors filter=lfs diff=lfs merge=lfs -text
Llama-Chinese/model/Atom-7B-Chat-pre/config.json
ADDED
@@ -0,0 +1,51 @@
{
  "_name_or_path": "FlagAlpha/Atom-7B-Chat",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "FlagAlpha/Atom-7B-Chat--configuration_atom.LlamaConfig",
    "AutoModel": "FlagAlpha/Atom-7B-Chat--model_atom.LlamaForCausalLM",
    "AutoModelForCausalLM": "FlagAlpha/Atom-7B-Chat--model_atom.LlamaForCausalLM",
    "AutoModelForSequenceClassification": "FlagAlpha/Atom-7B-Chat--model_atom.LlamaForSequenceClassification"
  },
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 11008,
  "max_length": 4096,
  "max_position_embeddings": 4096,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "pad_token_id": 2,
  "pretraining_tp": 1,
  "quantization_config": {
    "_load_in_4bit": false,
    "_load_in_8bit": true,
    "bnb_4bit_compute_dtype": "float32",
    "bnb_4bit_quant_storage": "uint8",
    "bnb_4bit_quant_type": "fp4",
    "bnb_4bit_use_double_quant": false,
    "llm_int8_enable_fp32_cpu_offload": false,
    "llm_int8_has_fp16_weight": false,
    "llm_int8_skip_modules": null,
    "llm_int8_threshold": 6.0,
    "load_in_4bit": false,
    "load_in_8bit": true,
    "quant_method": "bitsandbytes"
  },
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 500000,
  "tie_word_embeddings": false,
  "torch_dtype": "float16",
  "transformers_version": "4.39.0",
  "use_cache": true,
  "vocab_size": 65000
}
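
Given the quantization_config above (bitsandbytes with load_in_8bit), a minimal loading sketch might look as follows; trust_remote_code is needed because auto_map points at custom Atom model code, and bitsandbytes plus a CUDA device are assumed to be available:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

model_path = "./Atom-7B-Chat-pre"  # local path to this folder
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
# The quantization_config in config.json is picked up automatically, so the
# weights load directly as int8 (note the .SCB scale tensors in the index).
model = AutoModelForCausalLM.from_pretrained(model_path,
                                             device_map="auto",
                                             trust_remote_code=True)
```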
Llama-Chinese/model/Atom-7B-Chat-pre/generation_config.json
ADDED
@@ -0,0 +1,8 @@
{
  "_from_model_config": true,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "max_length": 4096,
  "pad_token_id": 2,
  "transformers_version": "4.39.0"
}
Llama-Chinese/model/Atom-7B-Chat-pre/model-00001-of-00002.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:37de609a2beb1671bd5f8b921144769b332e8e1c5d6630b6c10dab985615458a
size 4988872000
Llama-Chinese/model/Atom-7B-Chat-pre/model-00002-of-00002.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:7843f6c0984f493e2d18e58c15791ba55e8c70678f059e83358439b47fba9e6f
size 2558121080
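
The two files above are Git LFS pointers: oid is the sha256 of the real shard and size its byte count. A small sketch to verify a downloaded shard against its pointer:

```python
import hashlib
from pathlib import Path

path = Path("model-00002-of-00002.safetensors")
expected_oid = "7843f6c0984f493e2d18e58c15791ba55e8c70678f059e83358439b47fba9e6f"
expected_size = 2558121080

h = hashlib.sha256()
with path.open("rb") as f:
    for block in iter(lambda: f.read(1 << 20), b""):
        h.update(block)
print("sha256 ok:", h.hexdigest() == expected_oid)
print("size ok:", path.stat().st_size == expected_size)
```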
Llama-Chinese/model/Atom-7B-Chat-pre/model.safetensors.index.json
ADDED
@@ -0,0 +1,522 @@
{
  "metadata": {
    "total_size": 7546937344
  },
  "weight_map": {
    "lm_head.weight": "model-00002-of-00002.safetensors",
    "model.embed_tokens.weight": "model-00001-of-00002.safetensors",
    "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.0.mlp.down_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.0.mlp.gate_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.0.mlp.up_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.0.self_attn.k_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.0.self_attn.o_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.0.self_attn.q_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.0.self_attn.v_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.1.mlp.down_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.1.mlp.gate_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.1.mlp.up_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.1.self_attn.k_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.1.self_attn.o_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.1.self_attn.q_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.1.self_attn.v_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.10.mlp.down_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.10.mlp.gate_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.10.mlp.up_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.10.self_attn.k_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.10.self_attn.o_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.10.self_attn.q_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.10.self_attn.v_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.11.mlp.down_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.11.mlp.gate_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.11.mlp.up_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.11.self_attn.k_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.11.self_attn.o_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.11.self_attn.q_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.11.self_attn.v_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.12.mlp.down_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.12.mlp.gate_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.12.mlp.up_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.12.self_attn.k_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.12.self_attn.o_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.12.self_attn.q_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.12.self_attn.v_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.13.mlp.down_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.13.mlp.gate_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.13.mlp.up_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.13.self_attn.k_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.13.self_attn.o_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.13.self_attn.q_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.13.self_attn.v_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.14.mlp.down_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.14.mlp.gate_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.14.mlp.up_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.14.self_attn.k_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.14.self_attn.o_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.14.self_attn.q_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.14.self_attn.v_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.15.mlp.down_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.15.mlp.gate_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.15.mlp.up_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.15.self_attn.k_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.15.self_attn.o_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.15.self_attn.q_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.15.self_attn.v_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.16.mlp.down_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.16.mlp.gate_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.16.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.16.mlp.up_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.16.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.16.self_attn.k_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.16.self_attn.o_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.16.self_attn.q_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.16.self_attn.v_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.17.mlp.down_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.17.mlp.gate_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.17.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.17.mlp.up_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.17.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.17.self_attn.k_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.17.self_attn.o_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.17.self_attn.q_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.17.self_attn.v_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.18.mlp.down_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.18.mlp.gate_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.18.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.18.mlp.up_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.18.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.18.self_attn.k_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.18.self_attn.o_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.18.self_attn.q_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.18.self_attn.v_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.19.mlp.down_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.19.mlp.gate_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.19.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.19.mlp.up_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.19.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.19.self_attn.k_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.19.self_attn.o_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.19.self_attn.q_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.19.self_attn.v_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.2.mlp.down_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.2.mlp.gate_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.2.mlp.up_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.2.self_attn.k_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.2.self_attn.o_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.2.self_attn.q_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.2.self_attn.v_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.20.input_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.20.mlp.down_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.20.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.20.mlp.gate_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.20.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.20.mlp.up_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.20.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.20.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.20.self_attn.k_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.20.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.20.self_attn.o_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.20.self_attn.q_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.20.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.20.self_attn.v_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.20.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.21.input_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.21.mlp.down_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.21.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.21.mlp.gate_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.21.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.21.mlp.up_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.21.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.21.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.21.self_attn.k_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.21.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.21.self_attn.o_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.21.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.21.self_attn.q_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.21.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.21.self_attn.v_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.21.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors",
    "model.layers.22.mlp.down_proj.SCB": "model-00002-of-00002.safetensors",
    "model.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.22.mlp.gate_proj.SCB": "model-00002-of-00002.safetensors",
    "model.layers.22.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.22.mlp.up_proj.SCB": "model-00002-of-00002.safetensors",
    "model.layers.22.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
    "model.layers.22.self_attn.k_proj.SCB": "model-00002-of-00002.safetensors",
    "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.22.self_attn.o_proj.SCB": "model-00002-of-00002.safetensors",
    "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.22.self_attn.q_proj.SCB": "model-00002-of-00002.safetensors",
    "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.22.self_attn.v_proj.SCB": "model-00002-of-00002.safetensors",
    "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors",
    "model.layers.23.mlp.down_proj.SCB": "model-00002-of-00002.safetensors",
    "model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.23.mlp.gate_proj.SCB": "model-00002-of-00002.safetensors",
    "model.layers.23.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.23.mlp.up_proj.SCB": "model-00002-of-00002.safetensors",
    "model.layers.23.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
    "model.layers.23.self_attn.k_proj.SCB": "model-00002-of-00002.safetensors",
    "model.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.23.self_attn.o_proj.SCB": "model-00002-of-00002.safetensors",
    "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.23.self_attn.q_proj.SCB": "model-00002-of-00002.safetensors",
    "model.layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.23.self_attn.v_proj.SCB": "model-00002-of-00002.safetensors",
    "model.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors",
    "model.layers.24.mlp.down_proj.SCB": "model-00002-of-00002.safetensors",
    "model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.24.mlp.gate_proj.SCB": "model-00002-of-00002.safetensors",
    "model.layers.24.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.24.mlp.up_proj.SCB": "model-00002-of-00002.safetensors",
    "model.layers.24.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
    "model.layers.24.self_attn.k_proj.SCB": "model-00002-of-00002.safetensors",
    "model.layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.24.self_attn.o_proj.SCB": "model-00002-of-00002.safetensors",
    "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.24.self_attn.q_proj.SCB": "model-00002-of-00002.safetensors",
    "model.layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.24.self_attn.v_proj.SCB": "model-00002-of-00002.safetensors",
    "model.layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors",
    "model.layers.25.mlp.down_proj.SCB": "model-00002-of-00002.safetensors",
    "model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.25.mlp.gate_proj.SCB": "model-00002-of-00002.safetensors",
    "model.layers.25.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.25.mlp.up_proj.SCB": "model-00002-of-00002.safetensors",
    "model.layers.25.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
    "model.layers.25.self_attn.k_proj.SCB": "model-00002-of-00002.safetensors",
    "model.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.25.self_attn.o_proj.SCB": "model-00002-of-00002.safetensors",
    "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.25.self_attn.q_proj.SCB": "model-00002-of-00002.safetensors",
    "model.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.25.self_attn.v_proj.SCB": "model-00002-of-00002.safetensors",
    "model.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors",
    "model.layers.26.mlp.down_proj.SCB": "model-00002-of-00002.safetensors",
    "model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.26.mlp.gate_proj.SCB": "model-00002-of-00002.safetensors",
    "model.layers.26.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.26.mlp.up_proj.SCB": "model-00002-of-00002.safetensors",
    "model.layers.26.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
    "model.layers.26.self_attn.k_proj.SCB": "model-00002-of-00002.safetensors",
    "model.layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.26.self_attn.o_proj.SCB": "model-00002-of-00002.safetensors",
    "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.26.self_attn.q_proj.SCB": "model-00002-of-00002.safetensors",
    "model.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.26.self_attn.v_proj.SCB": "model-00002-of-00002.safetensors",
    "model.layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors",
    "model.layers.27.mlp.down_proj.SCB": "model-00002-of-00002.safetensors",
    "model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.27.mlp.gate_proj.SCB": "model-00002-of-00002.safetensors",
    "model.layers.27.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.27.mlp.up_proj.SCB": "model-00002-of-00002.safetensors",
    "model.layers.27.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
    "model.layers.27.self_attn.k_proj.SCB": "model-00002-of-00002.safetensors",
    "model.layers.27.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.27.self_attn.o_proj.SCB": "model-00002-of-00002.safetensors",
    "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.27.self_attn.q_proj.SCB": "model-00002-of-00002.safetensors",
    "model.layers.27.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.27.self_attn.v_proj.SCB": "model-00002-of-00002.safetensors",
    "model.layers.27.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.28.input_layernorm.weight": "model-00002-of-00002.safetensors",
    "model.layers.28.mlp.down_proj.SCB": "model-00002-of-00002.safetensors",
    "model.layers.28.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.28.mlp.gate_proj.SCB": "model-00002-of-00002.safetensors",
    "model.layers.28.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.28.mlp.up_proj.SCB": "model-00002-of-00002.safetensors",
    "model.layers.28.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.28.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
    "model.layers.28.self_attn.k_proj.SCB": "model-00002-of-00002.safetensors",
    "model.layers.28.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.28.self_attn.o_proj.SCB": "model-00002-of-00002.safetensors",
    "model.layers.28.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.28.self_attn.q_proj.SCB": "model-00002-of-00002.safetensors",
    "model.layers.28.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.28.self_attn.v_proj.SCB": "model-00002-of-00002.safetensors",
    "model.layers.28.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.29.input_layernorm.weight": "model-00002-of-00002.safetensors",
    "model.layers.29.mlp.down_proj.SCB": "model-00002-of-00002.safetensors",
    "model.layers.29.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.29.mlp.gate_proj.SCB": "model-00002-of-00002.safetensors",
    "model.layers.29.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.29.mlp.up_proj.SCB": "model-00002-of-00002.safetensors",
    "model.layers.29.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.29.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
    "model.layers.29.self_attn.k_proj.SCB": "model-00002-of-00002.safetensors",
    "model.layers.29.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.29.self_attn.o_proj.SCB": "model-00002-of-00002.safetensors",
    "model.layers.29.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.29.self_attn.q_proj.SCB": "model-00002-of-00002.safetensors",
    "model.layers.29.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.29.self_attn.v_proj.SCB": "model-00002-of-00002.safetensors",
    "model.layers.29.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.3.mlp.down_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.3.mlp.gate_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.3.mlp.up_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.3.self_attn.k_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.3.self_attn.o_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.3.self_attn.q_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.3.self_attn.v_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.30.input_layernorm.weight": "model-00002-of-00002.safetensors",
    "model.layers.30.mlp.down_proj.SCB": "model-00002-of-00002.safetensors",
    "model.layers.30.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.30.mlp.gate_proj.SCB": "model-00002-of-00002.safetensors",
    "model.layers.30.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.30.mlp.up_proj.SCB": "model-00002-of-00002.safetensors",
    "model.layers.30.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.30.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
    "model.layers.30.self_attn.k_proj.SCB": "model-00002-of-00002.safetensors",
    "model.layers.30.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.30.self_attn.o_proj.SCB": "model-00002-of-00002.safetensors",
    "model.layers.30.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.30.self_attn.q_proj.SCB": "model-00002-of-00002.safetensors",
    "model.layers.30.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.30.self_attn.v_proj.SCB": "model-00002-of-00002.safetensors",
    "model.layers.30.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.31.input_layernorm.weight": "model-00002-of-00002.safetensors",
    "model.layers.31.mlp.down_proj.SCB": "model-00002-of-00002.safetensors",
    "model.layers.31.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.31.mlp.gate_proj.SCB": "model-00002-of-00002.safetensors",
    "model.layers.31.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.31.mlp.up_proj.SCB": "model-00002-of-00002.safetensors",
    "model.layers.31.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.31.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
    "model.layers.31.self_attn.k_proj.SCB": "model-00002-of-00002.safetensors",
    "model.layers.31.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.31.self_attn.o_proj.SCB": "model-00002-of-00002.safetensors",
    "model.layers.31.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.31.self_attn.q_proj.SCB": "model-00002-of-00002.safetensors",
    "model.layers.31.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.31.self_attn.v_proj.SCB": "model-00002-of-00002.safetensors",
    "model.layers.31.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.4.mlp.down_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.4.mlp.gate_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.4.mlp.up_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.4.self_attn.k_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.4.self_attn.o_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.4.self_attn.q_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.4.self_attn.v_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.5.mlp.down_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.5.mlp.gate_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.5.mlp.up_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.5.self_attn.k_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.5.self_attn.o_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.5.self_attn.q_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.5.self_attn.v_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.6.mlp.down_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.6.mlp.gate_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.6.mlp.up_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.6.self_attn.k_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.6.self_attn.o_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.6.self_attn.q_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.6.self_attn.v_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.7.mlp.down_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.7.mlp.gate_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.7.mlp.up_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.7.self_attn.k_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.7.self_attn.o_proj.SCB": "model-00001-of-00002.safetensors",
    "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 484 |
+
"model.layers.7.self_attn.q_proj.SCB": "model-00001-of-00002.safetensors",
|
| 485 |
+
"model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 486 |
+
"model.layers.7.self_attn.v_proj.SCB": "model-00001-of-00002.safetensors",
|
| 487 |
+
"model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 488 |
+
"model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 489 |
+
"model.layers.8.mlp.down_proj.SCB": "model-00001-of-00002.safetensors",
|
| 490 |
+
"model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 491 |
+
"model.layers.8.mlp.gate_proj.SCB": "model-00001-of-00002.safetensors",
|
| 492 |
+
"model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 493 |
+
"model.layers.8.mlp.up_proj.SCB": "model-00001-of-00002.safetensors",
|
| 494 |
+
"model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 495 |
+
"model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 496 |
+
"model.layers.8.self_attn.k_proj.SCB": "model-00001-of-00002.safetensors",
|
| 497 |
+
"model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 498 |
+
"model.layers.8.self_attn.o_proj.SCB": "model-00001-of-00002.safetensors",
|
| 499 |
+
"model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 500 |
+
"model.layers.8.self_attn.q_proj.SCB": "model-00001-of-00002.safetensors",
|
| 501 |
+
"model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 502 |
+
"model.layers.8.self_attn.v_proj.SCB": "model-00001-of-00002.safetensors",
|
| 503 |
+
"model.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 504 |
+
"model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 505 |
+
"model.layers.9.mlp.down_proj.SCB": "model-00001-of-00002.safetensors",
|
| 506 |
+
"model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 507 |
+
"model.layers.9.mlp.gate_proj.SCB": "model-00001-of-00002.safetensors",
|
| 508 |
+
"model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 509 |
+
"model.layers.9.mlp.up_proj.SCB": "model-00001-of-00002.safetensors",
|
| 510 |
+
"model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 511 |
+
"model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 512 |
+
"model.layers.9.self_attn.k_proj.SCB": "model-00001-of-00002.safetensors",
|
| 513 |
+
"model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 514 |
+
"model.layers.9.self_attn.o_proj.SCB": "model-00001-of-00002.safetensors",
|
| 515 |
+
"model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 516 |
+
"model.layers.9.self_attn.q_proj.SCB": "model-00001-of-00002.safetensors",
|
| 517 |
+
"model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 518 |
+
"model.layers.9.self_attn.v_proj.SCB": "model-00001-of-00002.safetensors",
|
| 519 |
+
"model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 520 |
+
"model.norm.weight": "model-00002-of-00002.safetensors"
|
| 521 |
+
}
|
| 522 |
+
}
|
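
Note on the index that ends above: a sharded safetensors checkpoint carries a model.safetensors.index.json whose "weight_map" maps every parameter name to the shard file holding it, so a loader only has to open the shard it needs. The ".SCB" entries are the per-column scale buffers that bitsandbytes serializes next to the int8 weights when a model loaded with load_in_8bit=True is saved. A minimal sketch of resolving one tensor through the map (a sketch, not part of this commit; paths assume the model/Atom-7B-Chat-pre/ layout added here):

import json
from safetensors import safe_open

index_dir = "model/Atom-7B-Chat-pre"
with open(f"{index_dir}/model.safetensors.index.json") as f:
    weight_map = json.load(f)["weight_map"]  # parameter name -> shard file

name = "model.norm.weight"
shard_path = f"{index_dir}/{weight_map[name]}"  # here: model-00002-of-00002.safetensors
with safe_open(shard_path, framework="pt") as shard:
    tensor = shard.get_tensor(name)  # reads just this tensor, not the whole shard
print(name, tuple(tensor.shape), tensor.dtype)
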
Llama-Chinese/output/train.log
ADDED
@@ -0,0 +1 @@
+[2024-08-14 09:27:03,273] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)
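
The lone line in train.log is DeepSpeed's accelerator autodetection, logged from real_accelerator.py the first time get_accelerator() runs. On a CUDA machine with DeepSpeed installed, the same message should be reproducible with a two-liner (a sketch under that assumption):

from deepspeed.accelerator import get_accelerator
print(get_accelerator().device_name())  # 'cuda' on a GPU box; first call emits the log line above
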
Llama-Chinese/quick_start.py
ADDED
@@ -0,0 +1,26 @@
+import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM
+device_map = "cuda:0" if torch.cuda.is_available() else "auto"
+model = AutoModelForCausalLM.from_pretrained('FlagAlpha/Atom-7B-Chat', device_map=device_map, torch_dtype=torch.float16, load_in_8bit=True, trust_remote_code=True, use_flash_attention_2=True)
+model = model.eval()
+tokenizer = AutoTokenizer.from_pretrained('FlagAlpha/Atom-7B-Chat', use_fast=False)
+tokenizer.pad_token = tokenizer.eos_token
+input_ids = tokenizer(['<s>Human: 介绍一下中国\n</s><s>Assistant: '], return_tensors="pt", add_special_tokens=False).input_ids
+if torch.cuda.is_available():
+    input_ids = input_ids.to('cuda')
+generate_input = {
+    "input_ids": input_ids,
+    "max_new_tokens": 512,
+    "do_sample": True,
+    "top_k": 50,
+    "top_p": 0.95,
+    "temperature": 0.3,
+    "repetition_penalty": 1.3,
+    "eos_token_id": tokenizer.eos_token_id,
+    "bos_token_id": tokenizer.bos_token_id,
+    "pad_token_id": tokenizer.pad_token_id
+}
+generate_ids = model.generate(**generate_input)
+text = tokenizer.decode(generate_ids[0])
+print(text)
+model.save_pretrained('model/Atom-7B-Chat-pre')
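
For context on quick_start.py above: the prompt follows the Atom chat template ('<s>Human: ...\n</s><s>Assistant: '; the sample question 介绍一下中国 means "introduce China"), and the final save_pretrained call is presumably what produced the sharded model/Atom-7B-Chat-pre checkpoint indexed earlier in this diff, since the model was loaded in 8-bit. Because model.generate returns the prompt tokens followed by the completion, the printed text echoes the whole prompt; a hypothetical helper (not part of this commit) to keep only the assistant's reply could look like:

def extract_reply(decoded: str) -> str:
    # Everything up to the last "Assistant: " marker is echoed prompt/history.
    reply = decoded.rsplit("Assistant: ", 1)[-1]
    return reply.replace("</s>", "").strip()

# Usage with the script above:
# print(extract_reply(text))
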