carsonhxsu committed
Commit f50d964 · 1 Parent(s): 4f41ca5

init code

Browse files:
- .gitattributes +1 -0
- .gitignore +6 -0
- README.md +112 -0
- demo.py +26 -0
- lyra_baichuan/__init__.py +1 -0
- lyra_baichuan/config.py +34 -0
- lyra_baichuan/lyra_baichuan.py +367 -0
- lyra_baichuan/model.py +166 -0
- lyra_baichuan/tokenization_baichuan.py +232 -0
- requirements.txt +6 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.so filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,6 @@
dist/
*.egg-info/
__pycache__
build/
.vscode
.idea
README.md CHANGED
@@ -1,3 +1,115 @@
---
license: mit
language: en
tags:
- LLM
- Baichuan-7B
- Baichuan-13B
- Baichuan2-7B
- Baichuan2-13B
---
## Model Card for lyraBaichuan

lyraBaichuan currently provides the **fastest Baichuan models** (Baichuan-7B, Baichuan-13B, Baichuan2-7B, Baichuan2-13B) available. Its inference speed reaches up to **4300+ tokens/s** on A100, up to **2.4x** faster than the PyTorch version.

Among its main features are:
- device: Nvidia GPUs with Ampere or Volta architecture (A100 or higher, V100).
- batch_size: compiled with dynamic batch size; the maximum depends on the device.
- MEMOPT mode: significantly reduced VRAM usage and increased speed.

We use the Baichuan2-7B-Base and Baichuan2-13B-Base models for measurement, but the optimized inference also applies to other Baichuan models, including Baichuan-7B and Baichuan-13B.

## Speed

* Evaluated in tokens/s
* Tested on A100 40G
* MEMOPT mode

### Baichuan2-7B-Base

| Version | Batch Size 1 | Batch Size 8 | Batch Size 16 | Batch Size 32 | Batch Size 64 |
| --- | --- | --- | --- | --- | --- |
| Torch 2.0.1 | 41.2 | 323.2 | 640.0 | 1256.8 | 2231.0 |
| lyraBaichuan MEMOPT | 125.9 | 948.1 | 1749.3 | 2974.0 | 4370.1 |

### Baichuan2-13B-Base

| Version | Batch Size 1 | Batch Size 8 | Batch Size 16 | Batch Size 32 | Batch Size 64 |
| --- | --- | --- | --- | --- | --- |
| Torch 2.0.1 | 40.9 | 307.9 | 555.6 | 1010.4 | 1601.0 |
| lyraBaichuan MEMOPT | 80.0 | 568.2 | 1124.4 | 1942.6 | 2828.0 |

## Docker Environment Recommendation

- For CUDA 11.X: we recommend ```nvcr.io/nvidia/pytorch:22.12-py3```
- For CUDA 12.0: we recommend ```nvcr.io/nvidia/pytorch:23.02-py3```

```bash
docker pull nvcr.io/nvidia/pytorch:23.02-py3
docker run --rm -it --gpus all -v ./:/lyraBaichuan nvcr.io/nvidia/pytorch:23.02-py3

pip install -r requirements.txt
python demo.py
```

## Uses

```python
from lyra_baichuan import lyraBaichuan7B, lyraBaichuan13B

model_path = "./models/Baichuan2-13B-lyra"
tokenizer_path = "./models/Baichuan2-13B-lyra"
inference_dtype = 'fp16'
prompt = "登鹳雀楼->王之涣\n夜雨寄北->"

memopt_mode = 1
max_output_length = 64
arch = "Ampere"  # Ampere or Volta
cuda_version = 12  # CUDA version; 11 and 12 are currently supported

# To use the 7B model, initialize with lyraBaichuan7B
model = lyraBaichuan13B(model_path,
                        tokenizer_path=tokenizer_path,
                        dtype=inference_dtype,
                        memopt_mode=memopt_mode,
                        arch=arch,
                        cuda_version=cuda_version)

bs = 1
prompts = [prompt, ] * bs
output_texts = model.generate(
    prompts, output_length=max_output_length,
    top_k=30, top_p=0.85, temperature=1.0, repetition_penalty=1.0, do_sample=False)

print(output_texts)
```

## Demo Outputs

### Baichuan2-13B-Base
#### input

登鹳雀楼->王之涣

夜雨寄北->

#### output

## TODO
1. Support for int4
2. Inference for longer contexts
3. Streaming inference mode

## Citation
``` bibtex
@Misc{lyraBaichuan2023,
  author =       {Haoxiong Su and Kangjian Wu and Zhengtao Wang and Yibo Lu and Bin Wu},
  title =        {lyraBaichuan: Accelerating Baichuan models to 4300+ tokens/s},
  howpublished = {\url{https://huggingface.co/TMElyralab/lyraBaichuan}},
  year =         {2023}
}
```

## Report bug
- Start a discussion to report any bugs: https://huggingface.co/TMElyralab/lyraBaichuan
- Report bugs with a `[bug]` mark in the title.
demo.py ADDED
@@ -0,0 +1,26 @@
from lyra_baichuan import lyraBaichuan7B, lyraBaichuan13B

model_path = "./models/Baichuan2-13B-lyra"
tokenizer_path = "./models/Baichuan2-13B-lyra"
inference_dtype = 'fp16'
prompt = "登鹳雀楼->王之涣\n夜雨寄北->"

memopt_mode = 1
max_output_length = 64
arch = "Ampere"  # Ampere or Volta
cuda_version = 12  # CUDA version; 11 and 12 are currently supported

model = lyraBaichuan13B(model_path,
                        tokenizer_path=tokenizer_path,
                        dtype=inference_dtype,
                        memopt_mode=memopt_mode,
                        arch=arch,
                        cuda_version=cuda_version)

bs = 1
prompts = [prompt, ] * bs
output_texts = model.generate(
    prompts, output_length=max_output_length,
    top_k=30, top_p=0.85, temperature=1.0, repetition_penalty=1.0, do_sample=False)

print(output_texts)
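demo.py decodes greedily (`do_sample=False`). If stochastic outputs are wanted, a sampled-decoding variant can reuse the objects defined in the demo; the parameter values below are illustrative, not tuned recommendations:

```python
# Continuation of demo.py: sampled decoding with the same `model` and `prompts`.
# Illustrative settings; with do_sample=True a per-request random seed is drawn internally.
sampled_texts = model.generate(
    prompts, output_length=max_output_length,
    top_k=30, top_p=0.85, temperature=0.9, repetition_penalty=1.1, do_sample=True)
print(sampled_texts)
```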
lyra_baichuan/__init__.py ADDED
@@ -0,0 +1 @@
from .lyra_baichuan import lyraBaichuan7B, lyraBaichuan13B
lyra_baichuan/config.py ADDED
@@ -0,0 +1,34 @@
import dataclasses
from typing import Optional


@dataclasses.dataclass
class LyraBaichuanParam:
    num_heads: int = 40
    size_per_head: int = 128
    inter_size: int = 13824
    num_layers: int = 40
    vocab_size: int = 39424
    start_id: Optional[int] = 1
    end_id: Optional[int] = 2
    tensor_para_size: int = 1
    pipeline_para_size: int = 1
    remove_padding: bool = True
    shared_contexts_ratio: float = 1.0
    layernorm_eps: float = 1e-6
    weights_data_type: str = "fp16"
    rotary_embedding: int = 128
    use_gptj_residual: bool = False

    def __post_init__(self):
        if not 0.0 <= self.shared_contexts_ratio <= 1.0:
            raise ValueError(
                f'Got an invalid value of shared_contexts_ratio '
                f'{self.shared_contexts_ratio} - range: [0.0, 1.0]')

    def asdict(self):
        return dataclasses.asdict(self)


LYRA_BAICHUAN_PARAM = LyraBaichuanParam()
LIB_SO_PATH = '/usr/lib/ftlib/libth_transformer.so'
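The defaults above describe the 13B-style layout and are consumed by lyra_baichuan.py when no config.ini is found next to the checkpoint. A minimal sketch of inspecting or overriding them; the alternative values below are illustrative assumptions, not settings shipped with this repo:

```python
from lyra_baichuan.config import LYRA_BAICHUAN_PARAM, LyraBaichuanParam

# Inspect the packaged defaults (13B-style: 40 heads, 40 layers, vocab 39424).
print(LYRA_BAICHUAN_PARAM.asdict())

# Override per instance; __post_init__ only validates shared_contexts_ratio.
custom = LyraBaichuanParam(num_heads=32, num_layers=32, inter_size=11008)  # illustrative values
print(custom.num_heads, custom.inter_size)
```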
lyra_baichuan/lyra_baichuan.py ADDED
@@ -0,0 +1,367 @@
from __future__ import annotations

import configparser
import pathlib
import typing
import os

import torch
import transformers
from torch.nn.utils.rnn import pad_sequence

from .config import LYRA_BAICHUAN_PARAM, LIB_SO_PATH
from .model import BaichuanModel
from .tokenization_baichuan import BaichuanTokenizer


class lyraBaichuan7B:
    def __init__(self, model_path, tokenizer_path=None, dtype='fp16', memopt_mode=0, arch='Ampere', cuda_version=12) -> None:
        self.model_path = model_path
        self.tokenizer_path = tokenizer_path
        self.dtype = dtype
        self.memopt_mode = memopt_mode
        self.arch = arch
        self.cuda_version = cuda_version

        self.model, self.tokenizer = self.load_model_and_tokenizer()
        print("Got model and tokenizer")

    def load_model_and_tokenizer(self):
        if self.tokenizer_path is None:
            tokenizer_path = self.model_path
        else:
            tokenizer_path = self.tokenizer_path

        print(f'Loading tokenizer from {tokenizer_path}')
        tokenizer = BaichuanTokenizer.from_pretrained(tokenizer_path)

        checkpoint_path = pathlib.Path(self.model_path)
        config_path = checkpoint_path / 'config.ini'

        if config_path.exists():
            # Read model params from config.
            cfg = configparser.ConfigParser()
            cfg.read(config_path)
            model_name = 'baichuan'
            inference_data_type = self.dtype
            if inference_data_type is None:
                inference_data_type = cfg.get(model_name, "weight_data_type")
            model_args = dict(
                head_num=cfg.getint(model_name, 'head_num'),
                size_per_head=cfg.getint(model_name, "size_per_head"),
                inter_size=cfg.getint(model_name, 'inter_size'),
                layer_num=cfg.getint(model_name, "num_layer"),
                rotary_embedding_dim=cfg.getint(model_name, 'rotary_embedding'),
                layernorm_eps=cfg.getfloat(model_name, 'layernorm_eps'),
                vocab_size=cfg.getint(model_name, "vocab_size"),
                start_id=cfg.getint(model_name, "start_id"),
                end_id=cfg.getint(model_name, "end_id"),
                weights_data_type=cfg.get(model_name, "weight_data_type"),
                tensor_para_size=cfg.getint(model_name, "tensor_para_size"),
                inference_data_type=inference_data_type)
        else:
            inference_data_type = self.dtype
            if inference_data_type is None:
                inference_data_type = LYRA_BAICHUAN_PARAM.weights_data_type
            model_args = dict(head_num=LYRA_BAICHUAN_PARAM.num_heads,
                              size_per_head=LYRA_BAICHUAN_PARAM.size_per_head,
                              inter_size=LYRA_BAICHUAN_PARAM.inter_size,
                              layer_num=LYRA_BAICHUAN_PARAM.num_layers,
                              rotary_embedding_dim=LYRA_BAICHUAN_PARAM.rotary_embedding,
                              layernorm_eps=LYRA_BAICHUAN_PARAM.layernorm_eps,
                              vocab_size=LYRA_BAICHUAN_PARAM.vocab_size,
                              start_id=LYRA_BAICHUAN_PARAM.start_id or tokenizer.bos_token_id,
                              end_id=LYRA_BAICHUAN_PARAM.end_id or tokenizer.eos_token_id,
                              weights_data_type=LYRA_BAICHUAN_PARAM.weights_data_type,
                              tensor_para_size=LYRA_BAICHUAN_PARAM.tensor_para_size,
                              inference_data_type=inference_data_type)

        # update common parameters

        # Load the C++ model into a PyTorch model.
        sm = "sm80"

        if self.arch == "Ampere":
            sm = "sm80"
        elif self.arch == "Volta":
            sm = "sm70"
        else:
            raise Exception(f"unsupported arch: {self.arch}")

        cu = 'cu11'
        if self.cuda_version == 11:
            cu = 'cu11'
        elif self.cuda_version == 12:
            cu = 'cu12'
        else:
            raise Exception(f"unsupported cuda version: {self.cuda_version}")

        lib_path = pathlib.Path(__file__).parent / "ftlib" / f"libth_transformer_{sm}_{cu}.so"

        model_args.update(dict(
            lib_path=lib_path,
            model_path=os.path.join(self.model_path, "1-gpu-fp16.bin"),
            max_seq_len=0,  # for position seq embedding
            pipeline_para_size=LYRA_BAICHUAN_PARAM.pipeline_para_size,
            use_gptj_residual=LYRA_BAICHUAN_PARAM.use_gptj_residual,
            memopt_mode=self.memopt_mode
        ))

        print('[FT][INFO] Load Our FT Highly Optimized Baichuan-7B model')
        for k, v in model_args.items():
            print(f' - {k.ljust(25, ".")}: {v}')

        # Check sanity and consistency between the model and tokenizer.
        checklist = ['head_num', 'size_per_head', 'vocab_size', 'layer_num',
                     'tensor_para_size', 'tensor_para_size', 'weights_data_type']
        if None in [model_args[k] for k in checklist]:
            none_params = [p for p in checklist if model_args[p] is None]
            print(f'[FT][WARNING] Found None parameters {none_params}. They must '
                  f'be provided either by config file or CLI arguments.')
        if model_args['start_id'] != tokenizer.bos_token_id:
            print('[FT][WARNING] Given start_id does not match the bos token '
                  'id of the pretrained tokenizer.')
        if model_args['end_id'] not in (tokenizer.pad_token_id, tokenizer.eos_token_id):
            print('[FT][WARNING] Given end_id matches neither the pad '
                  'token id nor the eos token id of the pretrained tokenizer.')

        print(f'Loading model from {self.model_path}')
        model = BaichuanModel(**model_args)
        return model, tokenizer

    def generate(self, prompts: typing.List[str] | str,
                 output_length: int = 512,
                 beam_width: int = 1,
                 top_k: typing.Optional[torch.IntTensor] = 1,
                 top_p: typing.Optional[torch.FloatTensor] = 1.0,
                 beam_search_diversity_rate: typing.Optional[torch.FloatTensor] = 0.0,
                 temperature: typing.Optional[torch.FloatTensor] = 1.0,
                 len_penalty: typing.Optional[torch.FloatTensor] = 0.0,
                 repetition_penalty: typing.Optional[torch.FloatTensor] = 1.0,
                 presence_penalty: typing.Optional[torch.FloatTensor] = None,
                 min_length: typing.Optional[torch.IntTensor] = None,
                 bad_words_list: typing.Optional[torch.IntTensor] = None,
                 do_sample: bool = False,
                 return_output_length: bool = False,
                 return_cum_log_probs: int = 0):
        if isinstance(prompts, str):
            prompts = [prompts, ]

        inputs = prompts

        batch_size = len(inputs)
        ones_int = torch.ones(size=[batch_size], dtype=torch.int32)
        ones_float = torch.ones(size=[batch_size], dtype=torch.float32)

        # Encode the raw prompt texts one by one in order to know the length of each original text.
        input_token_ids = [self.tokenizer(text, return_tensors="pt").input_ids.int().squeeze() for text in inputs]
        input_lengths = torch.IntTensor([len(ids) for ids in input_token_ids])
        # With the per-prompt lengths known, batch the token id lists into one tensor, padding on the right.
        input_token_ids = pad_sequence(input_token_ids, batch_first=True, padding_value=self.tokenizer.eos_token_id)

        random_seed = None
        if do_sample:
            random_seed = torch.randint(0, 262144, (batch_size,), dtype=torch.long)

        outputs = self.model(start_ids=input_token_ids,
                             start_lengths=input_lengths,
                             output_len=output_length,
                             beam_width=beam_width,
                             top_k=top_k * ones_int,
                             top_p=top_p * ones_float,
                             beam_search_diversity_rate=beam_search_diversity_rate * ones_float,
                             temperature=temperature * ones_float,
                             len_penalty=len_penalty * ones_float,
                             repetition_penalty=repetition_penalty * ones_float,
                             random_seed=random_seed,
                             return_output_length=return_output_length,
                             return_cum_log_probs=return_cum_log_probs)

        if return_cum_log_probs > 0:
            outputs = outputs[0]  # output_token_ids.

        # Slice the generated token ids of the 1st beam result.
        # output = input tokens + generated tokens.
        output_token_ids = [out[0, length:].cpu()
                            for out, length in zip(outputs, input_lengths)]

        output_texts = self.tokenizer.batch_decode(
            output_token_ids, skip_special_tokens=True)

        return output_texts


class lyraBaichuan13B:
    def __init__(self, model_path, tokenizer_path=None, dtype='fp16', memopt_mode=0, arch='Ampere', cuda_version=12) -> None:
        self.model_path = model_path
        self.tokenizer_path = tokenizer_path
        self.dtype = dtype
        self.memopt_mode = memopt_mode
        self.arch = arch
        self.cuda_version = cuda_version

        self.model, self.tokenizer = self.load_model_and_tokenizer()
        print("Got model and tokenizer")

    def load_model_and_tokenizer(self):
        if self.tokenizer_path is None:
            tokenizer_path = self.model_path
        else:
            tokenizer_path = self.tokenizer_path

        print(f'Loading tokenizer from {tokenizer_path}')
        tokenizer = BaichuanTokenizer.from_pretrained(tokenizer_path)

        checkpoint_path = pathlib.Path(self.model_path)
        config_path = checkpoint_path / 'config.ini'

        if config_path.exists():
            # Read model params from config.
            cfg = configparser.ConfigParser()
            cfg.read(config_path)
            model_name = 'baichuan'
            inference_data_type = self.dtype
            if inference_data_type is None:
                inference_data_type = cfg.get(model_name, "weight_data_type")
            model_args = dict(
                head_num=cfg.getint(model_name, 'head_num'),
                size_per_head=cfg.getint(model_name, "size_per_head"),
                inter_size=cfg.getint(model_name, 'inter_size'),
                layer_num=cfg.getint(model_name, "num_layer"),
                rotary_embedding_dim=0,
                layernorm_eps=cfg.getfloat(model_name, 'layernorm_eps'),
                vocab_size=cfg.getint(model_name, "vocab_size"),
                start_id=cfg.getint(model_name, "start_id"),
                end_id=cfg.getint(model_name, "end_id"),
                weights_data_type=cfg.get(model_name, "weight_data_type"),
                tensor_para_size=cfg.getint(model_name, "tensor_para_size"),
                inference_data_type=inference_data_type)
        else:
            inference_data_type = self.dtype
            if inference_data_type is None:
                inference_data_type = LYRA_BAICHUAN_PARAM.weights_data_type
            model_args = dict(head_num=LYRA_BAICHUAN_PARAM.num_heads,
                              size_per_head=LYRA_BAICHUAN_PARAM.size_per_head,
                              inter_size=LYRA_BAICHUAN_PARAM.inter_size,
                              layer_num=LYRA_BAICHUAN_PARAM.num_layers,
                              rotary_embedding_dim=0,
                              layernorm_eps=LYRA_BAICHUAN_PARAM.layernorm_eps,
                              vocab_size=LYRA_BAICHUAN_PARAM.vocab_size,
                              start_id=LYRA_BAICHUAN_PARAM.start_id or tokenizer.bos_token_id,
                              end_id=LYRA_BAICHUAN_PARAM.end_id or tokenizer.eos_token_id,
                              weights_data_type=LYRA_BAICHUAN_PARAM.weights_data_type,
                              tensor_para_size=LYRA_BAICHUAN_PARAM.tensor_para_size,
                              inference_data_type=inference_data_type)

        # update common parameters
        # Load the C++ model into a PyTorch model.
        sm = "sm80"

        if self.arch == "Ampere":
            sm = "sm80"
        elif self.arch == "Volta":
            sm = "sm70"
        else:
            raise Exception(f"unsupported arch: {self.arch}")

        cu = 'cu11'
        if self.cuda_version == 11:
            cu = 'cu11'
        elif self.cuda_version == 12:
            cu = 'cu12'
        else:
            raise Exception(f"unsupported cuda version: {self.cuda_version}")

        lib_path = pathlib.Path(__file__).parent / "ftlib" / f"libth_transformer_{sm}_{cu}.so"
        model_args.update(dict(
            lib_path=lib_path,
            model_path=os.path.join(self.model_path, "1-gpu-fp16.bin"),
            max_seq_len=0,  # for position seq embedding
            pipeline_para_size=LYRA_BAICHUAN_PARAM.pipeline_para_size,
            use_gptj_residual=LYRA_BAICHUAN_PARAM.use_gptj_residual,
            memopt_mode=self.memopt_mode
        ))

        print('[FT][INFO] Load Our FT Highly Optimized Baichuan-13B model')
        for k, v in model_args.items():
            print(f' - {k.ljust(25, ".")}: {v}')

        # Check sanity and consistency between the model and tokenizer.
        checklist = ['head_num', 'size_per_head', 'vocab_size', 'layer_num',
                     'tensor_para_size', 'tensor_para_size', 'weights_data_type']
        if None in [model_args[k] for k in checklist]:
            none_params = [p for p in checklist if model_args[p] is None]
            print(f'[FT][WARNING] Found None parameters {none_params}. They must '
                  f'be provided either by config file or CLI arguments.')
        if model_args['start_id'] != tokenizer.bos_token_id:
            print('[FT][WARNING] Given start_id does not match the bos token '
                  'id of the pretrained tokenizer.')
        if model_args['end_id'] not in (tokenizer.pad_token_id, tokenizer.eos_token_id):
            print('[FT][WARNING] Given end_id matches neither the pad '
                  'token id nor the eos token id of the pretrained tokenizer.')

        print(f'Loading model from {self.model_path}')
        model = BaichuanModel(**model_args)
        return model, tokenizer

    def generate(self, prompts: typing.List[str] | str,
                 output_length: int = 512,
                 beam_width: int = 1,
                 top_k: typing.Optional[torch.IntTensor] = 1,
                 top_p: typing.Optional[torch.FloatTensor] = 1.0,
                 beam_search_diversity_rate: typing.Optional[torch.FloatTensor] = 0.0,
                 temperature: typing.Optional[torch.FloatTensor] = 1.0,
                 len_penalty: typing.Optional[torch.FloatTensor] = 0.0,
                 repetition_penalty: typing.Optional[torch.FloatTensor] = 1.0,
                 presence_penalty: typing.Optional[torch.FloatTensor] = None,
                 min_length: typing.Optional[torch.IntTensor] = None,
                 bad_words_list: typing.Optional[torch.IntTensor] = None,
                 do_sample: bool = False,
                 return_output_length: bool = False,
                 return_cum_log_probs: int = 0):
        if isinstance(prompts, str):
            prompts = [prompts, ]

        inputs = prompts

        batch_size = len(inputs)
        ones_int = torch.ones(size=[batch_size], dtype=torch.int32)
        ones_float = torch.ones(size=[batch_size], dtype=torch.float32)

        # Encode the raw prompt texts one by one in order to know the length of each original text.
        input_token_ids = [self.tokenizer(text, return_tensors="pt").input_ids.int().squeeze() for text in inputs]
        input_lengths = torch.IntTensor([len(ids) for ids in input_token_ids])
        # With the per-prompt lengths known, batch the token id lists into one tensor, padding on the right.
        input_token_ids = pad_sequence(input_token_ids, batch_first=True, padding_value=self.tokenizer.eos_token_id)

        random_seed = None
        if do_sample:
            random_seed = torch.randint(0, 262144, (batch_size,), dtype=torch.long)

        outputs = self.model(start_ids=input_token_ids,
                             start_lengths=input_lengths,
                             output_len=output_length,
                             beam_width=beam_width,
                             top_k=top_k * ones_int,
                             top_p=top_p * ones_float,
                             beam_search_diversity_rate=beam_search_diversity_rate * ones_float,
                             temperature=temperature * ones_float,
                             len_penalty=len_penalty * ones_float,
                             repetition_penalty=repetition_penalty * ones_float,
                             random_seed=random_seed,
                             return_output_length=return_output_length,
                             return_cum_log_probs=return_cum_log_probs)

        if return_cum_log_probs > 0:
            outputs = outputs[0]  # output_token_ids.

        # Slice the generated token ids of the 1st beam result.
        # output = input tokens + generated tokens.
        output_token_ids = [out[0, length:].cpu()
                            for out, length in zip(outputs, input_lengths)]

        output_texts = self.tokenizer.batch_decode(
            output_token_ids, skip_special_tokens=True)

        return output_texts
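A brief usage sketch for the wrapper classes above, assuming a converted checkpoint directory laid out as in demo.py (it must contain 1-gpu-fp16.bin plus the tokenizer files); the sampling values are illustrative:

```python
from lyra_baichuan import lyraBaichuan13B

# Assumed checkpoint layout: ./models/Baichuan2-13B-lyra/{1-gpu-fp16.bin, tokenizer.model, ...}
model = lyraBaichuan13B("./models/Baichuan2-13B-lyra", dtype='fp16',
                        memopt_mode=1, arch="Ampere", cuda_version=12)

# generate() accepts a single string or a list of strings and returns decoded texts.
texts = model.generate("登鹳雀楼->王之涣\n夜雨寄北->",
                       output_length=64,
                       top_k=30, top_p=0.85, temperature=1.0,
                       do_sample=True)  # do_sample draws a per-request random seed
print(texts[0])
```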
lyra_baichuan/model.py ADDED
@@ -0,0 +1,166 @@
# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import print_function

import copy
import os
import pathlib
import typing

import numpy as np
import torch
import torch.distributed as dist
import torch.nn as nn

str_type_map = {"fp32": torch.float32, "fp16": torch.float16, "bf16": torch.bfloat16}


class BaichuanModel(nn.Module):
    def __init__(self,
                 head_num,
                 size_per_head,
                 inter_size,
                 vocab_size,
                 rotary_embedding_dim,
                 start_id, end_id, layer_num,
                 max_seq_len: int,
                 layernorm_eps,
                 tensor_para_size: int,
                 pipeline_para_size: int,
                 use_gptj_residual,
                 lib_path: typing.Union[str, pathlib.Path],
                 model_path,
                 memopt_mode: int = 0,
                 inference_data_type: str = "fp16",
                 weights_data_type: typing.Union[str, np.dtype] = np.float32):
        super().__init__()
        self.head_num = head_num
        self.size_per_head = size_per_head
        self.inter_size = inter_size
        self.vocab_size = vocab_size
        self.rotary_embedding_dim = rotary_embedding_dim
        self.start_id = start_id
        self.end_id = end_id
        self.max_seq_len = max_seq_len
        self.layer_num = layer_num
        self.use_gptj_residual = use_gptj_residual
        self.layernorm_eps = layernorm_eps
        self.memopt_mode = memopt_mode

        # multi-gpu params
        self.tensor_para_size = tensor_para_size
        self.pipeline_para_size = pipeline_para_size
        self.build_model = False
        self.weights_data_type = weights_data_type
        self.inference_data_type = inference_data_type

        assert torch.cuda.is_available(), "CUDA is required for this model."

        assert head_num % tensor_para_size == 0, "head_num must be a multiple of tensor_para_size."
        assert layer_num % pipeline_para_size == 0, "layer_num must be a multiple of pipeline_para_size."

        # Load the C++ model into a PyTorch model.
        torch.classes.load_library(os.path.abspath(lib_path))

        # Prepare for tensor/pipeline parallel
        try:
            dist.init_process_group(backend='mpi')
        except:
            print("[INFO] WARNING: The process group has already been initialized")
        self.rank = dist.get_rank()
        self.device_count = torch.cuda.device_count()
        self.device = self.rank % self.device_count
        torch.cuda.set_device(self.device)

        world_size = dist.get_world_size()
        assert world_size == tensor_para_size * pipeline_para_size, "tensor_para_size * pipeline_para_size must be equal to world_size."

        self.tensor_para_rank = self.rank % self.tensor_para_size
        self.pipeline_para_rank = self.rank // self.tensor_para_size

        self.model = torch.classes.FasterTransformer.BaichuanOp(
            self.head_num, self.size_per_head, self.inter_size,
            self.layer_num,
            self.vocab_size,
            self.rotary_embedding_dim,
            self.layernorm_eps,
            self.start_id, self.end_id,
            self.tensor_para_size, self.pipeline_para_size,
            self.max_seq_len,
            self.use_gptj_residual,
            self.memopt_mode,
            model_path,
            self.weights_data_type,
            self.inference_data_type)

        self.build_model = True
        torch.cuda.empty_cache()

    def forward(self,
                start_ids: torch.Tensor,
                start_lengths: torch.Tensor,
                output_len,
                beam_width=1,
                top_k: torch.Tensor = None,
                top_p: torch.Tensor = None,
                beam_search_diversity_rate: torch.Tensor = None,
                temperature: torch.Tensor = None,
                len_penalty: torch.Tensor = None,
                repetition_penalty: torch.Tensor = None,
                random_seed: torch.Tensor = None,
                return_output_length=False,
                return_cum_log_probs=0):

        input_len = start_ids.size(1)
        assert input_len > 0, "input len must be larger than zero. For an unconditional case, use start_id as the first token."

        # Inputs to device
        input_ids = start_ids.cuda(self.device)
        input_lengths = start_lengths.cuda(self.device)
        # outputs: output_ids, output_lengths, output_cum_log_probs (optional)
        outputs = self.model.forward(input_ids,
                                     input_lengths,
                                     output_len,
                                     beam_width,  # optional, can be None
                                     top_k,  # optional, can be None
                                     top_p,  # optional, can be None
                                     beam_search_diversity_rate,  # optional, can be None
                                     temperature,  # optional, can be None
                                     len_penalty,  # optional, can be None
                                     repetition_penalty,  # optional, can be None
                                     random_seed,  # optional, can be None
                                     return_cum_log_probs)  # optional, can be None

        if return_cum_log_probs == 0:
            output_ids, output_lengths = outputs
        else:
            output_ids, output_lengths, output_cum_log_probs = outputs
        if return_output_length:
            if return_cum_log_probs > 0:
                return output_ids, output_lengths, output_cum_log_probs
            else:
                return output_ids, output_lengths
        else:
            return output_ids

    def set_input_tensor(self, input_tensor):
        """Set input tensor to be used instead of forward()'s input.

        When doing pipeline parallelism the input from the previous
        stage comes from communication, not from the input, so the
        model's forward_step_func won't have it. This function is thus
        used by internal code to bypass the input provided by the
        forward_step_func."""
        self.input_tensor = input_tensor
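For orientation, a self-contained sketch of the input preparation that the wrapper classes perform before calling BaichuanModel.forward: variable-length token rows are right-padded into one int32 tensor, and scalar sampling knobs are broadcast to per-request tensors. The token ids below are placeholders, not real vocabulary entries:

```python
import torch
from torch.nn.utils.rnn import pad_sequence

rows = [torch.tensor([1, 17, 42], dtype=torch.int32),   # placeholder token ids
        torch.tensor([1, 9], dtype=torch.int32)]
start_lengths = torch.IntTensor([len(r) for r in rows])             # true lengths before padding
start_ids = pad_sequence(rows, batch_first=True, padding_value=2)    # pad with end_id (2 by default)

batch_size = len(rows)
top_k = 30 * torch.ones(batch_size, dtype=torch.int32)      # per-request top_k
top_p = 0.85 * torch.ones(batch_size, dtype=torch.float32)  # per-request top_p
print(start_ids.shape, start_lengths.tolist(), top_k.tolist(), top_p.tolist())
```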
lyra_baichuan/tokenization_baichuan.py ADDED
@@ -0,0 +1,232 @@
# Copyright (c) 2023, Baichuan Intelligent Technology. All rights reserved.

import os
from shutil import copyfile
from typing import Any, Dict, List, Optional, Tuple

import sentencepiece as spm
from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer
from transformers.utils import logging


logger = logging.get_logger(__name__)

VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model"}

PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {},
    "tokenizer_file": {},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {}


class BaichuanTokenizer(PreTrainedTokenizer):
    """
    Construct a Baichuan tokenizer. Based on byte-level Byte-Pair-Encoding.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file,
        unk_token="<unk>",
        bos_token="<s>",
        eos_token="</s>",
        pad_token=None,
        sp_model_kwargs: Optional[Dict[str, Any]] = None,
        add_bos_token=True,
        add_eos_token=False,
        clean_up_tokenization_spaces=False,
        **kwargs,
    ):
        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
        bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
        eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
        unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
        pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
        super().__init__(
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            pad_token=pad_token,
            add_bos_token=add_bos_token,
            add_eos_token=add_eos_token,
            sp_model_kwargs=self.sp_model_kwargs,
            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
            **kwargs,
        )
        self.vocab_file = vocab_file
        self.add_bos_token = add_bos_token
        self.add_eos_token = add_eos_token
        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        self.sp_model.Load(vocab_file)

    def __getstate__(self):
        state = self.__dict__.copy()
        state["sp_model"] = None
        return state

    def __setstate__(self, d):
        self.__dict__ = d
        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        self.sp_model.Load(self.vocab_file)

    @property
    def vocab_size(self):
        """Returns vocab size"""
        return self.sp_model.get_piece_size()

    def get_vocab(self):
        """Returns vocab as a dict"""
        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
        vocab.update(self.added_tokens_encoder)
        return vocab

    def _tokenize(self, text):
        """Returns a tokenized string."""
        return self.sp_model.encode(text, out_type=str)

    def _convert_token_to_id(self, token):
        """Converts a token (str) to an id using the vocab."""
        return self.sp_model.piece_to_id(token)

    def _convert_id_to_token(self, index):
        """Converts an index (integer) to a token (str) using the vocab."""
        token = self.sp_model.IdToPiece(index)
        return token

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (strings) into a single string."""
        current_sub_tokens = []
        out_string = ""
        prev_is_special = False
        for i, token in enumerate(tokens):
            # make sure that special tokens are not decoded using sentencepiece model
            if token in self.all_special_tokens:
                if not prev_is_special and i != 0:
                    out_string += " "
                out_string += self.sp_model.decode(current_sub_tokens) + token
                prev_is_special = True
                current_sub_tokens = []
            else:
                current_sub_tokens.append(token)
                prev_is_special = False
        out_string += self.sp_model.decode(current_sub_tokens)
        return out_string

    def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None) -> Tuple[str]:
        """
        Save the vocabulary and special tokens file to a directory.

        Args:
            save_directory (`str`):
                The directory in which to save the vocabulary.

        Returns:
            `Tuple(str)`: Paths to the files saved.
        """
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return
        out_vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )

        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
            copyfile(self.vocab_file, out_vocab_file)
        elif not os.path.isfile(self.vocab_file):
            with open(out_vocab_file, "wb") as fi:
                content_spiece_model = self.sp_model.serialized_model_proto()
                fi.write(content_spiece_model)

        return (out_vocab_file,)

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        bos_token_id = [self.bos_token_id] if self.add_bos_token else []
        eos_token_id = [self.eos_token_id] if self.add_eos_token else []

        output = bos_token_id + token_ids_0 + eos_token_id

        if token_ids_1 is not None:
            output = output + bos_token_id + token_ids_1 + eos_token_id

        return output

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )

        bos_token_id = [1] if self.add_bos_token else []
        eos_token_id = [1] if self.add_eos_token else []

        if token_ids_1 is None:
            return bos_token_id + ([0] * len(token_ids_0)) + eos_token_id
        return (
            bos_token_id
            + ([0] * len(token_ids_0))
            + eos_token_id
            + bos_token_id
            + ([0] * len(token_ids_1))
            + eos_token_id
        )

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Creates a mask from the two sequences passed to be used in a sequence-pair classification task. An ALBERT
        sequence pair mask has the following format:

        ```
        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
        | first sequence    | second sequence |
        ```

        if token_ids_1 is None, only returns the first portion of the mask (0s).

        Args:
            token_ids_0 (`List[int]`):
                List of ids.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
        """
        bos_token_id = [self.bos_token_id] if self.add_bos_token else []
        eos_token_id = [self.eos_token_id] if self.add_eos_token else []

        output = [0] * len(bos_token_id + token_ids_0 + eos_token_id)

        if token_ids_1 is not None:
            output += [1] * len(bos_token_id + token_ids_1 + eos_token_id)

        return output
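A short sketch of the tokenizer round-trip that generate() relies on, assuming the checkpoint directory contains the sentencepiece file tokenizer.model:

```python
from lyra_baichuan.tokenization_baichuan import BaichuanTokenizer

tok = BaichuanTokenizer.from_pretrained("./models/Baichuan2-13B-lyra")  # assumed checkpoint path

ids = tok("登鹳雀楼->王之涣", return_tensors="pt").input_ids
print(ids[0, 0].item() == tok.bos_token_id)  # add_bos_token=True prepends <s>

print(tok.batch_decode(ids, skip_special_tokens=True))  # decode back to text
```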
requirements.txt ADDED
@@ -0,0 +1,6 @@
transformers
numpy
setuptools
torch
bfloat16
sentencepiece