carsonhxsu committed
Commit f50d964 · 1 Parent(s): 4f41ca5

init code

Browse files:
- .gitattributes +1 -0
- .gitignore +6 -0
- README.md +112 -0
- demo.py +26 -0
- lyra_baichuan/__init__.py +1 -0
- lyra_baichuan/config.py +34 -0
- lyra_baichuan/lyra_baichuan.py +367 -0
- lyra_baichuan/model.py +166 -0
- lyra_baichuan/tokenization_baichuan.py +232 -0
- requirements.txt +6 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.so filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,6 @@
dist/
*.egg-info/
__pycache__
build/
.vscode
.idea
README.md CHANGED
@@ -1,3 +1,115 @@
---
license: mit
language: en
tags:
- LLM
- Baichuan-7B
- Baichuan-13B
- Baichuan2-7B
- Baichuan2-13B
---
## Model Card for lyraBaichuan

lyraBaichuan currently provides the **fastest Baichuan models** (Baichuan-7B, Baichuan-13B, Baichuan2-7B, Baichuan2-13B) available. Its inference speed reaches up to **4300+ tokens/s** on A100, up to **2.4x** faster than the PyTorch version.

Among its main features are:
- device: Nvidia GPUs with Ampere or Volta architecture (A100 or higher, V100).
- batch_size: compiled with dynamic batch size; the maximum depends on the device.
- MEMOPT mode: significantly reduced VRAM usage and increased speed.

We use the Baichuan2-7B-Base and Baichuan2-13B-Base models for measurement, but the optimized inference also applies to other Baichuan models, including Baichuan-7B and Baichuan-13B.

## Speed

* Evaluated in tokens/s
* Tested on A100 40G
* MEMOPT mode

### Baichuan2-7B-Base

| Version | Batch Size 1 | Batch Size 8 | Batch Size 16 | Batch Size 32 | Batch Size 64 |
| --- | --- | --- | --- | --- | --- |
| Torch 2.0.1 | 41.2 | 323.2 | 640.0 | 1256.8 | 2231.0 |
| lyraBaichuan MEMOPT | 125.9 | 948.1 | 1749.3 | 2974.0 | 4370.1 |

### Baichuan2-13B-Base

| Version | Batch Size 1 | Batch Size 8 | Batch Size 16 | Batch Size 32 | Batch Size 64 |
| --- | --- | --- | --- | --- | --- |
| Torch 2.0.1 | 40.9 | 307.9 | 555.6 | 1010.4 | 1601.0 |
| lyraBaichuan MEMOPT | 80.0 | 568.2 | 1124.4 | 1942.6 | 2828.0 |

## Docker Environment Recommendation

- For CUDA 11.X: we recommend ```nvcr.io/nvidia/pytorch:22.12-py3```
- For CUDA 12.0: we recommend ```nvcr.io/nvidia/pytorch:23.02-py3```

```bash
docker pull nvcr.io/nvidia/pytorch:23.02-py3
docker run --rm -it --gpus all -v ./:/lyraBaichuan nvcr.io/nvidia/pytorch:23.02-py3

pip install -r requirements.txt
python demo.py
```

## Uses

```python
from lyra_baichuan import lyraBaichuan7B, lyraBaichuan13B

model_path = "./models/Baichuan2-13B-lyra"
tokenizer_path = "./models/Baichuan2-13B-lyra"
inference_dtype = 'fp16'
prompt = "登鹳雀楼->王之涣\n夜雨寄北->"

memopt_mode = 1
max_output_length = 64
arch = "Ampere"  # Ampere or Volta
cuda_version = 12  # CUDA version; 11 and 12 are currently supported

# To use the 7B model, initialize with lyraBaichuan7B
model = lyraBaichuan13B(model_path,
                        tokenizer_path=tokenizer_path,
                        dtype=inference_dtype,
                        memopt_mode=memopt_mode,
                        arch=arch,
                        cuda_version=cuda_version)

bs = 1
prompts = [prompt, ] * bs
output_texts = model.generate(
    prompts, output_length=max_output_length,
    top_k=30, top_p=0.85, temperature=1.0, repetition_penalty=1.0, do_sample=False)

print(output_texts)
```

## Demo Outputs

### Baichuan2-13B-Base
#### input

登鹳雀楼->王之涣

夜雨寄北->

#### output

## TODO
1. Support for int4
2. Inference for longer contexts
3. Streaming inference mode

## Citation
``` bibtex
@Misc{lyraBaichuan2023,
  author =       {Haoxiong Su and Kangjian Wu and Zhengtao Wang and Yibo Lu and Bin Wu},
  title =        {lyraBaichuan: Accelerating Baichuan models to 4300+ tokens/s},
  howpublished = {\url{https://huggingface.co/TMElyralab/lyraBaichuan}},
  year =         {2023}
}
```

## Report bug
- Start a discussion to report any bugs: https://huggingface.co/TMElyralab/lyraBaichuan
- Report bugs with a `[bug]` mark in the title.
demo.py ADDED
@@ -0,0 +1,26 @@
from lyra_baichuan import lyraBaichuan7B, lyraBaichuan13B

model_path = "./models/Baichuan2-13B-lyra"
tokenizer_path = "./models/Baichuan2-13B-lyra"
inference_dtype = 'fp16'
prompt = "登鹳雀楼->王之涣\n夜雨寄北->"

memopt_mode = 1
max_output_length = 64
arch = "Ampere"  # Ampere or Volta
cuda_version = 12  # CUDA version; 11 and 12 are currently supported

model = lyraBaichuan13B(model_path,
                        tokenizer_path=tokenizer_path,
                        dtype=inference_dtype,
                        memopt_mode=memopt_mode,
                        arch=arch,
                        cuda_version=cuda_version)

bs = 1
prompts = [prompt, ] * bs
output_texts = model.generate(
    prompts, output_length=max_output_length,
    top_k=30, top_p=0.85, temperature=1.0, repetition_penalty=1.0, do_sample=False)

print(output_texts)
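demo.py decodes greedily (`do_sample=False`). If stochastic outputs are wanted, a sampled-decoding variant can reuse the objects defined in the demo; the parameter values below are illustrative, not tuned recommendations:

```python
# Continuation of demo.py: sampled decoding with the same `model` and `prompts`.
# Illustrative settings; with do_sample=True a per-request random seed is drawn internally.
sampled_texts = model.generate(
    prompts, output_length=max_output_length,
    top_k=30, top_p=0.85, temperature=0.9, repetition_penalty=1.1, do_sample=True)
print(sampled_texts)
```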
lyra_baichuan/__init__.py ADDED
@@ -0,0 +1 @@
from .lyra_baichuan import lyraBaichuan7B, lyraBaichuan13B
lyra_baichuan/config.py ADDED
@@ -0,0 +1,34 @@
import dataclasses
from typing import Optional


@dataclasses.dataclass
class LyraBaichuanParam:
    num_heads: int = 40
    size_per_head: int = 128
    inter_size: int = 13824
    num_layers: int = 40
    vocab_size: int = 39424
    start_id: Optional[int] = 1
    end_id: Optional[int] = 2
    tensor_para_size: int = 1
    pipeline_para_size: int = 1
    remove_padding: bool = True
    shared_contexts_ratio: float = 1.0
    layernorm_eps: float = 1e-6
    weights_data_type: str = "fp16"
    rotary_embedding: int = 128
    use_gptj_residual: bool = False

    def __post_init__(self):
        if not 0.0 <= self.shared_contexts_ratio <= 1.0:
            raise ValueError(
                f'Got an invalid value of shared_contexts_ratio '
                f'{self.shared_contexts_ratio} - range: [0.0, 1.0]')

    def asdict(self):
        return dataclasses.asdict(self)


LYRA_BAICHUAN_PARAM = LyraBaichuanParam()
LIB_SO_PATH = '/usr/lib/ftlib/libth_transformer.so'
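The defaults above describe the 13B-style layout and are consumed by lyra_baichuan.py when no config.ini is found next to the checkpoint. A minimal sketch of inspecting or overriding them; the alternative values below are illustrative assumptions, not settings shipped with this repo:

```python
from lyra_baichuan.config import LYRA_BAICHUAN_PARAM, LyraBaichuanParam

# Inspect the packaged defaults (13B-style: 40 heads, 40 layers, vocab 39424).
print(LYRA_BAICHUAN_PARAM.asdict())

# Override per instance; __post_init__ only validates shared_contexts_ratio.
custom = LyraBaichuanParam(num_heads=32, num_layers=32, inter_size=11008)  # illustrative values
print(custom.num_heads, custom.inter_size)
```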
lyra_baichuan/lyra_baichuan.py ADDED
@@ -0,0 +1,367 @@
from __future__ import annotations

import configparser
import pathlib
import typing
import os

import torch
import transformers
from torch.nn.utils.rnn import pad_sequence

from .config import LYRA_BAICHUAN_PARAM, LIB_SO_PATH
from .model import BaichuanModel
from .tokenization_baichuan import BaichuanTokenizer


class lyraBaichuan7B:
    def __init__(self, model_path, tokenizer_path=None, dtype='fp16', memopt_mode=0, arch='Ampere', cuda_version=12) -> None:
        self.model_path = model_path
        self.tokenizer_path = tokenizer_path
        self.dtype = dtype
        self.memopt_mode = memopt_mode
        self.arch = arch
        self.cuda_version = cuda_version

        self.model, self.tokenizer = self.load_model_and_tokenizer()
        print("Got model and tokenizer")

    def load_model_and_tokenizer(self):
        if self.tokenizer_path is None:
            tokenizer_path = self.model_path
        else:
            tokenizer_path = self.tokenizer_path

        print(f'Loading tokenizer from {tokenizer_path}')
        tokenizer = BaichuanTokenizer.from_pretrained(tokenizer_path)

        checkpoint_path = pathlib.Path(self.model_path)
        config_path = checkpoint_path / 'config.ini'

        if config_path.exists():
            # Read model params from config.
            cfg = configparser.ConfigParser()
            cfg.read(config_path)
            model_name = 'baichuan'
            inference_data_type = self.dtype
            if inference_data_type is None:
                inference_data_type = cfg.get(model_name, "weight_data_type")
            model_args = dict(
                head_num=cfg.getint(model_name, 'head_num'),
                size_per_head=cfg.getint(model_name, "size_per_head"),
                inter_size=cfg.getint(model_name, 'inter_size'),
                layer_num=cfg.getint(model_name, "num_layer"),
                rotary_embedding_dim=cfg.getint(model_name, 'rotary_embedding'),
                layernorm_eps=cfg.getfloat(model_name, 'layernorm_eps'),
                vocab_size=cfg.getint(model_name, "vocab_size"),
                start_id=cfg.getint(model_name, "start_id"),
                end_id=cfg.getint(model_name, "end_id"),
                weights_data_type=cfg.get(model_name, "weight_data_type"),
                tensor_para_size=cfg.getint(model_name, "tensor_para_size"),
                inference_data_type=inference_data_type)
        else:
            inference_data_type = self.dtype
            if inference_data_type is None:
                inference_data_type = LYRA_BAICHUAN_PARAM.weights_data_type
            model_args = dict(head_num=LYRA_BAICHUAN_PARAM.num_heads,
                              size_per_head=LYRA_BAICHUAN_PARAM.size_per_head,
                              inter_size=LYRA_BAICHUAN_PARAM.inter_size,
                              layer_num=LYRA_BAICHUAN_PARAM.num_layers,
                              rotary_embedding_dim=LYRA_BAICHUAN_PARAM.rotary_embedding,
                              layernorm_eps=LYRA_BAICHUAN_PARAM.layernorm_eps,
                              vocab_size=LYRA_BAICHUAN_PARAM.vocab_size,
                              start_id=LYRA_BAICHUAN_PARAM.start_id or tokenizer.bos_token_id,
                              end_id=LYRA_BAICHUAN_PARAM.end_id or tokenizer.eos_token_id,
                              weights_data_type=LYRA_BAICHUAN_PARAM.weights_data_type,
                              tensor_para_size=LYRA_BAICHUAN_PARAM.tensor_para_size,
                              inference_data_type=inference_data_type)

        # update common parameters

        # Load the C++ model into a PyTorch model.
        sm = "sm80"

        if self.arch == "Ampere":
            sm = "sm80"
        elif self.arch == "Volta":
            sm = "sm70"
        else:
            raise Exception(f"unsupported arch: {self.arch}")

        cu = 'cu11'
        if self.cuda_version == 11:
            cu = 'cu11'
        elif self.cuda_version == 12:
            cu = 'cu12'
        else:
            raise Exception(f"unsupported cuda version: {self.cuda_version}")

        lib_path = pathlib.Path(__file__).parent / "ftlib" / f"libth_transformer_{sm}_{cu}.so"

        model_args.update(dict(
            lib_path=lib_path,
            model_path=os.path.join(self.model_path, "1-gpu-fp16.bin"),
            max_seq_len=0,  # for position seq embedding
            pipeline_para_size=LYRA_BAICHUAN_PARAM.pipeline_para_size,
            use_gptj_residual=LYRA_BAICHUAN_PARAM.use_gptj_residual,
            memopt_mode=self.memopt_mode
        ))

        print('[FT][INFO] Load Our FT Highly Optimized Baichuan-7B model')
        for k, v in model_args.items():
            print(f' - {k.ljust(25, ".")}: {v}')

        # Check sanity and consistency between the model and tokenizer.
        checklist = ['head_num', 'size_per_head', 'vocab_size', 'layer_num',
                     'tensor_para_size', 'tensor_para_size', 'weights_data_type']
        if None in [model_args[k] for k in checklist]:
            none_params = [p for p in checklist if model_args[p] is None]
            print(f'[FT][WARNING] Found None parameters {none_params}. They must '
                  f'be provided either by config file or CLI arguments.')
        if model_args['start_id'] != tokenizer.bos_token_id:
            print('[FT][WARNING] Given start_id does not match the bos token '
                  'id of the pretrained tokenizer.')
        if model_args['end_id'] not in (tokenizer.pad_token_id, tokenizer.eos_token_id):
            print('[FT][WARNING] Given end_id matches neither the pad '
                  'token id nor the eos token id of the pretrained tokenizer.')

        print(f'Loading model from {self.model_path}')
        model = BaichuanModel(**model_args)
        return model, tokenizer

    def generate(self, prompts: typing.List[str] | str,
                 output_length: int = 512,
                 beam_width: int = 1,
                 top_k: typing.Optional[torch.IntTensor] = 1,
                 top_p: typing.Optional[torch.FloatTensor] = 1.0,
                 beam_search_diversity_rate: typing.Optional[torch.FloatTensor] = 0.0,
                 temperature: typing.Optional[torch.FloatTensor] = 1.0,
                 len_penalty: typing.Optional[torch.FloatTensor] = 0.0,
                 repetition_penalty: typing.Optional[torch.FloatTensor] = 1.0,
                 presence_penalty: typing.Optional[torch.FloatTensor] = None,
                 min_length: typing.Optional[torch.IntTensor] = None,
                 bad_words_list: typing.Optional[torch.IntTensor] = None,
                 do_sample: bool = False,
                 return_output_length: bool = False,
                 return_cum_log_probs: int = 0):
        if isinstance(prompts, str):
            prompts = [prompts, ]

        inputs = prompts

        batch_size = len(inputs)
        ones_int = torch.ones(size=[batch_size], dtype=torch.int32)
        ones_float = torch.ones(size=[batch_size], dtype=torch.float32)

        # Encode the raw prompt texts one by one in order to know the length of each original text.
        input_token_ids = [self.tokenizer(text, return_tensors="pt").input_ids.int().squeeze() for text in inputs]
        input_lengths = torch.IntTensor([len(ids) for ids in input_token_ids])
        # With the per-prompt lengths known, batch the token id lists into one tensor, padding on the right.
        input_token_ids = pad_sequence(input_token_ids, batch_first=True, padding_value=self.tokenizer.eos_token_id)

        random_seed = None
        if do_sample:
            random_seed = torch.randint(0, 262144, (batch_size,), dtype=torch.long)

        outputs = self.model(start_ids=input_token_ids,
                             start_lengths=input_lengths,
                             output_len=output_length,
                             beam_width=beam_width,
                             top_k=top_k * ones_int,
                             top_p=top_p * ones_float,
                             beam_search_diversity_rate=beam_search_diversity_rate * ones_float,
                             temperature=temperature * ones_float,
                             len_penalty=len_penalty * ones_float,
                             repetition_penalty=repetition_penalty * ones_float,
                             random_seed=random_seed,
                             return_output_length=return_output_length,
                             return_cum_log_probs=return_cum_log_probs)

        if return_cum_log_probs > 0:
            outputs = outputs[0]  # output_token_ids.

        # Slice the generated token ids of the 1st beam result.
        # output = input tokens + generated tokens.
        output_token_ids = [out[0, length:].cpu()
                            for out, length in zip(outputs, input_lengths)]

        output_texts = self.tokenizer.batch_decode(
            output_token_ids, skip_special_tokens=True)

        return output_texts


class lyraBaichuan13B:
    def __init__(self, model_path, tokenizer_path=None, dtype='fp16', memopt_mode=0, arch='Ampere', cuda_version=12) -> None:
        self.model_path = model_path
        self.tokenizer_path = tokenizer_path
        self.dtype = dtype
        self.memopt_mode = memopt_mode
        self.arch = arch
        self.cuda_version = cuda_version

        self.model, self.tokenizer = self.load_model_and_tokenizer()
        print("Got model and tokenizer")

    def load_model_and_tokenizer(self):
        if self.tokenizer_path is None:
            tokenizer_path = self.model_path
        else:
            tokenizer_path = self.tokenizer_path

        print(f'Loading tokenizer from {tokenizer_path}')
        tokenizer = BaichuanTokenizer.from_pretrained(tokenizer_path)

        checkpoint_path = pathlib.Path(self.model_path)
        config_path = checkpoint_path / 'config.ini'

        if config_path.exists():
            # Read model params from config.
            cfg = configparser.ConfigParser()
            cfg.read(config_path)
            model_name = 'baichuan'
            inference_data_type = self.dtype
            if inference_data_type is None:
                inference_data_type = cfg.get(model_name, "weight_data_type")
            model_args = dict(
                head_num=cfg.getint(model_name, 'head_num'),
                size_per_head=cfg.getint(model_name, "size_per_head"),
                inter_size=cfg.getint(model_name, 'inter_size'),
                layer_num=cfg.getint(model_name, "num_layer"),
                rotary_embedding_dim=0,
                layernorm_eps=cfg.getfloat(model_name, 'layernorm_eps'),
                vocab_size=cfg.getint(model_name, "vocab_size"),
                start_id=cfg.getint(model_name, "start_id"),
                end_id=cfg.getint(model_name, "end_id"),
                weights_data_type=cfg.get(model_name, "weight_data_type"),
                tensor_para_size=cfg.getint(model_name, "tensor_para_size"),
                inference_data_type=inference_data_type)
        else:
            inference_data_type = self.dtype
            if inference_data_type is None:
                inference_data_type = LYRA_BAICHUAN_PARAM.weights_data_type
            model_args = dict(head_num=LYRA_BAICHUAN_PARAM.num_heads,
                              size_per_head=LYRA_BAICHUAN_PARAM.size_per_head,
                              inter_size=LYRA_BAICHUAN_PARAM.inter_size,
                              layer_num=LYRA_BAICHUAN_PARAM.num_layers,
                              rotary_embedding_dim=0,
                              layernorm_eps=LYRA_BAICHUAN_PARAM.layernorm_eps,
                              vocab_size=LYRA_BAICHUAN_PARAM.vocab_size,
                              start_id=LYRA_BAICHUAN_PARAM.start_id or tokenizer.bos_token_id,
                              end_id=LYRA_BAICHUAN_PARAM.end_id or tokenizer.eos_token_id,
                              weights_data_type=LYRA_BAICHUAN_PARAM.weights_data_type,
                              tensor_para_size=LYRA_BAICHUAN_PARAM.tensor_para_size,
                              inference_data_type=inference_data_type)

        # update common parameters
        # Load the C++ model into a PyTorch model.
        sm = "sm80"

        if self.arch == "Ampere":
            sm = "sm80"
        elif self.arch == "Volta":
            sm = "sm70"
        else:
            raise Exception(f"unsupported arch: {self.arch}")

        cu = 'cu11'
        if self.cuda_version == 11:
            cu = 'cu11'
        elif self.cuda_version == 12:
            cu = 'cu12'
        else:
            raise Exception(f"unsupported cuda version: {self.cuda_version}")

        lib_path = pathlib.Path(__file__).parent / "ftlib" / f"libth_transformer_{sm}_{cu}.so"
        model_args.update(dict(
            lib_path=lib_path,
            model_path=os.path.join(self.model_path, "1-gpu-fp16.bin"),
            max_seq_len=0,  # for position seq embedding
            pipeline_para_size=LYRA_BAICHUAN_PARAM.pipeline_para_size,
            use_gptj_residual=LYRA_BAICHUAN_PARAM.use_gptj_residual,
            memopt_mode=self.memopt_mode
        ))

        print('[FT][INFO] Load Our FT Highly Optimized Baichuan-13B model')
        for k, v in model_args.items():
            print(f' - {k.ljust(25, ".")}: {v}')

        # Check sanity and consistency between the model and tokenizer.
        checklist = ['head_num', 'size_per_head', 'vocab_size', 'layer_num',
                     'tensor_para_size', 'tensor_para_size', 'weights_data_type']
        if None in [model_args[k] for k in checklist]:
            none_params = [p for p in checklist if model_args[p] is None]
            print(f'[FT][WARNING] Found None parameters {none_params}. They must '
                  f'be provided either by config file or CLI arguments.')
        if model_args['start_id'] != tokenizer.bos_token_id:
            print('[FT][WARNING] Given start_id does not match the bos token '
                  'id of the pretrained tokenizer.')
        if model_args['end_id'] not in (tokenizer.pad_token_id, tokenizer.eos_token_id):
            print('[FT][WARNING] Given end_id matches neither the pad '
                  'token id nor the eos token id of the pretrained tokenizer.')

        print(f'Loading model from {self.model_path}')
        model = BaichuanModel(**model_args)
        return model, tokenizer

    def generate(self, prompts: typing.List[str] | str,
                 output_length: int = 512,
                 beam_width: int = 1,
                 top_k: typing.Optional[torch.IntTensor] = 1,
                 top_p: typing.Optional[torch.FloatTensor] = 1.0,
                 beam_search_diversity_rate: typing.Optional[torch.FloatTensor] = 0.0,
                 temperature: typing.Optional[torch.FloatTensor] = 1.0,
                 len_penalty: typing.Optional[torch.FloatTensor] = 0.0,
                 repetition_penalty: typing.Optional[torch.FloatTensor] = 1.0,
                 presence_penalty: typing.Optional[torch.FloatTensor] = None,
                 min_length: typing.Optional[torch.IntTensor] = None,
                 bad_words_list: typing.Optional[torch.IntTensor] = None,
                 do_sample: bool = False,
                 return_output_length: bool = False,
                 return_cum_log_probs: int = 0):
        if isinstance(prompts, str):
            prompts = [prompts, ]

        inputs = prompts

        batch_size = len(inputs)
        ones_int = torch.ones(size=[batch_size], dtype=torch.int32)
        ones_float = torch.ones(size=[batch_size], dtype=torch.float32)

        # Encode the raw prompt texts one by one in order to know the length of each original text.
        input_token_ids = [self.tokenizer(text, return_tensors="pt").input_ids.int().squeeze() for text in inputs]
        input_lengths = torch.IntTensor([len(ids) for ids in input_token_ids])
        # With the per-prompt lengths known, batch the token id lists into one tensor, padding on the right.
        input_token_ids = pad_sequence(input_token_ids, batch_first=True, padding_value=self.tokenizer.eos_token_id)

        random_seed = None
        if do_sample:
            random_seed = torch.randint(0, 262144, (batch_size,), dtype=torch.long)

        outputs = self.model(start_ids=input_token_ids,
                             start_lengths=input_lengths,
                             output_len=output_length,
                             beam_width=beam_width,
                             top_k=top_k * ones_int,
                             top_p=top_p * ones_float,
                             beam_search_diversity_rate=beam_search_diversity_rate * ones_float,
                             temperature=temperature * ones_float,
                             len_penalty=len_penalty * ones_float,
                             repetition_penalty=repetition_penalty * ones_float,
                             random_seed=random_seed,
                             return_output_length=return_output_length,
                             return_cum_log_probs=return_cum_log_probs)

        if return_cum_log_probs > 0:
            outputs = outputs[0]  # output_token_ids.

        # Slice the generated token ids of the 1st beam result.
        # output = input tokens + generated tokens.
        output_token_ids = [out[0, length:].cpu()
                            for out, length in zip(outputs, input_lengths)]

        output_texts = self.tokenizer.batch_decode(
            output_token_ids, skip_special_tokens=True)

        return output_texts
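A brief usage sketch for the wrapper classes above, assuming a converted checkpoint directory laid out as in demo.py (it must contain 1-gpu-fp16.bin plus the tokenizer files); the sampling values are illustrative:

```python
from lyra_baichuan import lyraBaichuan13B

# Assumed checkpoint layout: ./models/Baichuan2-13B-lyra/{1-gpu-fp16.bin, tokenizer.model, ...}
model = lyraBaichuan13B("./models/Baichuan2-13B-lyra", dtype='fp16',
                        memopt_mode=1, arch="Ampere", cuda_version=12)

# generate() accepts a single string or a list of strings and returns decoded texts.
texts = model.generate("登鹳雀楼->王之涣\n夜雨寄北->",
                       output_length=64,
                       top_k=30, top_p=0.85, temperature=1.0,
                       do_sample=True)  # do_sample draws a per-request random seed
print(texts[0])
```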
lyra_baichuan/model.py ADDED
@@ -0,0 +1,166 @@
# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import print_function

import copy
import os
import pathlib
import typing

import numpy as np
import torch
import torch.distributed as dist
import torch.nn as nn

str_type_map = {"fp32": torch.float32, "fp16": torch.float16, "bf16": torch.bfloat16}


class BaichuanModel(nn.Module):
    def __init__(self,
                 head_num,
                 size_per_head,
                 inter_size,
                 vocab_size,
                 rotary_embedding_dim,
                 start_id, end_id, layer_num,
                 max_seq_len: int,
                 layernorm_eps,
                 tensor_para_size: int,
                 pipeline_para_size: int,
                 use_gptj_residual,
                 lib_path: typing.Union[str, pathlib.Path],
                 model_path,
                 memopt_mode: int = 0,
                 inference_data_type: str = "fp16",
                 weights_data_type: typing.Union[str, np.dtype] = np.float32):
        super().__init__()
        self.head_num = head_num
        self.size_per_head = size_per_head
        self.inter_size = inter_size
        self.vocab_size = vocab_size
        self.rotary_embedding_dim = rotary_embedding_dim
        self.start_id = start_id
        self.end_id = end_id
        self.max_seq_len = max_seq_len
        self.layer_num = layer_num
        self.use_gptj_residual = use_gptj_residual
        self.layernorm_eps = layernorm_eps
        self.memopt_mode = memopt_mode

        # multi-gpu params
        self.tensor_para_size = tensor_para_size
        self.pipeline_para_size = pipeline_para_size
        self.build_model = False
        self.weights_data_type = weights_data_type
        self.inference_data_type = inference_data_type

        assert torch.cuda.is_available(), "CUDA is required for this model."

        assert head_num % tensor_para_size == 0, "head_num must be a multiple of tensor_para_size."
        assert layer_num % pipeline_para_size == 0, "layer_num must be a multiple of pipeline_para_size."

        # Load the C++ model into a PyTorch model.
        torch.classes.load_library(os.path.abspath(lib_path))

        # Prepare for tensor/pipeline parallel
        try:
            dist.init_process_group(backend='mpi')
        except:
            print("[INFO] WARNING: The process group has already been initialized")
        self.rank = dist.get_rank()
        self.device_count = torch.cuda.device_count()
        self.device = self.rank % self.device_count
        torch.cuda.set_device(self.device)

        world_size = dist.get_world_size()
        assert world_size == tensor_para_size * pipeline_para_size, "tensor_para_size * pipeline_para_size must be equal to world_size."

        self.tensor_para_rank = self.rank % self.tensor_para_size
        self.pipeline_para_rank = self.rank // self.tensor_para_size

        self.model = torch.classes.FasterTransformer.BaichuanOp(
            self.head_num, self.size_per_head, self.inter_size,
            self.layer_num,
            self.vocab_size,
            self.rotary_embedding_dim,
            self.layernorm_eps,
            self.start_id, self.end_id,
            self.tensor_para_size, self.pipeline_para_size,
            self.max_seq_len,
            self.use_gptj_residual,
            self.memopt_mode,
            model_path,
            self.weights_data_type,
            self.inference_data_type)

        self.build_model = True
        torch.cuda.empty_cache()

    def forward(self,
                start_ids: torch.Tensor,
                start_lengths: torch.Tensor,
                output_len,
                beam_width=1,
                top_k: torch.Tensor = None,
                top_p: torch.Tensor = None,
                beam_search_diversity_rate: torch.Tensor = None,
                temperature: torch.Tensor = None,
                len_penalty: torch.Tensor = None,
                repetition_penalty: torch.Tensor = None,
                random_seed: torch.Tensor = None,
                return_output_length=False,
                return_cum_log_probs=0):

        input_len = start_ids.size(1)
        assert input_len > 0, "input len must be larger than zero. For an unconditional case, use start_id as the first token."

        # Inputs to device
        input_ids = start_ids.cuda(self.device)
        input_lengths = start_lengths.cuda(self.device)
        # outputs: output_ids, output_lengths, output_cum_log_probs (optional)
        outputs = self.model.forward(input_ids,
                                     input_lengths,
                                     output_len,
                                     beam_width,  # optional, can be None
                                     top_k,  # optional, can be None
                                     top_p,  # optional, can be None
                                     beam_search_diversity_rate,  # optional, can be None
                                     temperature,  # optional, can be None
                                     len_penalty,  # optional, can be None
                                     repetition_penalty,  # optional, can be None
                                     random_seed,  # optional, can be None
                                     return_cum_log_probs)  # optional, can be None

        if return_cum_log_probs == 0:
            output_ids, output_lengths = outputs
        else:
            output_ids, output_lengths, output_cum_log_probs = outputs
        if return_output_length:
            if return_cum_log_probs > 0:
                return output_ids, output_lengths, output_cum_log_probs
            else:
                return output_ids, output_lengths
        else:
            return output_ids

    def set_input_tensor(self, input_tensor):
        """Set input tensor to be used instead of forward()'s input.

        When doing pipeline parallelism the input from the previous
        stage comes from communication, not from the input, so the
        model's forward_step_func won't have it. This function is thus
        used by internal code to bypass the input provided by the
        forward_step_func."""
        self.input_tensor = input_tensor
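For orientation, a self-contained sketch of the input preparation that the wrapper classes perform before calling BaichuanModel.forward: variable-length token rows are right-padded into one int32 tensor, and scalar sampling knobs are broadcast to per-request tensors. The token ids below are placeholders, not real vocabulary entries:

```python
import torch
from torch.nn.utils.rnn import pad_sequence

rows = [torch.tensor([1, 17, 42], dtype=torch.int32),   # placeholder token ids
        torch.tensor([1, 9], dtype=torch.int32)]
start_lengths = torch.IntTensor([len(r) for r in rows])             # true lengths before padding
start_ids = pad_sequence(rows, batch_first=True, padding_value=2)    # pad with end_id (2 by default)

batch_size = len(rows)
top_k = 30 * torch.ones(batch_size, dtype=torch.int32)      # per-request top_k
top_p = 0.85 * torch.ones(batch_size, dtype=torch.float32)  # per-request top_p
print(start_ids.shape, start_lengths.tolist(), top_k.tolist(), top_p.tolist())
```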
lyra_baichuan/tokenization_baichuan.py ADDED
@@ -0,0 +1,232 @@
# Copyright (c) 2023, Baichuan Intelligent Technology. All rights reserved.

import os
from shutil import copyfile
from typing import Any, Dict, List, Optional, Tuple

import sentencepiece as spm
from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer
from transformers.utils import logging


logger = logging.get_logger(__name__)

VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model"}

PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {},
    "tokenizer_file": {},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {}


class BaichuanTokenizer(PreTrainedTokenizer):
    """
    Construct a Baichuan tokenizer. Based on byte-level Byte-Pair-Encoding.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file,
        unk_token="<unk>",
        bos_token="<s>",
        eos_token="</s>",
        pad_token=None,
        sp_model_kwargs: Optional[Dict[str, Any]] = None,
        add_bos_token=True,
        add_eos_token=False,
        clean_up_tokenization_spaces=False,
        **kwargs,
    ):
        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
        bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
        eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
        unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
        pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
        super().__init__(
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            pad_token=pad_token,
            add_bos_token=add_bos_token,
            add_eos_token=add_eos_token,
            sp_model_kwargs=self.sp_model_kwargs,
            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
            **kwargs,
        )
        self.vocab_file = vocab_file
        self.add_bos_token = add_bos_token
        self.add_eos_token = add_eos_token
        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        self.sp_model.Load(vocab_file)

    def __getstate__(self):
        state = self.__dict__.copy()
        state["sp_model"] = None
        return state

    def __setstate__(self, d):
        self.__dict__ = d
        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        self.sp_model.Load(self.vocab_file)

    @property
    def vocab_size(self):
        """Returns vocab size"""
        return self.sp_model.get_piece_size()

    def get_vocab(self):
        """Returns vocab as a dict"""
        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
        vocab.update(self.added_tokens_encoder)
        return vocab

    def _tokenize(self, text):
        """Returns a tokenized string."""
        return self.sp_model.encode(text, out_type=str)

    def _convert_token_to_id(self, token):
        """Converts a token (str) to an id using the vocab."""
        return self.sp_model.piece_to_id(token)

    def _convert_id_to_token(self, index):
        """Converts an index (integer) to a token (str) using the vocab."""
        token = self.sp_model.IdToPiece(index)
        return token

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (strings) into a single string."""
        current_sub_tokens = []
        out_string = ""
        prev_is_special = False
        for i, token in enumerate(tokens):
            # make sure that special tokens are not decoded using sentencepiece model
            if token in self.all_special_tokens:
                if not prev_is_special and i != 0:
                    out_string += " "
                out_string += self.sp_model.decode(current_sub_tokens) + token
                prev_is_special = True
                current_sub_tokens = []
            else:
                current_sub_tokens.append(token)
                prev_is_special = False
        out_string += self.sp_model.decode(current_sub_tokens)
        return out_string

    def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None) -> Tuple[str]:
        """
        Save the vocabulary and special tokens file to a directory.

        Args:
            save_directory (`str`):
                The directory in which to save the vocabulary.

        Returns:
            `Tuple(str)`: Paths to the files saved.
        """
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return
        out_vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )

        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
            copyfile(self.vocab_file, out_vocab_file)
        elif not os.path.isfile(self.vocab_file):
            with open(out_vocab_file, "wb") as fi:
                content_spiece_model = self.sp_model.serialized_model_proto()
                fi.write(content_spiece_model)

        return (out_vocab_file,)

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        bos_token_id = [self.bos_token_id] if self.add_bos_token else []
        eos_token_id = [self.eos_token_id] if self.add_eos_token else []

        output = bos_token_id + token_ids_0 + eos_token_id

        if token_ids_1 is not None:
            output = output + bos_token_id + token_ids_1 + eos_token_id

        return output

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )

        bos_token_id = [1] if self.add_bos_token else []
        eos_token_id = [1] if self.add_eos_token else []

        if token_ids_1 is None:
            return bos_token_id + ([0] * len(token_ids_0)) + eos_token_id
        return (
            bos_token_id
            + ([0] * len(token_ids_0))
            + eos_token_id
            + bos_token_id
            + ([0] * len(token_ids_1))
            + eos_token_id
        )

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Creates a mask from the two sequences passed to be used in a sequence-pair classification task. An ALBERT
        sequence pair mask has the following format:

        ```
        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
        | first sequence    | second sequence |
        ```

        if token_ids_1 is None, only returns the first portion of the mask (0s).

        Args:
            token_ids_0 (`List[int]`):
                List of ids.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
        """
        bos_token_id = [self.bos_token_id] if self.add_bos_token else []
        eos_token_id = [self.eos_token_id] if self.add_eos_token else []

        output = [0] * len(bos_token_id + token_ids_0 + eos_token_id)

        if token_ids_1 is not None:
            output += [1] * len(bos_token_id + token_ids_1 + eos_token_id)

        return output
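A short sketch of the tokenizer round-trip that generate() relies on, assuming the checkpoint directory contains the sentencepiece file tokenizer.model:

```python
from lyra_baichuan.tokenization_baichuan import BaichuanTokenizer

tok = BaichuanTokenizer.from_pretrained("./models/Baichuan2-13B-lyra")  # assumed checkpoint path

ids = tok("登鹳雀楼->王之涣", return_tensors="pt").input_ids
print(ids[0, 0].item() == tok.bos_token_id)  # add_bos_token=True prepends <s>

print(tok.batch_decode(ids, skip_special_tokens=True))  # decode back to text
```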
requirements.txt ADDED
@@ -0,0 +1,6 @@
transformers
numpy
setuptools
torch
bfloat16
sentencepiece