zai-org
/

chatglm-6b

@@ -1,8 +1,15 @@
 """ ChatGLM model configuration """
 from transformers.configuration_utils import PretrainedConfig
 from transformers.utils import logging
 logger = logging.get_logger(__name__)
@@ -101,3 +108,306 @@ class ChatGLMConfig(PretrainedConfig):
             eos_token_id=eos_token_id,
             **kwargs
         )

 """ ChatGLM model configuration """
+import torch
+from collections import OrderedDict
+from typing import List, Mapping, Optional, Any
 from transformers.configuration_utils import PretrainedConfig
 from transformers.utils import logging
+from transformers.onnx import OnnxConfigWithPast, PatchingSpec
+from transformers import PreTrainedTokenizer, TensorType, is_torch_available
 logger = logging.get_logger(__name__)
             eos_token_id=eos_token_id,
             **kwargs
         )
+class ChatGLMOnnxConfig(OnnxConfigWithPast):
+    r"""
+    This class is the custom configuration of a ChatGLMModel needed in exporting model to ONNX.
+    Currently this requires patching several parts of the model structure in modeling_chatglm.py first.
+    Also there is still a TODO list of current ChatGLMOnnxConfig:
+    1. add support for batch_size > 1
+    2. add support for use_past
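+    In modeling_chatglm.py, inside the attention_fn function, several `view` calls need to be replaced
+    with shape-agnostic tensor operations (`flatten`, `unsqueeze`), since the size arguments passed to
+    `view` may get frozen into constants in the exported ONNX model.
+    Here is the patched code: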
+    ```python
+    >>> def attention_fn(
+    >>>         self,
+    >>>         query_layer,
+    >>>         key_layer,
+    >>>         value_layer,
+    >>>         attention_mask,
+    >>>         hidden_size_per_partition,
+    >>>         layer_id,
+    >>>         layer_past=None,
+    >>>         scaling_attention_score=True,
+    >>>         use_cache=False,
+    >>> ):
+    >>>     if layer_past is not None:
+    >>>         past_key, past_value = layer_past[0], layer_past[1]
+    >>>         key_layer = torch.cat((past_key, key_layer), dim=0)
+    >>>         value_layer = torch.cat((past_value, value_layer), dim=0)
+    >>>
+    >>>     # seqlen, batch, num_attention_heads, hidden_size_per_attention_head
+    >>>     seq_len, b, nh, hidden_size = key_layer.shape
+    >>>
+    >>>     if use_cache:
+    >>>         present = (key_layer, value_layer)
+    >>>     else:
+    >>>         present = None
+    >>>
+    >>>     query_key_layer_scaling_coeff = float(layer_id + 1)
+    >>>     if scaling_attention_score:
+    >>>         query_layer = query_layer / (math.sqrt(hidden_size) * query_key_layer_scaling_coeff)
+    >>>
+    >>>     # ===================================
+    >>>     # Raw attention scores. [b, np, s, s]
+    >>>     # ===================================
+    >>>
+    >>>     # [b, np, sq, sk]
+    >>>     # # output_size = (query_layer.size(1), query_layer.size(2), query_layer.size(0), key_layer.size(0))
+    >>>
+    >>>     # [sq, b, np, hn] -> [sq, b * np, hn]
+    >>>     # query_layer = query_layer.view(output_size[2], output_size[0] * output_size[1], -1)
+    >>>     query_layer = query_layer.flatten(start_dim=1, end_dim=2)
+    >>>
+    >>>     # [sk, b, np, hn] -> [sk, b * np, hn]
+    >>>     # key_layer = key_layer.view(output_size[3], output_size[0] * output_size[1], -1)
+    >>>     key_layer = key_layer.flatten(start_dim=1, end_dim=2)
+    >>>
+    >>>     matmul_result = torch.zeros(
+    >>>         1, 1, 1,
+    >>>         dtype=query_layer.dtype,
+    >>>         device=query_layer.device,
+    >>>     )
+    >>>
+    >>>     matmul_result = torch.baddbmm(
+    >>>         matmul_result,
+    >>>         query_layer.transpose(0, 1),  # [b * np, sq, hn]
+    >>>         key_layer.transpose(0, 1).transpose(1, 2),  # [b * np, hn, sk]
+    >>>         beta=0.0,
+    >>>         alpha=1.0,
+    >>>     )
+    >>>
+    >>>     # [b * np, sq, sk] -> [b, np, sq, sk]
+    >>>     # attention_scores = matmul_result.view(*output_size)
+    >>>     attention_scores = matmul_result.unsqueeze(0)
+    >>>
+    >>>     if self.scale_mask_softmax:
+    >>>         self.scale_mask_softmax.scale = query_key_layer_scaling_coeff
+    >>>         attention_probs = self.scale_mask_softmax(attention_scores, attention_mask.contiguous())
+    >>>     else:
+    >>>         # if not (attention_mask == 0).all():
+    >>>         #     # if auto-regressive, skip
+    >>>         attention_scores.masked_fill_(attention_mask, -10000.0)
+    >>>         dtype = attention_scores.dtype
+    >>>         attention_scores = attention_scores.float()
+    >>>         attention_scores = attention_scores * query_key_layer_scaling_coeff
+    >>>
+    >>>         attention_probs = F.softmax(attention_scores, dim=-1)
+    >>>
+    >>>         attention_probs = attention_probs.type(dtype)
+    >>>
+    >>>     # =========================
+    >>>     # Context layer. [sq, b, hp]
+    >>>     # =========================
+    >>>
+    >>>     # value_layer -> context layer.
+    >>>     # [sk, b, np, hn] --> [b, np, sq, hn]
+    >>>
+    >>>     # context layer shape: [b, np, sq, hn]
+    >>>     # output_size = (value_layer.size(1), value_layer.size(2), query_layer.size(0), value_layer.size(3))
+    >>>
+    >>>     # change view [sk, b * np, hn]
+    >>>     # value_layer = value_layer.view(value_layer.size(0), output_size[0] * output_size[1], -1)
+    >>>     value_layer = value_layer.flatten(start_dim=1, end_dim=2)
+    >>>
+    >>>     # change view [b * np, sq, sk]
+    >>>     # attention_probs = attention_probs.view(output_size[0] * output_size[1], output_size[2], -1)
+    >>>     attention_probs = attention_probs.flatten(start_dim=0, end_dim=1)
+    >>>
+    >>>     # matmul: [b * np, sq, hn]
+    >>>     context_layer = torch.bmm(attention_probs, value_layer.transpose(0, 1))
+    >>>
+    >>>     # change view [b, np, sq, hn]
+    >>>     # context_layer = context_layer.reshape(b, np, sq, hidden_size)
+    >>>     context_layer = context_layer.unsqueeze(0)
+    >>>
+    >>>     # [b, np, sq, hn] --> [sq, b, np, hn]
+    >>>     context_layer = context_layer.permute(2, 0, 1, 3).contiguous()
+    >>>
+    >>>     # [sq, b, np, hn] --> [sq, b, hp]
+    >>>     # new_context_layer_shape = context_layer.size()[:-2] + (hidden_size_per_partition,)
+    >>>     # context_layer = context_layer.view(*new_context_layer_shape)
+    >>>     context_layer = context_layer.flatten(start_dim=2)
+    >>>
+    >>>     outputs = (context_layer, present, attention_probs)
+    >>>
+    >>>     return outputs
+    ```
+    The main point is to avoid using `view` with dynamic sizes.
+    After changing modeling_chatglm.py, you can simply use the following code to export and test the ONNX model:
+    ```python
+    >>> from pathlib import Path
+    >>> from transformers import AutoTokenizer, AutoModel
+    >>> from transformers.onnx import export, validate_model_outputs
+    >>>
+    >>> # load model
+    >>> tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True)
+    >>> pt_model = AutoModel.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True)
+    >>> pt_model = pt_model.float()  # only tested on CPU for now
+    >>> pt_model.eval()
+    >>> # define path for saving onnx model
+    >>> onnx_path = Path(f"model/chatglm-6b.onnx")
+    >>> onnx_path.parent.mkdir(exist_ok=True)
+    >>> # convert model to onnx
+    >>> onnx_config_chatglm = ChatGLMOnnxConfig(pt_model.config, task="causal-lm")
+    >>> onnx_inputs, onnx_outputs = export(tokenizer, pt_model,
+    >>>                                    onnx_config_chatglm, onnx_config_chatglm.default_onnx_opset,
+    >>>                                    onnx_path)
+    >>> # test onnx model
+    >>> validate_model_outputs(onnx_config_chatglm, tokenizer, pt_model, onnx_path, onnx_outputs, atol=1e-4)
+    ```
+    """
+    # TODO support dynamic batch size
+    # Batch size that generate_dummy_inputs always requests for the dummy export inputs,
+    # regardless of the batch_size argument it receives (it is also that argument's default).
+    default_fixed_batch = 1
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        task: str = "default",
+        patching_specs: List[PatchingSpec] = None,
+        use_past: bool = False,
+    ):
+        super().__init__(config, task=task, patching_specs=patching_specs, use_past=use_past)
+    @property
+    def inputs(self) -> Mapping[str, Mapping[int, str]]:
+        common_inputs = OrderedDict({"input_ids": {0: "batch", 1: "sequence"}})
+        if self.use_past:
+            # TODO support use_past
+            # self.fill_with_past_key_values_(common_inputs, direction="inputs")
+            # common_inputs["attention_mask"] = \
+            #     {0: "batch", 1: "past_sequence + sequence", 2: "past_sequence + sequence"}
+            raise NotImplementedError('position_ids do not support past_key_values yet.')
+        else:
+            # remind the order
+            common_inputs["position_ids"] = {0: "batch", 2: "sequence"}
+            common_inputs["attention_mask"] = {0: "batch", 2: "sequence", 3: "sequence"}
+        return common_inputs
+    @property
+    def num_layers(self) -> int:
+        return self._config.n_layer
+    @property
+    def num_attention_heads(self) -> int:
+        return self._config.n_head
+    def get_masks(self, input_ids, device=None):
+        """
+        reference from modeling_chatglm.get_masks
+        """
+        batch_size, seq_length = input_ids.shape
+        context_lengths = [seq.tolist().index(self._config.bos_token_id) for seq in input_ids]
+        if device:
+            attention_mask = torch.ones((batch_size, seq_length, seq_length), device=device)
+        else:
+            attention_mask = torch.ones((batch_size, seq_length, seq_length), device=input_ids.device)
+        attention_mask.tril_()
+        for i, context_length in enumerate(context_lengths):
+            attention_mask[i, :, :context_length] = 1
+        attention_mask.unsqueeze_(1)
+        attention_mask = (attention_mask < 0.5).bool()
+        # print("attention_mask", attention_mask.shape)
+        return attention_mask
+    def get_position_ids(self, input_ids, mask_positions, device=None, use_gmasks=None):
+        batch_size, seq_length = input_ids.shape
+        if device is None:
+            device = input_ids.device
+        if use_gmasks is None:
+            use_gmasks = [False] * batch_size
+        context_lengths = [seq.tolist().index(self._config.bos_token_id) for seq in input_ids]
+        if self._config.position_encoding_2d:
+            position_ids = torch.arange(seq_length, dtype=torch.long, device=device).unsqueeze(0).repeat(batch_size, 1)
+            for i, context_length in enumerate(context_lengths):
+                position_ids[i, context_length:] = mask_positions[i]
+            block_position_ids = [torch.cat((
+                torch.zeros(context_length, dtype=torch.long, device=device),
+                torch.arange(seq_length - context_length, dtype=torch.long, device=device) + 1
+            )) for context_length in context_lengths]
+            block_position_ids = torch.stack(block_position_ids, dim=0)
+            position_ids = torch.stack((position_ids, block_position_ids), dim=1)
+        else:
+            position_ids = torch.arange(seq_length, dtype=torch.long, device=device).unsqueeze(0).repeat(batch_size, 1)
+            for i, context_length in enumerate(context_lengths):
+                if not use_gmasks[i]:
+                    position_ids[context_length:] = mask_positions[i]
+        # print("position_ids", position_ids.shape)
+        return position_ids
+    def generate_dummy_inputs(
+        self,
+        tokenizer: PreTrainedTokenizer,
+        batch_size: int = default_fixed_batch,
+        seq_length: int = -1,
+        is_pair: bool = False,
+        framework: Optional[TensorType] = None,
+    ) -> Mapping[str, Any]:
+        common_inputs = super(OnnxConfigWithPast, self).generate_dummy_inputs(
+            tokenizer, batch_size=self.default_fixed_batch, seq_length=seq_length, is_pair=is_pair, framework=framework
+        )
+        # check if the mode is using fixed batch size
+        if batch_size != self.default_fixed_batch:
+            logger.warning('batch size is not fixed, force change into fixed batch size: %d.'
+                           % self.default_fixed_batch)
+        # We need to order the input in the way they appears in the forward()
+        ordered_inputs = OrderedDict({"input_ids": common_inputs["input_ids"]})
+        # Need to add the past_keys
+        if self.use_past:
+            if not is_torch_available():
+                raise ValueError("Cannot generate dummy past_keys inputs without PyTorch installed.")
+            else:
+                # TODO support use_past
+                # import torch
+                #
+                # batch, seqlen = common_inputs["input_ids"].shape
+                # # Not using the same length for past_key_values
+                # past_key_values_length = seqlen + 2
+                # past_shape = (
+                #     batch,
+                #     self.num_attention_heads,
+                #     past_key_values_length,
+                #     self._config.hidden_size // self.num_attention_heads,
+                # )
+                # ordered_inputs["past_key_values"] = [
+                #     (torch.zeros(past_shape), torch.zeros(past_shape)) for _ in range(self.num_layers)
+                # ]
+                raise NotImplementedError('position_ids do not support past_key_values yet.')
+        # Need to add the attention_mask manually
+        # 1. add attention_mask
+        ordered_inputs["attention_mask"] = self.get_masks(common_inputs["input_ids"])
+        # 2. add position_ids
+        MASK, gMASK = self._config.mask_token_id, self._config.gmask_token_id
+        seqs = common_inputs["input_ids"].tolist()
+        mask_positions, use_gmasks = [], []
+        for seq in seqs:
+            mask_token = gMASK if gMASK in seq else MASK
+            use_gmask = mask_token == gMASK
+            mask_positions.append(seq.index(mask_token))
+            use_gmasks.append(use_gmask)
+        ordered_inputs["position_ids"] = self.get_position_ids(common_inputs["input_ids"],
+                                                               mask_positions, use_gmasks=use_gmasks)
+        if self.use_past:
+            # mask_dtype = ordered_inputs["attention_mask"].dtype
+            # ordered_inputs["attention_mask"] = torch.cat(
+            #     [ordered_inputs["attention_mask"], torch.ones(batch, past_key_values_length, dtype=mask_dtype)], dim=1
+            # )
+            raise NotImplementedError('position_ids do not support past_key_values yet.')
+        return ordered_inputs
+    @property
+    def default_onnx_opset(self) -> int:
+        return 13