Commit f243561
Parent(s): 4a9b711

support multi gpu and remove some bug

1. Support multi-GPU training, which did not work before (the main change is fixing the device id of several variables).
2. With multiple GPUs running in parallel, a `TypeError: 'NoneType' object is not subscriptable` was raised, caused mainly by the `forward` of `RotaryEmbedding`.
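The device-id fixes in this commit all follow one pattern, sketched below. This is a minimal, self-contained illustration, not code from the diff (the `run_layer` helper and its toy shapes are made up): tensors such as `position_ids` and `attention_mask` are created on one GPU while the consuming layer's weights may sit on another, so both the auxiliary tensors and the module are moved explicitly before use.

```python
import torch
from torch import nn

def run_layer(layer: nn.Linear, hidden_states: torch.Tensor,
              attention_mask: torch.Tensor) -> torch.Tensor:
    device = hidden_states.device
    # Auxiliary tensors follow the activations, as in
    # `attention_mask.to(query_layer.device)` in the diff below.
    attention_mask = attention_mask.to(device)
    # Module weights follow their input, as in
    # `self.query_key_value.to(device=hidden_states.device)(hidden_states)`.
    scores = layer.to(device)(hidden_states)
    return scores.masked_fill(attention_mask.bool(), -10000.0)

# Toy shapes on CPU; in a multi-GPU run the two inputs would start on
# different devices, and this alignment is what avoids the crash.
hidden = torch.randn(4, 8)
mask = torch.zeros(4, 8)
print(run_layer(nn.Linear(8, 8), hidden, mask).shape)  # torch.Size([4, 8])
```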
modeling_chatglm.py  CHANGED  (+252 -307)
@@ -3,8 +3,6 @@
 import math
 import copy
 import os
-import warnings
-import re
 import sys
 
 import torch
@@ -13,7 +11,7 @@ import torch.nn.functional as F
 from torch import nn
 from torch.nn import CrossEntropyLoss, LayerNorm
 from torch.nn.utils import skip_init
-from typing import Optional, Tuple, Union, List, Callable
+from typing import Optional, Tuple, Union, List
 
 from transformers.utils import (
     add_code_sample_docstrings,
@@ -26,20 +24,17 @@ from transformers.modeling_outputs import (
     BaseModelOutputWithPastAndCrossAttentions,
 )
 from transformers.modeling_utils import PreTrainedModel
-from transformers.utils import logging
-from transformers.generation.logits_process import LogitsProcessor
-from transformers.generation.utils import LogitsProcessorList, StoppingCriteriaList, GenerationConfig
 
+from transformers.utils import logging
 from .configuration_chatglm import ChatGLMConfig
 
-# flags required to enable jit fusion kernels
-
 if sys.platform != 'darwin':
     torch._C._jit_set_profiling_mode(False)
     torch._C._jit_set_profiling_executor(False)
     torch._C._jit_override_can_fuse_on_cpu(True)
     torch._C._jit_override_can_fuse_on_gpu(True)
 
+
 logger = logging.get_logger(__name__)
 
 _CHECKPOINT_FOR_DOC = "THUDM/ChatGLM-6B"
@@ -51,14 +46,6 @@ CHATGLM_6B_PRETRAINED_MODEL_ARCHIVE_LIST = [
 ]
 
 
-class InvalidScoreLogitsProcessor(LogitsProcessor):
-    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
-        if torch.isnan(scores).any() or torch.isinf(scores).any():
-            scores.zero_()
-            scores[..., 20005] = 5e4
-        return scores
-
-
 def load_tf_weights_in_chatglm_6b(model, config, tf_checkpoint_path):
     """Load tf checkpoints in a pytorch model."""
     try:
@@ -153,7 +140,7 @@ class RotaryEmbedding(torch.nn.Module):
         if learnable:
             self.inv_freq = torch.nn.Parameter(inv_freq)
             self.max_seq_len_cached = None
-        else:
+        else:
             self.register_buffer('inv_freq', inv_freq)
             self.max_seq_len_cached = None
             self.cos_cached = None
@@ -169,22 +156,24 @@ class RotaryEmbedding(torch.nn.Module):
         seq_len = x.shape[seq_dim]
         if self.max_seq_len_cached is None or (seq_len > self.max_seq_len_cached):
             self.max_seq_len_cached = None if self.learnable else seq_len
-            t = torch.arange(seq_len, device=x.device, dtype=self.inv_freq.dtype)
-            freqs = torch.einsum('i,j->ij', t, self.inv_freq)
-            # Different from paper, but it uses a different permutation in order to obtain the same calculation
-            emb = torch.cat((freqs, freqs), dim=-1).to(x.device)
-            if self.precision == torch.bfloat16:
-                emb = emb.float()
-
-            # [sx, 1 (b * np), hn]
-            cos_cached = emb.cos()[:, None, :]
-            sin_cached = emb.sin()[:, None, :]
-            if self.precision == torch.bfloat16:
-                cos_cached = cos_cached.bfloat16()
-                sin_cached = sin_cached.bfloat16()
-            if self.learnable:
-                return cos_cached, sin_cached
-            self.cos_cached, self.sin_cached = cos_cached, sin_cached
+
+            t = torch.arange(seq_len, device=x.device, dtype=self.inv_freq.dtype)
+            freqs = torch.einsum('i,j->ij', t, self.inv_freq)
+            # Different from paper, but it uses a different permutation in order to obtain the same calculation
+            emb = torch.cat((freqs, freqs), dim=-1).to(x.device)
+            if self.precision == torch.bfloat16:
+                emb = emb.float()
+
+            # [sx, 1 (b * np), hn]
+            cos_cached = emb.cos()[:, None, :]
+            sin_cached = emb.sin()[:, None, :]
+            if self.precision == torch.bfloat16:
+                cos_cached = cos_cached.bfloat16()
+                sin_cached = sin_cached.bfloat16()
+            if self.learnable:
+                return cos_cached, sin_cached
+            self.cos_cached, self.sin_cached = cos_cached, sin_cached
+
         return self.cos_cached[:seq_len, ...], self.sin_cached[:seq_len, ...]
@@ -202,114 +191,6 @@ def apply_rotary_pos_emb_index(q, k, cos, sin, position_id):
     return q, k
 
 
-def attention_fn(
-        self,
-        query_layer,
-        key_layer,
-        value_layer,
-        attention_mask,
-        hidden_size_per_partition,
-        layer_id,
-        layer_past=None,
-        scaling_attention_score=True,
-        use_cache=False,
-):
-    if layer_past is not None:
-        past_key, past_value = layer_past
-        key_layer = torch.cat((past_key, key_layer), dim=0)
-        value_layer = torch.cat((past_value, value_layer), dim=0)
-
-    # seqlen, batch, num_attention_heads, hidden_size_per_attention_head
-    seq_len, b, nh, hidden_size = key_layer.shape
-
-    if use_cache:
-        present = (key_layer, value_layer)
-    else:
-        present = None
-
-    query_key_layer_scaling_coeff = float(layer_id + 1)
-    if scaling_attention_score:
-        query_layer = query_layer / (math.sqrt(hidden_size) * query_key_layer_scaling_coeff)
-
-    # ===================================
-    # Raw attention scores. [b, np, s, s]
-    # ===================================
-
-    # [b, np, sq, sk]
-    output_size = (query_layer.size(1), query_layer.size(2), query_layer.size(0), key_layer.size(0))
-
-    # [sq, b, np, hn] -> [sq, b * np, hn]
-    query_layer = query_layer.view(output_size[2], output_size[0] * output_size[1], -1)
-    # [sk, b, np, hn] -> [sk, b * np, hn]
-    key_layer = key_layer.view(output_size[3], output_size[0] * output_size[1], -1)
-
-    matmul_result = torch.empty(
-        output_size[0] * output_size[1],
-        output_size[2],
-        output_size[3],
-        dtype=query_layer.dtype,
-        device=query_layer.device,
-    )
-
-    matmul_result = torch.baddbmm(
-        matmul_result,
-        query_layer.transpose(0, 1),  # [b * np, sq, hn]
-        key_layer.transpose(0, 1).transpose(1, 2),  # [b * np, hn, sk]
-        beta=0.0,
-        alpha=1.0,
-    )
-
-    # change view to [b, np, sq, sk]
-    attention_scores = matmul_result.view(*output_size)
-
-    if self.scale_mask_softmax:
-        self.scale_mask_softmax.scale = query_key_layer_scaling_coeff
-        attention_probs = self.scale_mask_softmax(attention_scores, attention_mask.contiguous())
-    else:
-        if not (attention_mask == 0).all():
-            # if auto-regressive, skip
-            attention_scores.masked_fill_(attention_mask, -10000.0)
-        dtype = attention_scores.dtype
-        attention_scores = attention_scores.float()
-        attention_scores = attention_scores * query_key_layer_scaling_coeff
-
-        attention_probs = F.softmax(attention_scores, dim=-1)
-
-        attention_probs = attention_probs.type(dtype)
-
-    # =========================
-    # Context layer. [sq, b, hp]
-    # =========================
-
-    # value_layer -> context layer.
-    # [sk, b, np, hn] --> [b, np, sq, hn]
-
-    # context layer shape: [b, np, sq, hn]
-    output_size = (value_layer.size(1), value_layer.size(2), query_layer.size(0), value_layer.size(3))
-
-    # change view [sk, b * np, hn]
-    value_layer = value_layer.view(value_layer.size(0), output_size[0] * output_size[1], -1)
-
-    # change view [b * np, sq, sk]
-    attention_probs = attention_probs.view(output_size[0] * output_size[1], output_size[2], -1)
-
-    # matmul: [b * np, sq, hn]
-    context_layer = torch.bmm(attention_probs, value_layer.transpose(0, 1))
-
-    # change view [b, np, sq, hn]
-    context_layer = context_layer.view(*output_size)
-
-    # [b, np, sq, hn] --> [sq, b, np, hn]
-    context_layer = context_layer.permute(2, 0, 1, 3).contiguous()
-
-    # [sq, b, np, hn] --> [sq, b, hp]
-    new_context_layer_shape = context_layer.size()[:-2] + (hidden_size_per_partition,)
-    context_layer = context_layer.view(*new_context_layer_shape)
-
-    outputs = (context_layer, present, attention_probs)
-
-    return outputs
-
 
 class SelfAttention(torch.nn.Module):
     def __init__(self, hidden_size, num_attention_heads,
@@ -399,7 +280,7 @@ class SelfAttention(torch.nn.Module):
         """
 
         # [seq_len, batch, 3 * hidden_size]
-        mixed_raw_layer = self.query_key_value(hidden_states)
+        mixed_raw_layer = self.query_key_value.to(device=hidden_states.device)(hidden_states)
 
         # [seq_len, batch, 3 * hidden_size] --> [seq_len, batch, num_attention_heads, 3 * hidden_size_per_attention_head]
         new_tensor_shape = mixed_raw_layer.size()[:-1] + (
@@ -414,6 +295,7 @@ class SelfAttention(torch.nn.Module):
         if self.position_encoding_2d:
             q1, q2 = query_layer.chunk(2, dim=(query_layer.ndim - 1))
             k1, k2 = key_layer.chunk(2, dim=(key_layer.ndim - 1))
+            position_ids = position_ids.to(q1.device)
             cos, sin = self.rotary_emb(q1, seq_len=position_ids.max() + 1)
             position_ids, block_position_ids = position_ids[:, 0, :].transpose(0, 1).contiguous(), \
                 position_ids[:, 1, :].transpose(0, 1).contiguous()
@@ -423,22 +305,25 @@ class SelfAttention(torch.nn.Module):
             key_layer = torch.concat([k1, k2], dim=(k1.ndim - 1))
         else:
             position_ids = position_ids.transpose(0, 1)
+            position_ids = position_ids.to(value_layer.device)
             cos, sin = self.rotary_emb(value_layer, seq_len=position_ids.max() + 1)
             # [seq_len, batch, num_attention_heads, hidden_size_per_attention_head]
             query_layer, key_layer = apply_rotary_pos_emb_index(query_layer, key_layer, cos, sin, position_ids)
 
         # [seq_len, batch, hidden_size]
-        context_layer, present, attention_probs = attention_fn(
-            self=self,
+        context_layer, present, attention_probs = self.attention_fn(
             query_layer=query_layer,
             key_layer=key_layer,
             value_layer=value_layer,
-            attention_mask=attention_mask,
+            attention_mask=attention_mask.to(query_layer.device),
             hidden_size_per_partition=self.hidden_size_per_partition,
             layer_id=layer_id,
             layer_past=layer_past,
             use_cache=use_cache
         )
+        # print("*"*80)
+        # print(f"{context_layer.device = }")
+        # print(f"{self.dense.weight.device = }")
 
         output = self.dense(context_layer)
 
|
|
| 449 |
|
| 450 |
return outputs # output, present, attention_probs
|
| 451 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 452 |
|
| 453 |
class GEGLU(torch.nn.Module):
|
| 454 |
def __init__(self):
|
|
@@ -614,7 +611,8 @@ class ChatGLMPreTrainedModel(PreTrainedModel):
     a simple interface for downloading and loading pretrained models.
     """
 
-    is_parallelizable = False
+    is_parallelizable = True
+    model_parallel = False
     supports_gradient_checkpointing = False
     config_class = ChatGLMConfig
     base_model_prefix = "transformer"
@@ -724,6 +722,8 @@ class ChatGLMModel(ChatGLMPreTrainedModel):
         self.hidden_size_per_attention_head = self.hidden_size // self.num_attention_heads
         self.position_encoding_2d = config.position_encoding_2d
 
+        self.gradient_checkpointing = True  # on by default, to save GPU memory
+
         self.word_embeddings = skip_init(
             torch.nn.Embedding,
             num_embeddings=self.vocab_size, embedding_dim=self.hidden_size,
@@ -757,8 +757,9 @@ class ChatGLMModel(ChatGLMPreTrainedModel):
     def set_input_embeddings(self, new_embeddings: torch.Tensor):
         self.word_embeddings = new_embeddings
 
-    def get_masks(self, seq, device):
-        context_length = seq.index(150004) + 1
+    @staticmethod
+    def get_masks(seq, device):
+        context_length = seq.index(150004) + 1
 
         attention_mask = torch.ones((1, len(seq), len(seq)), device=device)
         attention_mask.tril_()
@@ -769,9 +770,9 @@ class ChatGLMModel(ChatGLMPreTrainedModel):
         return attention_mask
 
     def get_position_ids(self, seq, mask_position, device, gmask=False):
-        context_length = seq.index(150004) + 1
+        context_length = seq.index(150004) + 1
         if self.position_encoding_2d:
-            seq_length = seq.index(150004)
+            seq_length = seq.index(150004)
             position_ids = torch.arange(context_length, dtype=torch.long, device=device)
             if not gmask:
                 position_ids[seq_length:] = mask_position
@@ -826,8 +827,14 @@ class ChatGLMModel(ChatGLMPreTrainedModel):
 
         if past_key_values is None:
             past_key_values = tuple([None] * len(self.layers))
+
+        MASK, gMASK = 150000, 150001
+        mask_token = MASK if MASK in input_ids else gMASK
+        use_gmask = False if MASK in input_ids else gMASK
         seq = input_ids[0].tolist()
 
+        mask_position = seq.index(mask_token)
+
         if attention_mask is None:
             attention_mask = self.get_masks(
                 seq=seq,
@@ -835,11 +842,6 @@ class ChatGLMModel(ChatGLMPreTrainedModel):
         )
 
         if position_ids is None:
-            MASK, gMASK = 150000, 150001
-            mask_token = MASK if MASK in input_ids else gMASK
-            use_gmask = False if MASK in input_ids else gMASK
-
-            mask_position = seq.index(mask_token)
             position_ids = self.get_position_ids(
                 seq=seq,
                 mask_position=mask_position,
@@ -848,15 +850,28 @@ class ChatGLMModel(ChatGLMPreTrainedModel):
         )
 
         if inputs_embeds is None:
-            inputs_embeds = self.word_embeddings(input_ids)
+            # print("*"*80)
+            # print(f"{input_ids.device = }")
+            # print(f"{self.word_embeddings.weight.device = }")
+            inputs_embeds = self.word_embeddings(input_ids.to(self.word_embeddings.weight.device))
 
         # [seq_len, batch, hidden_size]
         hidden_states = inputs_embeds.transpose(0, 1)
 
+
+        if self.gradient_checkpointing and self.training:
+            if use_cache:
+                logger.warning_once(
+                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+                )
+                use_cache = False
+
         presents = () if use_cache else None
         all_self_attentions = () if output_attentions else None
         all_hidden_states = () if output_hidden_states else None
 
+
+
         seq_length_with_past = seq_length
         past_key_values_length = 0
         if past_key_values[0] is not None:
@@ -873,15 +888,39 @@ class ChatGLMModel(ChatGLMPreTrainedModel):
             if output_hidden_states:
                 all_hidden_states = all_hidden_states + (hidden_states,)
 
-            layer_ret = layer(
-                hidden_states,
-                position_ids=position_ids,
-                attention_mask=attention_mask,
-                layer_id=torch.tensor(i),
-                layer_past=past_key_values[i],
-                use_cache=use_cache,
-                output_attentions=output_attentions
-            )
+            if self.gradient_checkpointing and self.training:
+                # https://mathpretty.com/11156.html
+                use_cache = False
+                def create_custom_forward(module):
+                    def custom_forward(*inputs):
+                        # None for past_key_value
+                        return module(*inputs, use_cache, output_attentions)
+
+                    return custom_forward
+
+                layer_ret = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(layer),
+                    # create_custom_forward(layer),
+                    hidden_states,
+                    position_ids,
+                    attention_mask,
+                    torch.ones(1, dtype=torch.float32, requires_grad=True) * i,
+                    # torch.tensor(i, requires_grad=True),
+                    past_key_values[i],
+
+                )
+
+            else:
+
+                layer_ret = layer(
+                    hidden_states,
+                    position_ids=position_ids,
+                    attention_mask=attention_mask,
+                    layer_id=torch.tensor(i),
+                    layer_past=past_key_values[i],
+                    use_cache=use_cache,
+                    output_attentions=output_attentions
+                )
 
             hidden_states = layer_ret[0]
 
@@ -928,6 +967,7 @@ class ChatGLMForConditionalGeneration(ChatGLMPreTrainedModel):
             bias=False,
             dtype=torch.half
         )
+        self.model_parallel = False
 
     def get_output_embeddings(self):
         return self.lm_head
@@ -943,7 +983,7 @@ class ChatGLMForConditionalGeneration(ChatGLMPreTrainedModel):
         attention_mask = (attention_mask < 0.5).bool()
 
         if self.position_encoding_2d:
-            seq_length = seq.index(150004)
+            seq_length = seq.index(150004)
             position_ids = torch.arange(context_length, dtype=torch.long, device=device)
             if not gmask:
                 position_ids[seq_length:] = mask_position
@@ -981,7 +1021,7 @@ class ChatGLMForConditionalGeneration(ChatGLMPreTrainedModel):
 
         # only last token for input_ids if past is not None
         if past is not None or past_key_values is not None:
-            context_length = seq.index(150004)
+            context_length = seq.index(150004)
             last_token = input_ids[:, -1].unsqueeze(-1)
             if self.position_encoding_2d:
                 position_ids = torch.tensor([[[mask_position], [len(seq) - context_length]]], dtype=torch.long,
@@ -1053,10 +1093,8 @@ class ChatGLMForConditionalGeneration(ChatGLMPreTrainedModel):
             shift_labels = labels[..., 1:].contiguous()
             # Flatten the tokens
             loss_fct = CrossEntropyLoss()
-            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
+            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)).to(shift_labels.device), shift_labels.view(-1))
 
-            lm_logits = lm_logits.to(hidden_states.dtype)
-            loss = loss.to(hidden_states.dtype)
 
         if not return_dict:
             output = (lm_logits,) + transformer_outputs[1:]
@@ -1089,31 +1127,13 @@ class ChatGLMForConditionalGeneration(ChatGLMPreTrainedModel):
                 for layer_past in past
             )
 
-    def process_response(self, response):
-        response = response.strip()
-        response = response.replace("[[训练时间]]", "2023年")
-        punkts = [
-            [",", ","],
-            ["!", "!"],
-            [":", ":"],
-            [";", ";"],
-            ["\?", "?"],
-        ]
-        for item in punkts:
-            response = re.sub(r"([\u4e00-\u9fff])%s" % item[0], r"\1%s" % item[1], response)
-            response = re.sub(r"%s([\u4e00-\u9fff])" % item[0], r"%s\1" % item[1], response)
-        return response
-
     @torch.no_grad()
     def chat(self, tokenizer, query: str, history: List[Tuple[str, str]] = None, max_length: int = 2048, num_beams=1,
-             do_sample=True, top_p=0.7, temperature=0.95, logits_processor=None, **kwargs):
+             do_sample=True, top_p=0.7, temperature=0.95, **kwargs):
         if history is None:
             history = []
-        if logits_processor is None:
-            logits_processor = LogitsProcessorList()
-        logits_processor.append(InvalidScoreLogitsProcessor())
         gen_kwargs = {"max_length": max_length, "num_beams": num_beams, "do_sample": do_sample, "top_p": top_p,
-                      "temperature": temperature, "logits_processor": logits_processor, **kwargs}
+                      "temperature": temperature, **kwargs}
         if not history:
             prompt = query
         else:
@@ -1124,139 +1144,64 @@ class ChatGLMForConditionalGeneration(ChatGLMPreTrainedModel):
         input_ids = tokenizer([prompt], return_tensors="pt", padding=True)
         input_ids = input_ids.to(self.device)
         outputs = self.generate(**input_ids, **gen_kwargs)
-        outputs = outputs.tolist()[0][len(input_ids["input_ids"][0]):]
+        outputs = outputs.tolist()[0][len(input_ids["input_ids"][0]) - 2:]
         response = tokenizer.decode(outputs)
-        response = self.process_response(response)
+        response = response.strip()
+        response = response.replace("[[训练时间]]", "2023年")
         history = history + [(query, response)]
         return response, history
 
     @torch.no_grad()
-    def stream_chat(self, tokenizer, query: str, history: List[Tuple[str, str]] = None, max_length: int = 2048,
-                    do_sample=True, top_p=0.7, temperature=0.95, logits_processor=None, **kwargs):
-        if history is None:
-            history = []
-        if logits_processor is None:
-            logits_processor = LogitsProcessorList()
-        logits_processor.append(InvalidScoreLogitsProcessor())
-        gen_kwargs = {"max_length": max_length, "do_sample": do_sample, "top_p": top_p,
-                      "temperature": temperature, "logits_processor": logits_processor, **kwargs}
-        if not history:
-            prompt = query
-        else:
-            prompt = ""
-            for i, (old_query, response) in enumerate(history):
-                prompt += "[Round {}]\n问:{}\n答:{}\n".format(i, old_query, response)
-            prompt += "[Round {}]\n问:{}\n答:".format(len(history), query)
-        input_ids = tokenizer([prompt], return_tensors="pt", padding=True)
-        input_ids = input_ids.to(self.device)
-        for outputs in self.stream_generate(**input_ids, **gen_kwargs):
-            outputs = outputs.tolist()[0][len(input_ids["input_ids"][0]):]
-            response = tokenizer.decode(outputs)
-            response = self.process_response(response)
-            new_history = history + [(query, response)]
-            yield response, new_history
-
-    @torch.no_grad()
-    def stream_generate(
+    def generate(
             self,
-            input_ids,
-            generation_config: Optional[GenerationConfig] = None,
-            logits_processor: Optional[LogitsProcessorList] = None,
-            stopping_criteria: Optional[StoppingCriteriaList] = None,
-            prefix_allowed_tokens_fn: Optional[Callable[[int, torch.Tensor], List[int]]] = None,
             **kwargs,
     ):
-        batch_size, input_ids_seq_length = input_ids.shape[0], input_ids.shape[-1]
-
-        if generation_config is None:
-            generation_config = self.generation_config
-        generation_config = copy.deepcopy(generation_config)
-        model_kwargs = generation_config.update(**kwargs)
-        bos_token_id, eos_token_id = generation_config.bos_token_id, generation_config.eos_token_id
-
-        if isinstance(eos_token_id, int):
-            eos_token_id = [eos_token_id]
-
-        has_default_max_length = kwargs.get("max_length") is None and generation_config.max_length is not None
-        if has_default_max_length and generation_config.max_new_tokens is None:
-            warnings.warn(
-                f"Using `max_length`'s default ({generation_config.max_length}) to control the generation length. "
-                "This behaviour is deprecated and will be removed from the config in v5 of Transformers -- we"
-                " recommend using `max_new_tokens` to control the maximum length of the generation.",
-                UserWarning,
-            )
-        elif generation_config.max_new_tokens is not None:
-            generation_config.max_length = generation_config.max_new_tokens + input_ids_seq_length
-            if not has_default_max_length:
-                logger.warn(
-                    f"Both `max_new_tokens` (={generation_config.max_new_tokens}) and `max_length`(="
-                    f"{generation_config.max_length}) seem to have been set. `max_new_tokens` will take precedence. "
-                    "Please refer to the documentation for more information. "
-                    "(https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)",
-                    UserWarning,
-                )
-
-        if input_ids_seq_length >= generation_config.max_length:
-            input_ids_string = "decoder_input_ids" if self.config.is_encoder_decoder else "input_ids"
-            logger.warning(
-                f"Input length of {input_ids_string} is {input_ids_seq_length}, but `max_length` is set to"
-                f" {generation_config.max_length}. This can lead to unexpected behavior. You should consider"
-                " increasing `max_new_tokens`."
-            )
+        MASK, gMASK = 150000, 150001
+        bos, eos = 150004, 150005
 
-
-        logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList()
-        stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList()
+        if "eos_token_id" not in kwargs:
+            kwargs["eos_token_id"] = eos
 
-        logits_processor = self._get_logits_processor(
-            generation_config=generation_config,
-            input_ids_seq_length=input_ids_seq_length,
-            encoder_input_ids=input_ids,
-            prefix_allowed_tokens_fn=prefix_allowed_tokens_fn,
-            logits_processor=logits_processor,
-        )
+        stop = False
 
-        stopping_criteria = self._get_stopping_criteria(
-            generation_config=generation_config, stopping_criteria=stopping_criteria
-        )
-        logits_warper = self._get_logits_warper(generation_config)
+        return_seqs = []
 
-        unfinished_sequences = input_ids.new(input_ids.shape[0]).fill_(1)
-        scores = None
         while True:
-            model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)
-            # forward pass to get next token
-            outputs = self(
-                **model_inputs,
-                return_dict=True,
-                output_attentions=False,
-                output_hidden_states=False,
-            )
-
-            next_token_logits = outputs.logits[:, -1, :]
-
-            # pre-process distribution
-            next_token_scores = logits_processor(input_ids, next_token_logits)
-            next_token_scores = logits_warper(input_ids, next_token_scores)
+            print(kwargs)
+            output_ids = super().generate(**kwargs)
+
+            return_seqs = []
+            max_length = 0
+
+            for i in range(output_ids.shape[0]):
+                output_seq = output_ids[i].tolist()
+                mask_token = MASK if MASK in output_seq else gMASK
+                mask_position = output_seq.index(mask_token)
+                bos_position = output_seq.index(bos)
+                if eos in output_seq:
+                    eos_position = output_seq.index(eos)
+                else:
+                    eos_position = len(output_seq)
+
+                return_seq = output_seq[:mask_position] + output_seq[bos_position + 1:eos_position] + output_seq[
+                    mask_position + 1:bos_position]
+                max_length = max(max_length, len(return_seq))
+                return_seqs.append(return_seq)
+
+            for i in range(output_ids.shape[0]):
+                return_seqs[i] = [0] * (max_length - len(return_seqs[i])) + return_seqs[i]  # padding
+                if mask_token not in return_seqs[i]:
+                    stop = True
+
+            if stop:
+                break
 
-            # sample
-            probs = nn.functional.softmax(next_token_scores, dim=-1)
-            if generation_config.do_sample:
-                next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1)
-            else:
-                next_tokens = torch.argmax(probs, dim=-1)
+            for return_seq in return_seqs:
+                return_seq += [bos]
 
-            # update generated ids, model inputs, and length for next step
-            input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1)
-            model_kwargs = self._update_model_kwargs_for_generation(
-                outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder
-            )
-            unfinished_sequences = unfinished_sequences.mul((sum(next_tokens != i for i in eos_token_id)).long())
+            kwargs['input_ids'] = torch.tensor(return_seqs, dtype=torch.long, device=kwargs['input_ids'].device)
 
-            # stop when each sentence is finished, or if we exceed the maximum length
-            if unfinished_sequences.max() == 0 or stopping_criteria(input_ids, scores):
-                break
-            yield input_ids
+        return torch.tensor(return_seqs, dtype=torch.long, device=kwargs['input_ids'].device)
 
     def quantize(self, bits: int):
         from .quantization import quantize
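The diff also enables gradient checkpointing by default (`self.gradient_checkpointing = True`) and routes each transformer layer through `torch.utils.checkpoint.checkpoint` during training. A minimal sketch of that pattern, assuming a toy stack of `nn.Linear` layers in place of the GLM blocks:

```python
import torch
import torch.utils.checkpoint
from torch import nn

def run_layers(layers, hidden_states, gradient_checkpointing=True, training=True):
    # Mirrors the loop in ChatGLMModel.forward: a checkpointed layer does not
    # keep its activations; they are recomputed during backward, trading
    # compute for a smaller memory footprint (which is also why it is
    # incompatible with use_cache=True, as the diff warns).
    for layer in layers:
        if gradient_checkpointing and training:
            def create_custom_forward(module):
                def custom_forward(*inputs):
                    return module(*inputs)
                return custom_forward
            hidden_states = torch.utils.checkpoint.checkpoint(
                create_custom_forward(layer), hidden_states
            )
        else:
            hidden_states = layer(hidden_states)
    return hidden_states

layers = [nn.Linear(8, 8) for _ in range(3)]
x = torch.randn(2, 8, requires_grad=True)
run_layers(layers, x).sum().backward()  # activations recomputed here
print(x.grad.shape)  # torch.Size([2, 8])
```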
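The replacement `generate` keeps calling `super().generate(**kwargs)` and splices each generated span back into the slot of the mask token: the prefix up to the mask, then the tokens produced between `bos` and `eos`, then whatever originally followed the mask. A standalone sketch of that splice, using the same hard-coded token ids as the diff and a made-up toy sequence:

```python
MASK, gMASK = 150000, 150001
bos, eos = 150004, 150005

def splice(output_seq):
    # Same index arithmetic as the loop body in the new `generate` above.
    mask_token = MASK if MASK in output_seq else gMASK
    mask_position = output_seq.index(mask_token)
    bos_position = output_seq.index(bos)
    eos_position = output_seq.index(eos) if eos in output_seq else len(output_seq)
    # prefix up to the mask + generated infill + the tail that followed the mask
    return (output_seq[:mask_position]
            + output_seq[bos_position + 1:eos_position]
            + output_seq[mask_position + 1:bos_position])

# toy ids: [1, MASK, 2, bos, 7, 8, eos] -> [1, 7, 8, 2]
print(splice([1, MASK, 2, bos, 7, 8, eos]))
```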