TenFate committed on
Commit
1d95bcf
·
verified ·
1 Parent(s): 9937a04

Delete modeling_zdxdllm.py

Browse files
Files changed (1) hide show
  1. modeling_zdxdllm.py +0 -1143
modeling_zdxdllm.py DELETED
@@ -1,1143 +0,0 @@
1
- import json
2
- import math
3
- import copy
4
- import warnings
5
- import re
6
- import sys
7
-
8
- import torch
9
- import torch.utils.checkpoint
10
- import torch.nn.functional as F
11
- from torch import nn
12
- from torch.nn import CrossEntropyLoss, LayerNorm, MSELoss, BCEWithLogitsLoss
13
- from torch.nn.utils import skip_init
14
- from typing import Optional, Tuple, Union, List, Callable, Dict, Any
15
- from copy import deepcopy
16
-
17
- from transformers.modeling_outputs import (
18
- BaseModelOutputWithPast,
19
- CausalLMOutputWithPast,
20
- SequenceClassifierOutputWithPast,
21
- )
22
- from transformers.modeling_utils import PreTrainedModel
23
- from transformers.utils import logging, is_torch_npu_available
24
- from transformers.generation.logits_process import LogitsProcessor
25
- from transformers.generation.utils import LogitsProcessorList, StoppingCriteriaList, GenerationConfig, ModelOutput
26
-
27
- from .configuration_zdxdllm import ChatGLMConfig
28
-
29
# Optional FlashAttention-2 support. The import is best-effort: when
# `transformers` is too old to expose the helpers, or `flash_attn` is missing
# or fails to load, the module must still import so that the other attention
# implementations keep working. In that case the flash-attn names below stay
# undefined and only the "flash_attention_2" code path is unusable.
try:
    from transformers.utils import is_flash_attn_greater_or_equal_2_10, is_flash_attn_2_available
    if is_flash_attn_2_available():
        from flash_attn import flash_attn_func, flash_attn_varlen_func
        from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input  # noqa
except Exception:
    # Catch `Exception` rather than using a bare `except:` so that
    # KeyboardInterrupt / SystemExit raised during import are not swallowed.
    pass
36
-
37
-
38
- # flags required to enable jit fusion kernels
39
-
40
if sys.platform != 'darwin' and not is_torch_npu_available():
    # Private torch._C switches: disable the JIT profiling mode/executor and
    # allow fusion on both CPU and GPU. Skipped when running on macOS
    # ('darwin') or when transformers reports a torch NPU backend.
    torch._C._jit_set_profiling_mode(False)
    torch._C._jit_set_profiling_executor(False)
    torch._C._jit_override_can_fuse_on_cpu(True)
    torch._C._jit_override_can_fuse_on_gpu(True)

# Module-level logger obtained from the transformers logging utilities.
logger = logging.get_logger(__name__)

# Presumably consumed by docstring tooling; not referenced in the visible
# part of this file -- TODO confirm before removing.
_CHECKPOINT_FOR_DOC = "THUDM/ChatGLM"
_CONFIG_FOR_DOC = "ChatGLMConfig"
50
-
51
-
52
def default_init(cls, *args, **kwargs):
    """Construct ``cls`` normally, forwarding every argument unchanged.

    Has the same call shape as ``torch.nn.utils.skip_init`` so the two can be
    used interchangeably as an "init method".
    """
    instance = cls(*args, **kwargs)
    return instance
54
-
55
-
56
class InvalidScoreLogitsProcessor(LogitsProcessor):
    """Logits processor that repairs score tensors containing NaN or Inf.

    If any score is NaN or infinite, the whole tensor is zeroed in place and
    index 198 along the last dimension is set to 5e4. Otherwise the scores are
    returned untouched.
    """

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
        if not (torch.isnan(scores).any() or torch.isinf(scores).any()):
            # Nothing to repair.
            return scores

        # In-place reset: the caller's tensor is modified as well.
        scores.zero_()
        scores[..., 198] = 5e4
        return scores
62
-
63
-
64
def split_tensor_along_last_dim(
        tensor: torch.Tensor,
        num_partitions: int,
        contiguous_split_chunks: bool = False,
) -> Tuple[torch.Tensor, ...]:
    """Split a tensor into equal chunks along its last dimension.

    Arguments:
        tensor: input tensor.
        num_partitions: number of equally sized chunks to produce.
        contiguous_split_chunks: If True, make each chunk contiguous
                                 in memory.

    Returns:
        A tuple of ``num_partitions`` tensors (views of ``tensor`` unless
        ``contiguous_split_chunks`` is set).

    Raises:
        ValueError: if ``num_partitions`` is not positive, or if the last
            dimension is not evenly divisible by ``num_partitions``. Without
            this check ``torch.split`` would silently return a different
            number of chunks than requested.
    """
    if num_partitions <= 0:
        raise ValueError(f"num_partitions must be positive, got {num_partitions}")

    # Get the size and dimension.
    last_dim = tensor.dim() - 1
    last_dim_total = tensor.size()[last_dim]
    if last_dim_total % num_partitions != 0:
        raise ValueError(
            f"last dimension of size {last_dim_total} is not divisible by "
            f"num_partitions={num_partitions}"
        )
    chunk_size = last_dim_total // num_partitions

    # Split.
    chunks = torch.split(tensor, chunk_size, dim=last_dim)

    # Note: torch.split does not create contiguous tensors by default.
    if contiguous_split_chunks:
        return tuple(chunk.contiguous() for chunk in chunks)

    return chunks
90
-
91
-
92
class RotaryEmbedding(nn.Module):
    """Builds the cos/sin lookup table used for rotary position embeddings.

    ``inv_freq`` is registered as a buffer; ``forward`` uses only its dtype
    and device. The table itself is recomputed on every call.
    """

    def __init__(self, dim, rope_ratio=1, original_impl=False, device=None, dtype=None):
        super().__init__()
        exponents = torch.arange(0, dim, 2, device=device).to(dtype=dtype) / dim
        self.register_buffer("inv_freq", 1.0 / (10000 ** exponents))
        self.dim = dim
        self.original_impl = original_impl
        self.rope_ratio = rope_ratio

    def forward_impl(
            self, seq_len: int, n_elem: int, dtype: torch.dtype, device: torch.device, base: int = 10000
    ):
        """Compute the rotary cache of shape ``[seq_len, n_elem // 2, 2]``.

        The last axis holds ``(cos, sin)`` of ``position * theta_i``, where
        ``theta_i = 1 / (base * rope_ratio) ** (2i / n_elem)``.

        Derived from: https://github.com/labmlai/annotated_deep_learning_paper_implementations/blob/master/labml_nn/
        transformers/rope/__init__.py. MIT License:
        https://github.com/labmlai/annotated_deep_learning_paper_implementations/blob/master/license.
        """
        # The frequency base is stretched by the configured rope ratio.
        scaled_base = base * self.rope_ratio
        exponents = torch.arange(0, n_elem, 2, dtype=torch.float, device=device) / n_elem
        theta = 1.0 / (scaled_base ** exponents)

        # Positions `[0, 1, ..., seq_len - 1]` times every frequency.
        positions = torch.arange(seq_len, dtype=torch.float, device=device)
        angles = torch.outer(positions, theta).float()

        cache = torch.stack([torch.cos(angles), torch.sin(angles)], dim=-1)

        # this is to mimic the behaviour of complex32, else we will get different results
        if dtype == torch.bfloat16:
            return cache.bfloat16()
        if dtype in (torch.float16, torch.int8):
            return cache.half()
        return cache

    def forward(self, max_seq_len, offset=0):
        """Return the cache for ``max_seq_len`` positions.

        ``offset`` is accepted but not used by this implementation.
        """
        reference = self.inv_freq
        return self.forward_impl(max_seq_len, self.dim, dtype=reference.dtype, device=reference.device)
131
-
132
-
133
@torch.jit.script
def apply_rotary_pos_emb(x: torch.Tensor, rope_cache: torch.Tensor) -> torch.Tensor:
    """Rotate the leading channels of ``x`` with the given cos/sin cache.

    ``x`` is indexed as [b, np, sq, hn]. The first ``rope_cache.shape[-2] * 2``
    channels are treated as (even, odd) pairs and rotated; the remaining
    channels are passed through unchanged.
    """
    batch = x.size(0)
    heads = x.size(1)
    seq_len = x.size(2)
    rot_dim = rope_cache.shape[-2] * 2

    rotary_part = x[..., :rot_dim]
    passthrough = x[..., rot_dim:]

    # truncate to support variable sizes
    cache = rope_cache[:, :seq_len]
    pairs = rotary_part.reshape(batch, heads, seq_len, rot_dim // 2, 2)
    cache = cache.view(-1, 1, seq_len, pairs.size(3), 2)

    cos_part = cache[..., 0]
    sin_part = cache[..., 1]
    even = pairs[..., 0]
    odd = pairs[..., 1]

    rotated = torch.stack(
        [
            even * cos_part - odd * sin_part,
            odd * cos_part + even * sin_part,
        ],
        -1,
    )
    rotated = rotated.flatten(3)
    return torch.cat((rotated, passthrough), dim=-1)
152
-
153
-
154
class RMSNorm(torch.nn.Module):
    """Root-mean-square normalisation over the last dimension.

    The learnable ``weight`` is allocated with ``torch.empty`` and therefore
    holds arbitrary values until it is initialised or loaded. Extra keyword
    arguments are accepted and ignored.
    """

    def __init__(self, normalized_shape, eps=1e-5, device=None, dtype=None, **kwargs):
        super().__init__()
        scale = torch.empty(normalized_shape, device=device, dtype=dtype)
        self.weight = torch.nn.Parameter(scale)
        self.eps = eps

    def forward(self, hidden_states: torch.Tensor):
        """Normalise ``hidden_states`` and return it in its original dtype."""
        original_dtype = hidden_states.dtype
        # The mean of squares is accumulated in float32.
        mean_square = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)
        inverse_rms = torch.rsqrt(mean_square + self.eps)
        normalised = hidden_states * inverse_rms
        scaled = self.weight * normalised
        return scaled.to(original_dtype)
166
-
167
-
168
class CoreAttention(torch.nn.Module):
    """Eager scaled dot-product attention built from baddbmm/softmax/bmm.

    Takes already-projected query/key/value tensors and returns the merged
    context tensor. Also serves as the base class for the other attention
    back-ends in this file.
    """

    def __init__(self, config: ChatGLMConfig, layer_number):
        super(CoreAttention, self).__init__()
        self.config = config
        self.apply_query_key_layer_scaling = config.apply_query_key_layer_scaling
        self.attention_softmax_in_fp32 = config.attention_softmax_in_fp32
        # Layer scaling forces the softmax to run in fp32.
        if self.apply_query_key_layer_scaling:
            self.attention_softmax_in_fp32 = True
        self.layer_number = max(1, layer_number)
        self.is_causal = True

        projection_size = config.kv_channels * config.num_attention_heads

        # Per attention head and per partition values.
        self.hidden_size_per_partition = projection_size
        self.hidden_size_per_attention_head = projection_size // config.num_attention_heads
        self.num_attention_heads_per_partition = config.num_attention_heads

        # With layer scaling the scores are divided by an extra `layer_number`
        # in the matmul and multiplied back by `coeff` in forward().
        coeff = None
        self.norm_factor = math.sqrt(self.hidden_size_per_attention_head)
        if self.apply_query_key_layer_scaling:
            coeff = self.layer_number
            self.norm_factor *= coeff
        self.coeff = coeff

        self.attention_dropout = torch.nn.Dropout(config.attention_dropout)

    def forward(self, query_layer, key_layer, value_layer, attention_mask):
        """Compute attention and return the context tensor, shape [b, sq, hp].

        Inputs are indexed as [b, np, s, hn]. `attention_mask` is a boolean
        tensor in which True marks positions to mask out, or None; when it is
        None and sq == sk a causal mask is built here.
        """
        # [b, np, sq, sk]
        output_size = (query_layer.size(0), query_layer.size(1), query_layer.size(2), key_layer.size(2))

        # [b, np, sq, hn] -> [b * np, sq, hn]
        query_layer = query_layer.view(output_size[0] * output_size[1], output_size[2], -1)
        # [b, np, sk, hn] -> [b * np, sk, hn]
        key_layer = key_layer.view(output_size[0] * output_size[1], output_size[3], -1)

        # preallocating input tensor: [b * np, sq, sk]
        # (its contents are ignored by baddbmm because beta=0.0)
        matmul_input_buffer = torch.empty(
            output_size[0] * output_size[1], output_size[2], output_size[3], dtype=query_layer.dtype,
            device=query_layer.device
        )

        # Raw attention scores. [b * np, sq, sk]
        matmul_result = torch.baddbmm(
            matmul_input_buffer,
            query_layer,  # [b * np, sq, hn]
            key_layer.transpose(1, 2),  # [b * np, hn, sk]
            beta=0.0,
            alpha=(1.0 / self.norm_factor),
        )

        # change view to [b, np, sq, sk]
        attention_scores = matmul_result.view(*output_size)

        # ===========================
        # Attention probs and dropout
        # ===========================

        # attention scores and attention mask [b, np, sq, sk]
        if self.attention_softmax_in_fp32:
            attention_scores = attention_scores.float()
        if self.coeff is not None:
            attention_scores = attention_scores * self.coeff
        # No mask given and square scores: build a causal mask (True above
        # the diagonal, i.e. future positions are masked).
        if attention_mask is None and attention_scores.shape[2] == attention_scores.shape[3]:
            attention_mask = torch.ones(output_size[0], 1, output_size[2], output_size[3],
                                        device=attention_scores.device, dtype=torch.bool)
            attention_mask.tril_()
            attention_mask = ~attention_mask
        if attention_mask is not None:
            attention_scores = attention_scores.masked_fill(attention_mask, float("-inf"))
        attention_probs = F.softmax(attention_scores, dim=-1)
        attention_probs = attention_probs.type_as(value_layer)

        # This is actually dropping out entire tokens to attend to, which might
        # seem a bit unusual, but is taken from the original Transformer paper.
        attention_probs = self.attention_dropout(attention_probs)

        # query layer shape: [b * np, sq, hn]
        # value layer shape: [b, np, sk, hn]
        # attention shape: [b, np, sq, sk]
        # context layer shape: [b, np, sq, hn]
        output_size = (value_layer.size(0), value_layer.size(1), query_layer.size(1), value_layer.size(3))
        # change view [b * np, sk, hn]
        value_layer = value_layer.view(output_size[0] * output_size[1], value_layer.size(2), -1)
        # change view [b * np, sq, sk]
        attention_probs = attention_probs.view(output_size[0] * output_size[1], output_size[2], -1)
        # matmul: [b * np, sq, hn]
        context_layer = torch.bmm(attention_probs, value_layer)
        # change view [b, np, sq, hn]
        context_layer = context_layer.view(*output_size)
        # [b, np, sq, hn] --> [b, sq, np, hn]
        context_layer = context_layer.transpose(1, 2).contiguous()
        # [b, sq, np, hn] --> [b, sq, hp]
        new_context_layer_shape = context_layer.size()[:-2] + (self.hidden_size_per_partition,)
        context_layer = context_layer.reshape(*new_context_layer_shape)

        return context_layer
265
-
266
-
267
class SdpaAttention(CoreAttention):
    """Attention back-end based on ``F.scaled_dot_product_attention``."""

    def forward(self, query_layer, key_layer, value_layer, attention_mask):
        """Return the merged context tensor ``[b, sq, hp]``.

        With no mask and equal query/key lengths the kernel's built-in causal
        masking is used. Otherwise the incoming mask is inverted before being
        passed on, because here True marks masked positions while SDPA
        expects True to mean "may attend".
        """
        sdpa = torch.nn.functional.scaled_dot_product_attention
        # Dropout is only active in training mode.
        dropout_p = self.config.attention_dropout if self.training else 0.0

        implicit_causal = attention_mask is None and query_layer.shape[2] == key_layer.shape[2]
        if implicit_causal:
            context_layer = sdpa(query_layer, key_layer, value_layer,
                                 is_causal=True,
                                 dropout_p=dropout_p)
        else:
            keep_mask = None if attention_mask is None else ~attention_mask
            context_layer = sdpa(query_layer, key_layer, value_layer,
                                 keep_mask,
                                 dropout_p=dropout_p)

        # [b, np, sq, hn] --> [b, sq, np, hn] --> [b, sq, hp]
        merged = context_layer.transpose(1, 2).contiguous()
        merged_shape = merged.size()[:-2] + (self.hidden_size_per_partition,)
        return merged.reshape(*merged_shape)
283
-
284
-
285
- def _get_unpad_data(attention_mask):
286
- seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
287
- indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
288
- max_seqlen_in_batch = seqlens_in_batch.max().item()
289
- cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
290
- return (
291
- indices,
292
- cu_seqlens,
293
- max_seqlen_in_batch,
294
- )
295
-
296
-
297
- # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2
298
class FlashAttention2(CoreAttention):
    """Attention back-end that calls the flash-attn kernels.

    Relies on `flash_attn_func`, `flash_attn_varlen_func`, `index_first_axis`,
    `pad_input` and `unpad_input`, which are only imported at module level
    when flash-attn is available.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # True when the installed flash-attn is older than 2.10; see how
        # forward() adjusts `causal` in that case.
        self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()

    def forward(self, query_states, key_states, value_states, attention_mask):
        """Run flash attention and return a tensor reshaped to [b, sq, hp].

        `attention_mask`, when given, is used as a 2-D padding mask with
        non-zero entries for real tokens (it is summed and passed to
        `torch.nonzero` in `_get_unpad_data`); None selects the dense kernel.
        """
        # Swap dims 1 and 2 -- presumably [b, np, s, hn] -> [b, s, np, hn],
        # matching what SelfAttention passes in; TODO confirm.
        query_states = query_states.transpose(1, 2)
        key_states = key_states.transpose(1, 2)
        value_states = value_states.transpose(1, 2)
        batch_size, query_length = query_states.shape[:2]
        if not self._flash_attn_uses_top_left_mask:
            causal = self.is_causal
        else:
            # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__.
            causal = self.is_causal and query_length != 1
        # Dropout is only applied in training mode.
        dropout = self.config.attention_dropout if self.training else 0.0
        # Contains at least one padding token in the sequence
        if attention_mask is not None:
            # Strip padding, run the variable-length kernel, then re-pad.
            query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input(
                query_states, key_states, value_states, attention_mask, query_length
            )

            cu_seqlens_q, cu_seqlens_k = cu_seq_lens
            max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens

            attn_output_unpad = flash_attn_varlen_func(
                query_states,
                key_states,
                value_states,
                cu_seqlens_q=cu_seqlens_q,
                cu_seqlens_k=cu_seqlens_k,
                max_seqlen_q=max_seqlen_in_batch_q,
                max_seqlen_k=max_seqlen_in_batch_k,
                dropout_p=dropout,
                softmax_scale=None,
                causal=causal,
            )

            attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length)
        else:
            attn_output = flash_attn_func(
                query_states, key_states, value_states, dropout, softmax_scale=None, causal=causal
            )
        # Merge the head dimensions into one hidden dimension.
        attn_output = attn_output.reshape(batch_size, query_length, self.hidden_size_per_partition).contiguous()
        return attn_output

    def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length):
        """Remove padded positions from q/k/v ahead of the varlen kernel.

        Returns the unpadded q/k/v, the query indices needed to re-pad the
        output, and (q, k) pairs of cumulative sequence lengths and maximum
        sequence lengths.
        """
        indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask)
        batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape

        # Flatten batch and sequence, then keep only the unpadded rows.
        key_layer = index_first_axis(
            key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
        )
        value_layer = index_first_axis(
            value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
        )
        if query_length == kv_seq_len:
            # Queries cover the same positions as keys: reuse the key metadata.
            query_layer = index_first_axis(
                query_layer.reshape(batch_size * kv_seq_len, self.num_attention_heads_per_partition, head_dim), indices_k
            )
            cu_seqlens_q = cu_seqlens_k
            max_seqlen_in_batch_q = max_seqlen_in_batch_k
            indices_q = indices_k
        elif query_length == 1:
            # Exactly one query token per batch row.
            max_seqlen_in_batch_q = 1
            cu_seqlens_q = torch.arange(
                batch_size + 1, dtype=torch.int32, device=query_layer.device
            )  # There is a memcpy here, that is very bad.
            indices_q = cu_seqlens_q[:-1]
            query_layer = query_layer.squeeze(1)
        else:
            # The -q_len: slice assumes left padding.
            attention_mask = attention_mask[:, -query_length:]
            query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask)

        return (
            query_layer,
            key_layer,
            value_layer,
            indices_q,
            (cu_seqlens_q, cu_seqlens_k),
            (max_seqlen_in_batch_q, max_seqlen_in_batch_k),
        )
381
-
382
-
383
# Maps an attention-implementation name to the class that implements it.
# SelfAttention indexes this with `config._attn_implementation`.
CORE_ATTENTION_CLASSES = {
    "eager": CoreAttention,
    "sdpa": SdpaAttention,
    "flash_attention_2": FlashAttention2
}
388
-
389
-
390
class SelfAttention(torch.nn.Module):
    """Self-attention layer: QKV projection, rotary embedding, KV cache,
    core attention and output projection.

    forward() annotates its input as [b, sq, h] and the output projection
    maps back to `config.hidden_size`.
    """

    def __init__(self, config: ChatGLMConfig, layer_number, device=None):
        super(SelfAttention, self).__init__()
        self.layer_number = max(1, layer_number)

        self.projection_size = config.kv_channels * config.num_attention_heads

        # Per attention head and per partition values.
        self.hidden_size_per_attention_head = self.projection_size // config.num_attention_heads
        self.num_attention_heads_per_partition = config.num_attention_heads

        self.multi_query_attention = config.multi_query_attention
        # Default: fused Q, K and V, each `projection_size` wide.
        self.qkv_hidden_size = 3 * self.projection_size
        if self.multi_query_attention:
            # Multi-query: full-width Q, but K and V only have
            # `multi_query_group_num` heads each.
            self.num_multi_query_groups_per_partition = config.multi_query_group_num
            self.qkv_hidden_size = (
                    self.projection_size + 2 * self.hidden_size_per_attention_head * config.multi_query_group_num
            )
        self.query_key_value = nn.Linear(config.hidden_size, self.qkv_hidden_size,
                                         bias=config.add_bias_linear or config.add_qkv_bias,
                                         device=device, **_config_to_kwargs(config)
                                         )

        # Attention back-end selected by the configured implementation name.
        self.core_attention = CORE_ATTENTION_CLASSES[config._attn_implementation](config, self.layer_number)

        # Output.
        self.dense = nn.Linear(self.projection_size, config.hidden_size, bias=config.add_bias_linear,
                               device=device, **_config_to_kwargs(config)
                               )

    def _allocate_memory(self, inference_max_sequence_len, batch_size, device=None, dtype=None):
        """Return an uninitialised [max_len, batch, heads, head_dim] buffer.

        `heads` is the multi-query group count when multi-query attention is
        enabled, otherwise the number of attention heads. Not called anywhere
        in the visible part of this file.
        """
        if self.multi_query_attention:
            num_attention_heads = self.num_multi_query_groups_per_partition
        else:
            num_attention_heads = self.num_attention_heads_per_partition
        return torch.empty(
            inference_max_sequence_len,
            batch_size,
            num_attention_heads,
            self.hidden_size_per_attention_head,
            dtype=dtype,
            device=device,
        )

    def forward(
            self, hidden_states, attention_mask, rotary_pos_emb, kv_cache=None, use_cache=True
    ):
        """Apply self-attention and return `(output, kv_cache)`.

        The returned `kv_cache` is None when `use_cache` is false. When
        `use_cache` is true it is a stacked tensor if no cache was passed in,
        or a `(key, value)` tuple if an existing cache was extended.
        """
        # hidden_states: [b, sq, h]

        # =====================
        # Query, Key, and Value
        # =====================

        # Attention heads [b, sq, h] --> [b, sq, (np * 3 * hn)]
        mixed_x_layer = self.query_key_value(hidden_states)

        if self.multi_query_attention:
            # Q is full width; K and V only span the multi-query groups.
            (query_layer, key_layer, value_layer) = mixed_x_layer.split(
                [
                    self.num_attention_heads_per_partition * self.hidden_size_per_attention_head,
                    self.num_multi_query_groups_per_partition * self.hidden_size_per_attention_head,
                    self.num_multi_query_groups_per_partition * self.hidden_size_per_attention_head,
                ],
                dim=-1,
            )
            query_layer = query_layer.view(
                query_layer.size()[:-1] + (self.num_attention_heads_per_partition, self.hidden_size_per_attention_head)
            )
            key_layer = key_layer.view(
                key_layer.size()[:-1] + (self.num_multi_query_groups_per_partition, self.hidden_size_per_attention_head)
            )
            value_layer = value_layer.view(
                value_layer.size()[:-1]
                + (self.num_multi_query_groups_per_partition, self.hidden_size_per_attention_head)
            )
        else:
            new_tensor_shape = mixed_x_layer.size()[:-1] + \
                               (self.num_attention_heads_per_partition,
                                3 * self.hidden_size_per_attention_head)
            mixed_x_layer = mixed_x_layer.view(*new_tensor_shape)

            # [b, sq, np, 3 * hn] --> 3 [b, sq, np, hn]
            (query_layer, key_layer, value_layer) = split_tensor_along_last_dim(mixed_x_layer, 3)

        # [b, sq, np, hn] -> [b, np, sq, hn]
        query_layer, key_layer, value_layer = [k.transpose(1, 2) for k in [query_layer, key_layer, value_layer]]

        # apply relative positional encoding (rotary embedding)
        if rotary_pos_emb is not None:
            query_layer = apply_rotary_pos_emb(query_layer, rotary_pos_emb)
            key_layer = apply_rotary_pos_emb(key_layer, rotary_pos_emb)

        # adjust key and value for inference: prepend cached keys/values
        # along the sequence dimension (dim=2)
        if kv_cache is not None:
            cache_k, cache_v = kv_cache
            key_layer = torch.cat((cache_k, key_layer), dim=2)
            value_layer = torch.cat((cache_v, value_layer), dim=2)
        if use_cache:
            if kv_cache is None:
                # No incoming cache: stack key and value into one tensor with
                # two leading axes, [1, 2, ...].
                kv_cache = torch.cat((key_layer.unsqueeze(0).unsqueeze(0), value_layer.unsqueeze(0).unsqueeze(0)),
                                     dim=1)
            else:
                # Incoming cache: return the extended (key, value) tuple.
                kv_cache = (key_layer, value_layer)
        else:
            kv_cache = None

        if self.multi_query_attention:
            # Repeat every K/V group so that K and V end up with one head per
            # query head: [b, g, s, hn] -> [b, g, np // g, s, hn] -> [b, np, s, hn]
            key_layer = key_layer.unsqueeze(2)
            key_layer = key_layer.expand(
                -1, -1, self.num_attention_heads_per_partition // self.num_multi_query_groups_per_partition, -1, -1
            )
            key_layer = key_layer.contiguous().view(
                key_layer.size()[:1] + (self.num_attention_heads_per_partition,) + key_layer.size()[3:]
            )
            value_layer = value_layer.unsqueeze(2)
            value_layer = value_layer.expand(
                -1, -1, self.num_attention_heads_per_partition // self.num_multi_query_groups_per_partition, -1, -1
            )
            value_layer = value_layer.contiguous().view(
                value_layer.size()[:1] + (self.num_attention_heads_per_partition,) + value_layer.size()[3:]
            )

        # ==================================
        # core attention computation
        # ==================================

        context_layer = self.core_attention(query_layer, key_layer, value_layer, attention_mask)

        # =================
        # Output projection back to config.hidden_size
        # =================

        output = self.dense(context_layer)

        return output, kv_cache
534
-
535
-
536
- def _config_to_kwargs(args):
537
- common_kwargs = {
538
- "dtype": args.torch_dtype,
539
- }
540
- return common_kwargs
541
-
542
-
543
class MLP(torch.nn.Module):
    """Feed-forward block with a SwiGLU activation.

    The input of width ``hidden_size`` is projected to
    ``2 * ffn_hidden_size``, gated down to ``ffn_hidden_size`` by SwiGLU and
    projected back to ``hidden_size``.
    """

    def __init__(self, config: ChatGLMConfig, device=None):
        super(MLP, self).__init__()

        self.add_bias = config.add_bias_linear

        # Up-projection. The output width is doubled because SwiGLU consumes
        # two halves, see https://arxiv.org/pdf/2002.05202.pdf
        self.dense_h_to_4h = nn.Linear(
            config.hidden_size,
            config.ffn_hidden_size * 2,
            bias=self.add_bias,
            device=device,
            **_config_to_kwargs(config)
        )

        def swiglu(x):
            # silu(first half) * second half, split along the last dimension.
            halves = torch.chunk(x, 2, dim=-1)
            return F.silu(halves[0]) * halves[1]

        self.activation_func = swiglu

        # Down-projection back to the model width.
        self.dense_4h_to_h = nn.Linear(
            config.ffn_hidden_size,
            config.hidden_size,
            bias=self.add_bias,
            device=device,
            **_config_to_kwargs(config)
        )

    def forward(self, hidden_states):
        """Apply up-projection, SwiGLU and down-projection."""
        expanded = self.dense_h_to_4h(hidden_states)
        gated = self.activation_func(expanded)
        return self.dense_4h_to_h(gated)
587
-
588
-
589
class GLMBlock(torch.nn.Module):
    """One transformer layer: norm -> self-attention -> norm -> MLP.

    Each sub-layer is followed by dropout and a residual connection. The
    output has the same shape as the input hidden states.
    """

    def __init__(self, config: ChatGLMConfig, layer_number, device=None):
        super(GLMBlock, self).__init__()
        self.layer_number = layer_number

        self.apply_residual_connection_post_layernorm = config.apply_residual_connection_post_layernorm
        self.fp32_residual_connection = config.fp32_residual_connection

        # Both norms share one implementation, chosen by the config.
        norm_cls = RMSNorm if config.rmsnorm else LayerNorm

        # Norm applied to the block input.
        self.input_layernorm = norm_cls(config.hidden_size, eps=config.layernorm_epsilon, device=device,
                                        dtype=config.torch_dtype)

        # Self attention.
        self.self_attention = SelfAttention(config, layer_number, device=device)
        self.hidden_dropout = config.hidden_dropout

        # Norm applied to the attention output.
        self.post_attention_layernorm = norm_cls(config.hidden_size, eps=config.layernorm_epsilon, device=device,
                                                 dtype=config.torch_dtype)

        # MLP
        self.mlp = MLP(config, device=device)

    def forward(
            self, hidden_states, attention_mask, rotary_pos_emb, kv_cache=None, use_cache=True,
    ):
        """Run the layer and return ``(output, kv_cache)``.

        ``kv_cache`` is whatever the self-attention sub-layer returns.
        """
        dropout = torch.nn.functional.dropout
        # When set, the residual branch starts from the normalised tensor
        # instead of the raw sub-layer input.
        residual_post_norm = self.apply_residual_connection_post_layernorm

        # --- attention sub-layer ---
        normed_input = self.input_layernorm(hidden_states)
        attention_output, kv_cache = self.self_attention(
            normed_input,
            attention_mask,
            rotary_pos_emb,
            kv_cache=kv_cache,
            use_cache=use_cache
        )
        skip = normed_input if residual_post_norm else hidden_states
        attention_branch = dropout(attention_output, p=self.hidden_dropout, training=self.training)
        after_attention = skip + attention_branch

        # --- MLP sub-layer ---
        normed_attention = self.post_attention_layernorm(after_attention)
        mlp_output = self.mlp(normed_attention)
        skip = normed_attention if residual_post_norm else after_attention
        mlp_branch = dropout(mlp_output, p=self.hidden_dropout, training=self.training)
        output = skip + mlp_branch

        return output, kv_cache
661
-
662
-
663
class GLMTransformer(torch.nn.Module):
    """Stack of ``config.num_layers`` GLMBlock layers with an optional final norm."""

    def __init__(self, config: ChatGLMConfig, device=None):
        super(GLMTransformer, self).__init__()

        self.fp32_residual_connection = config.fp32_residual_connection
        self.post_layer_norm = config.post_layer_norm

        # Number of layers.
        self.num_layers = config.num_layers

        # Transformer layers. Layer numbers passed to GLMBlock are 1-based.
        def build_layer(layer_number):
            return GLMBlock(config, layer_number, device=device)

        self.layers = torch.nn.ModuleList([build_layer(i + 1) for i in range(self.num_layers)])

        if self.post_layer_norm:
            LayerNormFunc = RMSNorm if config.rmsnorm else LayerNorm
            # Final layer norm before output.
            self.final_layernorm = LayerNormFunc(config.hidden_size, eps=config.layernorm_epsilon, device=device,
                                                 dtype=config.torch_dtype)

        self.gradient_checkpointing = False

    def _get_layer(self, layer_number):
        """Return the layer at 0-based index ``layer_number``."""
        return self.layers[layer_number]

    def forward(
            self, hidden_states, attention_mask, rotary_pos_emb, kv_caches=None,
            use_cache: Optional[bool] = True,
            output_hidden_states: Optional[bool] = False,
    ):
        """Run all layers in order.

        Returns:
            A 4-tuple ``(hidden_states, presents, all_hidden_states,
            all_self_attentions)``:
              * ``presents`` is None when caching is off. With caching on it
                is a tuple of per-layer caches when an existing cache was
                passed in, or one tensor concatenated over layers (dim 0)
                when no cache was passed in.
              * ``all_hidden_states`` is a tuple (layer inputs plus the last
                layer's output) when ``output_hidden_states`` is set,
                otherwise None.
              * ``all_self_attentions`` is always None.
        """
        if not kv_caches:
            kv_caches = [None for _ in range(self.num_layers)]
        if self.gradient_checkpointing and self.training:
            if use_cache:
                logger.warning_once(
                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
                )
                use_cache = False
        # Initialise only after the gradient-checkpointing override above, so
        # that a forced `use_cache=False` yields `None` rather than an empty
        # tuple that looks like a cache.
        presents = () if use_cache else None

        all_self_attentions = None
        all_hidden_states = () if output_hidden_states else None
        for index in range(self.num_layers):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            layer = self._get_layer(index)
            if self.gradient_checkpointing and self.training:
                layer_ret = torch.utils.checkpoint.checkpoint(
                    layer,
                    hidden_states,
                    attention_mask,
                    rotary_pos_emb,
                    kv_caches[index],
                    use_cache,
                    use_reentrant=False
                )
            else:
                layer_ret = layer(
                    hidden_states,
                    attention_mask,
                    rotary_pos_emb,
                    kv_cache=kv_caches[index],
                    use_cache=use_cache
                )
            hidden_states, kv_cache = layer_ret
            if use_cache:
                # token by token decoding, use tuple format
                if kv_caches[0] is not None:
                    presents = presents + (kv_cache,)
                # prefilling in decoding, use tensor format to save cuda memory
                else:
                    if len(presents) == 0:
                        presents = kv_cache
                    else:
                        presents = torch.cat((presents, kv_cache.to(presents.device)), dim=0)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        # Final layer norm.
        if self.post_layer_norm:
            hidden_states = self.final_layernorm(hidden_states)

        return hidden_states, presents, all_hidden_states, all_self_attentions
752
-
753
-
754
class ChatGLMPreTrainedModel(PreTrainedModel):
    """
    Base class holding the model-wide settings and the helpers that build
    attention masks and position ids.
    """

    is_parallelizable = False
    supports_gradient_checkpointing = True
    config_class = ChatGLMConfig
    base_model_prefix = "transformer"
    _no_split_modules = ["GLMBlock"]
    _supports_flash_attn_2 = True
    _supports_sdpa = True

    def _init_weights(self, module: nn.Module):
        """Deliberately a no-op: no weights are initialised here."""
        return

    def get_masks(self, input_ids, past_key_values, padding_mask=None):
        """Build the attention mask for the configured attention back-end.

        For ``flash_attention_2`` the padding mask itself is returned when it
        contains at least one zero, otherwise None. For the other back-ends a
        boolean ``[b, 1, sq, sk]`` mask is returned in which True marks
        positions that must not be attended to.
        """
        if self.config._attn_implementation == "flash_attention_2":
            has_padding = padding_mask is not None and not padding_mask.all()
            return padding_mask if has_padding else None

        batch_size, seq_length = input_ids.shape
        device = input_ids.device

        # Lower-triangular ones: every token may attend to itself and the past.
        visible = torch.ones(batch_size, seq_length, seq_length, device=device).tril_()

        past_length = past_key_values[0][0].shape[2] if past_key_values else 0
        if past_length:
            # All cached positions are visible to every new token.
            cached = torch.ones(batch_size, seq_length, past_length, device=device)
            visible = torch.cat((cached, visible), dim=-1)

        if padding_mask is not None:
            # Hide padded key positions.
            visible = visible * padding_mask.unsqueeze(1)
            if not past_length:
                # Adds 1 to the rows of padded query positions, so that those
                # rows stay above the 0.5 threshold below and are not masked.
                visible -= padding_mask.unsqueeze(-1) - 1

        blocked = (visible < 0.5).bool()
        return blocked.unsqueeze(1)

    def get_position_ids(self, input_ids, device):
        """Return ``[0, 1, ..., seq_length - 1]`` repeated for every batch row."""
        batch_size, seq_length = input_ids.shape
        single_row = torch.arange(seq_length, dtype=torch.long, device=device).unsqueeze(0)
        return single_row.repeat(batch_size, 1)
798
-
799
class Embedding(torch.nn.Module):
    """Token embedding table with an optional cast of the output to float32."""

    def __init__(self, config: ChatGLMConfig, device=None):
        super(Embedding, self).__init__()

        self.hidden_size = config.hidden_size
        # Lookup table of size padded_vocab_size x hidden_size.
        self.word_embeddings = nn.Embedding(
            config.padded_vocab_size,
            self.hidden_size,
            dtype=config.torch_dtype,
            device=device
        )
        self.fp32_residual_connection = config.fp32_residual_connection

    def forward(self, input_ids):
        """Look up ``input_ids``; cast to float32 if fp32 residuals are enabled."""
        embeddings = self.word_embeddings(input_ids)
        return embeddings.float() if self.fp32_residual_connection else embeddings
823
-
824
-
825
class ChatGLMModel(ChatGLMPreTrainedModel):
    """Bare ChatGLM network: embedding, rotary positions, GLM encoder and output projection.

    Args:
        config: Model configuration. The constructor reads ``num_layers``,
            ``multi_query_group_num``, ``kv_channels``, ``seq_length``,
            ``hidden_size``, ``num_attention_heads``, ``rope_ratio``,
            ``original_rope``, ``torch_dtype`` and ``padded_vocab_size``.
        device: Optional device forwarded to every sub-module that is built here.
        empty_init: When true, sub-modules are constructed through ``skip_init``;
            otherwise through ``default_init``.
    """

    def __init__(self, config: ChatGLMConfig, device=None, empty_init=True):
        super().__init__(config)
        init_method = skip_init if empty_init else default_init
        init_kwargs = {}
        if device is not None:
            init_kwargs["device"] = device
        self.embedding = init_method(Embedding, config, **init_kwargs)
        self.num_layers = config.num_layers
        self.multi_query_group_num = config.multi_query_group_num
        self.kv_channels = config.kv_channels

        # Rotary positional embeddings
        self.seq_length = config.seq_length
        # Per-head width: explicit ``kv_channels`` wins, else hidden_size / num_heads.
        rotary_dim = (
            config.hidden_size // config.num_attention_heads if config.kv_channels is None else config.kv_channels
        )

        self.rotary_pos_emb = RotaryEmbedding(rotary_dim // 2, rope_ratio=config.rope_ratio,
                                              original_impl=config.original_rope,
                                              device=device, dtype=config.torch_dtype)
        self.encoder = init_method(GLMTransformer, config, **init_kwargs)
        self.output_layer = init_method(nn.Linear, config.hidden_size, config.padded_vocab_size, bias=False,
                                        dtype=config.torch_dtype, **init_kwargs)

    def get_input_embeddings(self):
        """Return the word-embedding module held by ``self.embedding``."""
        return self.embedding.word_embeddings

    def set_input_embeddings(self, value):
        """Replace the word-embedding module held by ``self.embedding``."""
        self.embedding.word_embeddings = value

    def forward(
            self,
            input_ids,
            position_ids: Optional[torch.Tensor] = None,
            attention_mask: Optional[torch.BoolTensor] = None,
            full_attention_mask: Optional[torch.BoolTensor] = None,
            past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None,
            inputs_embeds: Optional[torch.Tensor] = None,
            use_cache: Optional[bool] = None,
            output_attentions: Optional[bool] = None,
            output_hidden_states: Optional[bool] = None,
            return_dict: Optional[bool] = None,
    ):
        """Run the embedding, rotary lookup and encoder stack.

        Args:
            input_ids: Two-dimensional token-id tensor, or ``None`` when
                ``inputs_embeds`` is supplied instead.
            position_ids: Optional indices used to gather rows of the rotary
                table; when omitted the first ``seq_length`` rows are used.
            attention_mask: Optional padding mask handed to ``get_masks``.
            full_attention_mask: Optional ready-made mask; when given, no mask
                is built here.
            past_key_values: Optional cache forwarded to the encoder as ``kv_caches``.
            inputs_embeds: Optional pre-computed embeddings that bypass
                ``self.embedding``.
            use_cache: Defaults to ``config.use_cache`` when ``None``.
            output_attentions: Accepted for interface compatibility; not read
                by this method.
            output_hidden_states: Defaults to ``config.output_hidden_states``
                when ``None``.
            return_dict: Defaults to ``config.use_return_dict`` when ``None``.

        Returns:
            ``BaseModelOutputWithPast`` when ``return_dict`` is true, otherwise a
            tuple holding the non-``None`` members of (hidden states, cache,
            all hidden states, all attentions).

        Raises:
            ValueError: If both ``input_ids`` and ``inputs_embeds`` are ``None``.
        """
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if input_ids is not None:
            _, seq_length = input_ids.shape
            mask_reference = input_ids
        elif inputs_embeds is not None:
            # Previously this path crashed on ``input_ids.shape``. The embeddings
            # carry the same leading (batch, seq) dimensions.
            _, seq_length = inputs_embeds.shape[:2]
            # NOTE(review): the visible body of get_masks reads only the 2-D shape
            # and the device of its first argument, so a (batch, seq) slice of the
            # embeddings stands in for the ids -- confirm against its full definition.
            mask_reference = inputs_embeds[..., 0]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        if inputs_embeds is None:
            inputs_embeds = self.embedding(input_ids)

        if full_attention_mask is None:
            # A mask is only needed when something is padded, or when a cache is
            # present and more than one new position is processed at once.
            if (attention_mask is not None and not attention_mask.all()) or (past_key_values and seq_length != 1):
                full_attention_mask = self.get_masks(mask_reference, past_key_values, padding_mask=attention_mask)

        # Rotary positional embeddings
        rotary_pos_emb = self.rotary_pos_emb(self.seq_length)
        if position_ids is not None:
            rotary_pos_emb = rotary_pos_emb[position_ids]
        else:
            rotary_pos_emb = rotary_pos_emb[None, :seq_length]

        # Run encoder.
        hidden_states, presents, all_hidden_states, all_self_attentions = self.encoder(
            inputs_embeds, full_attention_mask, rotary_pos_emb=rotary_pos_emb,
            kv_caches=past_key_values, use_cache=use_cache, output_hidden_states=output_hidden_states
        )
        if isinstance(presents, torch.Tensor):
            # Unstack a tensor-valued cache along its first two dimensions into a
            # tuple (one entry per index of dim 0) of tuples (one entry per index
            # of dim 1).
            presents = tuple(tuple(layer.unbind(0)) for layer in presents.unbind(0))

        if not return_dict:
            return tuple(v for v in [hidden_states, presents, all_hidden_states, all_self_attentions] if v is not None)

        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=presents,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
        )
915
-
916
-
917
class ChatGLMForConditionalGeneration(ChatGLMPreTrainedModel):
    """Language-modeling wrapper that projects ``ChatGLMModel`` hidden states to vocabulary logits.

    Args:
        config: Model configuration; ``max_length`` is stored as
            ``max_sequence_length`` and the whole object is kept as ``self.config``.
        empty_init: Forwarded to ``ChatGLMModel``.
        device: Forwarded to ``ChatGLMModel``.
    """

    def __init__(self, config: ChatGLMConfig, empty_init=True, device=None):
        super().__init__(config)

        self.max_sequence_length = config.max_length
        self.transformer = ChatGLMModel(config, empty_init=empty_init, device=device)
        self.config = config

    def _update_model_kwargs_for_generation(
            self,
            outputs: ModelOutput,
            model_kwargs: Dict[str, Any],
            is_encoder_decoder: bool = False,
            standardize_cache_format: bool = False,
    ) -> Dict[str, Any]:
        """Advance the generation keyword arguments by one decoding step.

        Stores the cache extracted from ``outputs``, appends a column of ones to
        ``attention_mask`` and a "last position + 1" column to ``position_ids``
        (each only when the key is present), and clears ``is_first_forward``.
        """
        # Carry the freshly produced cache into the next step.
        cache_name, cache = self._extract_past_from_model_output(
            outputs, standardize_cache_format=standardize_cache_format
        )
        model_kwargs[cache_name] = cache

        # The token generated in this step is always attended to.
        if "attention_mask" in model_kwargs:
            old_mask = model_kwargs["attention_mask"]
            extra_column = old_mask.new_ones((old_mask.shape[0], 1))
            model_kwargs["attention_mask"] = torch.cat([old_mask, extra_column], dim=-1)

        # The next position follows the last known one.
        if "position_ids" in model_kwargs:
            old_positions = model_kwargs["position_ids"]
            next_position = old_positions[..., -1:].clone()
            next_position += 1
            model_kwargs["position_ids"] = torch.cat([old_positions, next_position], dim=-1)

        model_kwargs["is_first_forward"] = False
        return model_kwargs

    def prepare_inputs_for_generation(
            self,
            input_ids: torch.LongTensor,
            past_key_values: Optional[torch.Tensor] = None,
            attention_mask: Optional[torch.Tensor] = None,
            position_ids: Optional[torch.Tensor] = None,
            use_cache: Optional[bool] = None,
            is_first_forward: bool = True,
            **kwargs
    ) -> dict:
        """Assemble the keyword arguments for one ``forward`` call during generation.

        Default position ids are created when none are passed. On every step
        after the first, provided a cache exists, ``input_ids`` and
        ``position_ids`` are cut down to their last column.
        """
        if position_ids is None:
            position_ids = self.get_position_ids(input_ids, device=input_ids.device)

        has_cache = past_key_values is not None
        if has_cache and not is_first_forward:
            # Earlier positions are already covered by the cache.
            position_ids = position_ids[..., -1:]
            input_ids = input_ids[:, -1:]

        return {
            "input_ids": input_ids,
            "past_key_values": past_key_values,
            "position_ids": position_ids,
            "attention_mask": attention_mask,
            "return_last_logit": True,
            "use_cache": use_cache
        }

    def forward(
            self,
            input_ids: Optional[torch.Tensor] = None,
            position_ids: Optional[torch.Tensor] = None,
            attention_mask: Optional[torch.Tensor] = None,
            past_key_values: Optional[Tuple[torch.FloatTensor]] = None,
            inputs_embeds: Optional[torch.Tensor] = None,
            labels: Optional[torch.Tensor] = None,
            use_cache: Optional[bool] = None,
            output_attentions: Optional[bool] = None,
            output_hidden_states: Optional[bool] = None,
            return_dict: Optional[bool] = None,
            return_last_logit: Optional[bool] = False,
    ):
        """Compute vocabulary logits and, when ``labels`` is given, the next-token loss.

        ``return_last_logit`` restricts the projection to the final position.
        The loss is a cross entropy between logits at position ``i`` and labels
        at position ``i + 1``, ignoring label value ``-100``; it is computed in
        float32 and then cast, together with the logits, to the dtype of the
        hidden states. ``output_attentions`` is accepted but not read here.

        Returns:
            ``CausalLMOutputWithPast`` when ``return_dict`` is true, otherwise a
            tuple of the logits followed by the remaining transformer outputs,
            prefixed with the loss when one was computed.
        """
        if use_cache is None:
            use_cache = self.config.use_cache
        if return_dict is None:
            return_dict = self.config.use_return_dict

        transformer_outputs = self.transformer(
            input_ids=input_ids,
            position_ids=position_ids,
            attention_mask=attention_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        hidden_states = transformer_outputs[0]
        if return_last_logit:
            hidden_states = hidden_states[:, -1:]
        lm_logits = self.transformer.output_layer(hidden_states)

        loss = None
        if labels is not None:
            lm_logits = lm_logits.to(torch.float32)

            # Position i is scored against the label at position i + 1.
            predictions = lm_logits[..., :-1, :].contiguous()
            targets = labels[..., 1:].contiguous()
            vocab_size = predictions.size(-1)
            criterion = CrossEntropyLoss(ignore_index=-100)
            loss = criterion(predictions.view(-1, vocab_size), targets.view(-1))

            lm_logits = lm_logits.to(hidden_states.dtype)
            loss = loss.to(hidden_states.dtype)

        if not return_dict:
            tail = (lm_logits,) + transformer_outputs[1:]
            if loss is None:
                return tail
            return (loss,) + tail

        return CausalLMOutputWithPast(
            loss=loss,
            logits=lm_logits,
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )

    @staticmethod
    def _reorder_cache(
            past: Tuple[Tuple[torch.Tensor, torch.Tensor], ...], beam_idx: torch.LongTensor
    ) -> Tuple[Tuple[torch.Tensor, torch.Tensor], ...]:
        """Re-order a ``past_key_values`` cache according to ``beam_idx``.

        For every layer entry, both cached tensors are indexed along dimension 0
        with ``beam_idx`` (moved to the tensor's device first), so that each
        cache row matches the beam selected for it at this generation step.
        """

        def pick(cached: torch.Tensor) -> torch.Tensor:
            return cached.index_select(0, beam_idx.to(cached.device))

        reordered = []
        for layer_past in past:
            reordered.append((pick(layer_past[0]), pick(layer_past[1])))
        return tuple(reordered)
1060
-
1061
class ChatGLMForSequenceClassification(ChatGLMPreTrainedModel):
    """Sequence-classification head on top of ``ChatGLMModel``.

    The hidden state at the last sequence position is (optionally) passed
    through dropout and then through a linear classifier.

    Args:
        config: Model configuration; ``num_labels``, ``hidden_size``,
            ``torch_dtype`` and ``classifier_dropout`` are read here.
        empty_init: Forwarded to ``ChatGLMModel``.
        device: Forwarded to ``ChatGLMModel`` and to the classifier head.
    """

    def __init__(self, config: ChatGLMConfig, empty_init=True, device=None):
        super().__init__(config)

        self.num_labels = config.num_labels
        self.transformer = ChatGLMModel(config, empty_init=empty_init, device=device)

        # Build the head on the same device as the backbone; previously ``device``
        # was ignored here, leaving the head on the default device.
        self.classifier_head = nn.Linear(
            config.hidden_size, config.num_labels, bias=True, dtype=config.torch_dtype, device=device
        )
        if config.classifier_dropout is not None:
            self.dropout = nn.Dropout(config.classifier_dropout)
        else:
            self.dropout = None
        self.config = config

    def forward(
            self,
            input_ids: Optional[torch.LongTensor] = None,
            position_ids: Optional[torch.LongTensor] = None,
            attention_mask: Optional[torch.Tensor] = None,
            full_attention_mask: Optional[torch.Tensor] = None,
            past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None,
            inputs_embeds: Optional[torch.LongTensor] = None,
            labels: Optional[torch.LongTensor] = None,
            use_cache: Optional[bool] = None,
            output_attentions: Optional[bool] = None,
            output_hidden_states: Optional[bool] = None,
            return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor, ...], SequenceClassifierOutputWithPast]:
        """Classify each sequence and, when ``labels`` is given, compute a loss.

        When ``config.problem_type`` is unset it is inferred once and written
        back to the config: ``"regression"`` for a single label,
        ``"single_label_classification"`` for several labels with integer
        targets, and ``"multi_label_classification"`` otherwise. The loss is
        then MSE, cross entropy or BCE-with-logits respectively, always computed
        on float32 logits.

        Returns:
            ``SequenceClassifierOutputWithPast`` when ``return_dict`` is true,
            otherwise a tuple of the logits followed by the remaining
            transformer outputs, prefixed with the loss when one was computed.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        transformer_outputs = self.transformer(
            input_ids=input_ids,
            position_ids=position_ids,
            attention_mask=attention_mask,
            full_attention_mask=full_attention_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        hidden_states = transformer_outputs[0]
        # Pool by taking the last position along dimension 1.
        pooled_hidden_states = hidden_states[:, -1]
        if self.dropout is not None:
            pooled_hidden_states = self.dropout(pooled_hidden_states)
        logits = self.classifier_head(pooled_hidden_states)

        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze().float(), labels.squeeze())
                else:
                    loss = loss_fct(logits.float(), labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels).float(), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits.float(), labels.view(-1, self.num_labels))

        if not return_dict:
            output = (logits,) + transformer_outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )