bugfix
modeling_motif.py  +0 -40  CHANGED
@@ -98,18 +98,15 @@ ALL_LAYERNORM_LAYERS.append(MotifRMSNorm)
 class MotifRotaryEmbeddingWithCache(nn.Module):
     """
     Rotary positional embedding module with caching for efficiency.
-
     Args:
         dim (int): Dimensionality of the embedding.
         max_position_embeddings (int): Maximum sequence length for caching. Default is 2048.
         base (int): Base for computing inverse frequency. Default is 10000.
         device (torch.device, optional): Device for tensor storage.
-
     Methods:
         forward(x, seq_len=None):
             Computes cosine and sine embeddings for input sequence length.
             Automatically updates cache if `seq_len` exceeds cached length.
-
     Attributes:
         inv_freq (torch.Tensor): Inverse frequency tensor for position encoding.
         cos_cached (torch.Tensor): Cached cosine embeddings.
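For orientation, the caching scheme this docstring describes can be sketched as below. This is a hypothetical re-implementation built only from the documented attributes (`inv_freq`, `cos_cached`) and defaults, not the module's actual code.

```python
import torch
from torch import nn

class RotaryCacheSketch(nn.Module):
    """Hypothetical sketch of the cos/sin caching described above (not the actual module)."""

    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
        super().__init__()
        # Inverse frequencies, one per pair of embedding dimensions.
        inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32, device=device) / dim))
        self.register_buffer("inv_freq", inv_freq, persistent=False)
        self._build_cache(max_position_embeddings, device)

    def _build_cache(self, seq_len, device=None):
        device = device if device is not None else self.inv_freq.device
        t = torch.arange(seq_len, dtype=torch.float32, device=device)
        freqs = torch.outer(t, self.inv_freq.to(device))   # (seq_len, dim // 2)
        emb = torch.cat((freqs, freqs), dim=-1)             # (seq_len, dim)
        self.cos_cached = emb.cos()
        self.sin_cached = emb.sin()
        self.max_seq_len_cached = seq_len

    def forward(self, x, seq_len=None):
        # Grow the cache on demand when a longer sequence is seen.
        if seq_len is not None and seq_len > self.max_seq_len_cached:
            self._build_cache(seq_len, device=x.device)
        return self.cos_cached[:seq_len].to(x.dtype), self.sin_cached[:seq_len].to(x.dtype)
```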
@@ -241,10 +238,8 @@ class MotifRotaryEmbedding(nn.Module):
 def rotate_half(x):
     """
     Rotates half of the dimensions of the input tensor using torch.roll and in-place negation.
-
     Args:
         x (torch.Tensor): The input tensor.
-
     Returns:
         torch.Tensor: A tensor where the latter half of the dimensions are negated
             and moved before the first half.
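As a reference for the roll-and-negate convention documented here, a minimal equivalent (my sketch, not the file's implementation) is:

```python
import torch

def rotate_half_sketch(x: torch.Tensor) -> torch.Tensor:
    # [x1, x2] -> [x2, x1] -> [-x2, x1], matching torch.cat((-x2, x1), dim=-1)
    half = x.shape[-1] // 2
    x = torch.roll(x, shifts=half, dims=-1)
    x[..., :half] = -x[..., :half]
    return x

x = torch.arange(6.0)              # [0., 1., 2., 3., 4., 5.]
print(rotate_half_sketch(x))       # tensor([-3., -4., -5.,  0.,  1.,  2.])
```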
@@ -259,7 +254,6 @@ def rotate_half(x):
 def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1, fused_rope=True):
     """
     Applies rotary position embeddings to the input tensors.
-
     Args:
         q (torch.Tensor): Query tensor of shape (B, NH, S, D_KV).
         k (torch.Tensor): Key tensor of shape (B, NH, S, D_KV).
@@ -270,7 +264,6 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1, fus
         fused_rope (bool, optional): If True, applies fused rotary embeddings using
             `moreh_ops.apply_rotary_emb`. If False, computes rotary embeddings manually.
             Defaults to False.
-
     Returns:
         Tuple[torch.Tensor, torch.Tensor]: Returns transformed query and key tensors after applying rotary embeddings.
     """
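For readers unfamiliar with the non-fused path (`fused_rope=False`), it usually reduces to the elementwise form below; a hedged sketch assuming the common Hugging Face RoPE convention and the `rotate_half` helper from this file:

```python
def apply_rotary_pos_emb_sketch(q, k, cos, sin, unsqueeze_dim=1):
    # cos/sin: (B, S, D) -> (B, 1, S, D) so they broadcast over the head dimension
    cos = cos.unsqueeze(unsqueeze_dim)
    sin = sin.unsqueeze(unsqueeze_dim)
    q_embed = (q * cos) + (rotate_half(q) * sin)
    k_embed = (k * cos) + (rotate_half(k) * sin)
    return q_embed, k_embed
```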
@@ -322,26 +315,21 @@ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
 class MotifAttention(nn.Module):
     """
     Differential Attention (DiffAttention) module.
-
     Implements the Differential Attention from
     "DIFFERENTIAL TRANSFORMER" (https://arxiv.org/pdf/2410.05258).
-
     Overview
     Standard transformers often over-allocate attention to irrelevant context.
     DiffAttention addresses this by computing attention as the difference between
     two separate softmax attention maps, effectively canceling noise and promoting
     sparse, structured attention patterns.
-
     Reference Implementation
     https://github.com/microsoft/unilm/tree/master/Diff-Transformer
-
     Args
     The differential attention mechanism computes attention as the difference of two softmax attention scores, weighted by a learnable scalar λ.
     λ is re-parameterized as λ = exp(λ_q1 · λ_k1) − exp(λ_q2 · λ_k2) + λ_init.
     - lambda_q1, lambda_q2 (nn.Parameter): Learnable vectors used to compute the first and second components of λ for query transformations.
     - lambda_k1, lambda_k2 (nn.Parameter): Learnable vectors used to compute the first and second components of λ for key transformations.
     - lambda_init (float): A constant used for initializing λ, typically set as λ_init = 0.8 − 0.6 × exp(−0.3 × (layer_index − 1)).
-
     """

     def __init__(self, config: MotifConfig, layer_idx: Optional[int] = None):
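A worked sketch of the λ re-parameterization spelled out in this docstring; the values, `head_dim`, and parameter shapes are illustrative assumptions, not the module's actual initialization.

```python
import math
import torch

head_dim, layer_idx = 64, 3
lambda_init = 0.8 - 0.6 * math.exp(-0.3 * (layer_idx - 1))   # layer-dependent constant

# Learnable nn.Parameter vectors in the real module; random tensors here for illustration.
lambda_q1, lambda_k1 = torch.randn(head_dim) * 0.1, torch.randn(head_dim) * 0.1
lambda_q2, lambda_k2 = torch.randn(head_dim) * 0.1, torch.randn(head_dim) * 0.1

# λ = exp(λ_q1 · λ_k1) − exp(λ_q2 · λ_k2) + λ_init
lam = torch.exp(lambda_q1 @ lambda_k1) - torch.exp(lambda_q2 @ lambda_k2) + lambda_init

# The attention map is then the difference of two softmax maps:
# attn = softmax(A1) - lam * softmax(A2)
```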
@@ -964,11 +952,9 @@ MOTIF_START_DOCSTRING = r"""
     This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
     library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
     etc.)
-
     This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
     Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
     and behavior.
-
     Parameters:
         config ([`MotifConfig`]):
             Model configuration class with all the parameters of the model. Initializing with a config file does not
@@ -1049,51 +1035,39 @@ MOTIF_INPUTS_DOCSTRING = r"""
     input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
         Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
         it.
-
         Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
         [`PreTrainedTokenizer.__call__`] for details.
-
         [What are input IDs?](../glossary#input-ids)
     attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
         Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
-
         - 1 for tokens that are **not masked**,
         - 0 for tokens that are **masked**.
-
         [What are attention masks?](../glossary#attention-mask)
-
         Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
         [`PreTrainedTokenizer.__call__`] for details.
-
         If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
         `past_key_values`).
-
         If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
         and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
         information on the default strategy.
-
         - 1 indicates the head is **not masked**,
         - 0 indicates the head is **masked**.
     position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
         Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
         config.n_positions - 1]`.
-
         [What are position IDs?](../glossary#position-ids)
     past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
         Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
         blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`
         returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
-
         Two formats are allowed:
         - a [`~cache_utils.Cache`] instance, see our
           [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache);
         - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
           shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy
           cache format.
-
         The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the
         legacy cache format will be returned.
-
         If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
         have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
         of shape `(batch_size, sequence_length)`.
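Since this block documents the model's call signature, a short usage sketch may help; the checkpoint path is a placeholder, and the cache handling follows the generic `transformers` convention described above rather than anything Motif-specific.

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("PATH_TO_MODEL", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained("PATH_TO_MODEL", trust_remote_code=True)

inputs = tokenizer("Hello", return_tensors="pt")     # input_ids + attention_mask (1 = attend, 0 = padding)
out = model(**inputs, use_cache=True)                # returns logits and past_key_values

# With a cache, only the newly generated token needs to be fed back in.
next_token = out.logits[:, -1:].argmax(dim=-1)
out = model(input_ids=next_token, past_key_values=out.past_key_values, use_cache=True)
```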
@@ -1126,7 +1100,6 @@ MOTIF_INPUTS_DOCSTRING = r"""
 class MotifModel(MotifPreTrainedModel):
     """
     Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`MotifDecoderLayer`]
-
     Args:
         config: MotifConfig
     """
@@ -1375,7 +1348,6 @@ class MotifModel(MotifPreTrainedModel):
         """
         Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
         `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.
-
         Args:
             attention_mask (`torch.Tensor`):
                 A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape `(batch_size, 1, query_length, key_value_length)`.
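A simplified illustration of the 2D-to-4D expansion this helper performs (my sketch; the real implementation also handles dtype/device details and the already-4D case):

```python
import torch

def expand_to_causal_4d(attention_mask: torch.Tensor, query_length: int, dtype=torch.float32):
    # attention_mask: (batch_size, key_value_length) with 1 = attend, 0 = padding
    batch_size, kv_length = attention_mask.shape
    min_value = torch.finfo(dtype).min

    # Causal part: query i (offset to the end of the key axis) may not attend to future keys.
    q_pos = torch.arange(query_length).unsqueeze(-1) + (kv_length - query_length)
    k_pos = torch.arange(kv_length).unsqueeze(0)
    mask = torch.zeros(query_length, kv_length, dtype=dtype).masked_fill(k_pos > q_pos, min_value)

    # Padding part: masked key positions are removed for every query.
    mask = mask[None, None, :, :].expand(batch_size, 1, query_length, kv_length).clone()
    return mask.masked_fill(attention_mask[:, None, None, :] == 0, min_value)
```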
@@ -1434,11 +1406,6 @@ class MotifForCausalLM(MotifPreTrainedModel, GenerationMixin):
         self.multi_token_heads = config.multi_token_heads

         self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
-        else:
-            self.tokenwise_last_layers = nn.ModuleList(
-                [MotifDecoderLayer(config, config.num_hidden_layers - 1) for _ in range(self.multi_token_heads)])
-            self.tokenwise_lm_heads = nn.ModuleList(
-                [nn.Linear(config.hidden_size, config.vocab_size, bias=False) for _ in range(self.multi_token_heads)])

         # Initialize weights and apply final processing
         self.post_init()
@@ -1490,25 +1457,18 @@ class MotifForCausalLM(MotifPreTrainedModel, GenerationMixin):
             Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
             config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
             (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
-
         num_logits_to_keep (`int`, *optional*):
             Calculate logits for the last `num_logits_to_keep` tokens. If `0`, calculate logits for all
             `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that
             token can save memory, which becomes pretty significant for long sequences or large vocabulary size.
-
         Returns:
-
         Example:
-
         ```python
         >>> from transformers import AutoTokenizer, MotifForCausalLM
-
         >>> model = MotifForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS, trust_remote_code = True)
         >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER, trust_remote_code = True)
-
         >>> prompt = "Hey, are you conscious? Can you talk to me?"
         >>> inputs = tokenizer(prompt, return_tensors="pt")
-
         >>> # Generate
         >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
         >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]