IlPakoZ committed on
Commit
3e2a3e1
·
1 Parent(s): 93db9a3

Revert "Fix compatibility with transformers > 5"

Browse files

This reverts commit 93db9a3f000f5aa32e46120d31e59e3f96a0570d.

Files changed (2) hide show
  1. README.md +0 -4
  2. modeling_m5_encoder.py +2 -319
README.md CHANGED
@@ -21,10 +21,6 @@ distance-aware relative position encodings. Two classes are available:
21
  The model is pretrained on multi-task regression tasks, including quantum chemistry (QC) tasks
22
  from the [PubChemQC B3LYP/PM6 dataset](https://nakatamaho.riken.jp/pubchemqc.riken.jp/b3lyp_pm6_datasets.html).
23
 
24
- ## Requirements
25
-
26
- This model was tested and implemented with Transformers version 4.51.3, so issues might appear in other versions.
27
-
28
  ## Usage
29
 
30
  ```python
 
21
  The model is pretrained on multi-task regression tasks, including quantum chemistry (QC) tasks
22
  from the [PubChemQC B3LYP/PM6 dataset](https://nakatamaho.riken.jp/pubchemqc.riken.jp/b3lyp_pm6_datasets.html).
23
 
 
 
 
 
24
  ## Usage
25
 
26
  ```python
modeling_m5_encoder.py CHANGED
@@ -1,5 +1,3 @@
1
- import warnings
2
-
3
  import torch
4
  import numpy as np
5
  import math
@@ -9,15 +7,15 @@ from typing import Any, Optional, Union, Sequence
9
  import torch.nn as nn
10
  from transformers import PreTrainedModel, T5EncoderModel, T5ForConditionalGeneration, T5ForQuestionAnswering, T5ForTokenClassification, T5Model
11
  from torch import nn
12
- from transformers.models.t5.modeling_t5 import T5Attention, T5DenseActDense, T5DenseGatedActDense, T5ClassificationHead, T5LayerNorm, T5Block, T5LayerSelfAttention, T5LayerFF, T5PreTrainedModel
13
  from transformers.cache_utils import DynamicCache, EncoderDecoderCache
14
  from transformers.models.t5.configuration_t5 import T5Config
15
  from transformers.modeling_outputs import BaseModelOutputWithPastAndCrossAttentions, BaseModelOutput
16
  from transformers.utils import DUMMY_INPUTS, DUMMY_MASK, is_torch_fx_proxy, is_torchdynamo_compiling
 
17
  from transformers.utils.deprecation import deprecate_kwarg
18
  from .common import M5Pooler
19
  from .prepare_data import get_positional_encodings_and_align
20
- from transformers.utils.model_parallel_utils import assert_device_map, get_device_map
21
 
22
  logger = logging.getLogger(__name__)
23
 
@@ -274,321 +272,6 @@ class M5EncoderModel(T5EncoderModel):
274
 
275
  return encoder_outputs
276
 
277
- class T5Stack(T5PreTrainedModel):
278
- def __init__(self, config, embed_tokens=None):
279
- super().__init__(config)
280
-
281
- self.embed_tokens = embed_tokens
282
- self.is_decoder = config.is_decoder
283
-
284
- self.block = nn.ModuleList(
285
- [T5Block(config, has_relative_attention_bias=bool(i == 0), layer_idx=i) for i in range(config.num_layers)]
286
- )
287
- self.final_layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
288
- self.dropout = nn.Dropout(config.dropout_rate)
289
-
290
- # Initialize weights and apply final processing
291
- self.post_init()
292
- # Model parallel
293
- self.model_parallel = False
294
- self.device_map = None
295
- self.gradient_checkpointing = False
296
-
297
- def parallelize(self, device_map=None):
298
- warnings.warn(
299
- "`T5Stack.parallelize` is deprecated and will be removed in v5 of Transformers, you should load your model"
300
- " with `device_map='balanced'` in the call to `from_pretrained`. You can also provide your own"
301
- " `device_map` but it needs to be a dictionary module_name to device, so for instance {'block.0': 0,"
302
- " 'block.1': 1, ...}",
303
- FutureWarning,
304
- )
305
- # Check validity of device_map
306
- self.device_map = (
307
- get_device_map(len(self.block), range(torch.cuda.device_count())) if device_map is None else device_map
308
- )
309
- assert_device_map(self.device_map, len(self.block))
310
- self.model_parallel = True
311
- self.first_device = "cpu" if "cpu" in self.device_map.keys() else "cuda:" + str(min(self.device_map.keys()))
312
- self.last_device = "cuda:" + str(max(self.device_map.keys()))
313
- # Load onto devices
314
- for k, v in self.device_map.items():
315
- for layer in v:
316
- cuda_device = "cuda:" + str(k)
317
- self.block[layer] = self.block[layer].to(cuda_device)
318
-
319
- # Set embed_tokens to first layer
320
- self.embed_tokens = self.embed_tokens.to(self.first_device)
321
- # Set final layer norm to last device
322
- self.final_layer_norm = self.final_layer_norm.to(self.last_device)
323
-
324
- def deparallelize(self):
325
- warnings.warn(
326
- "Like `parallelize`, `deparallelize` is deprecated and will be removed in v5 of Transformers.",
327
- FutureWarning,
328
- )
329
- self.model_parallel = False
330
- self.device_map = None
331
- self.first_device = "cpu"
332
- self.last_device = "cpu"
333
- for i in range(len(self.block)):
334
- self.block[i] = self.block[i].to("cpu")
335
- self.embed_tokens = self.embed_tokens.to("cpu")
336
- self.final_layer_norm = self.final_layer_norm.to("cpu")
337
- torch.cuda.empty_cache()
338
-
339
- def get_input_embeddings(self):
340
- return self.embed_tokens
341
-
342
- def set_input_embeddings(self, new_embeddings):
343
- self.embed_tokens = new_embeddings
344
-
345
- def forward(
346
- self,
347
- input_ids=None,
348
- attention_mask=None,
349
- encoder_hidden_states=None,
350
- encoder_attention_mask=None,
351
- inputs_embeds=None,
352
- head_mask=None,
353
- cross_attn_head_mask=None,
354
- past_key_values=None,
355
- use_cache=None,
356
- output_attentions=None,
357
- output_hidden_states=None,
358
- return_dict=None,
359
- cache_position=None,
360
- ):
361
- # Model parallel
362
- if self.model_parallel:
363
- torch.cuda.set_device(self.first_device)
364
- self.embed_tokens = self.embed_tokens.to(self.first_device)
365
- use_cache = use_cache if use_cache is not None else self.config.use_cache
366
- output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
367
- output_hidden_states = (
368
- output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
369
- )
370
- return_dict = return_dict if return_dict is not None else self.config.use_return_dict
371
-
372
- if input_ids is not None and inputs_embeds is not None:
373
- err_msg_prefix = "decoder_" if self.is_decoder else ""
374
- raise ValueError(
375
- f"You cannot specify both {err_msg_prefix}input_ids and {err_msg_prefix}inputs_embeds at the same time"
376
- )
377
- elif input_ids is not None:
378
- input_shape = input_ids.size()
379
- input_ids = input_ids.view(-1, input_shape[-1])
380
- elif inputs_embeds is not None:
381
- input_shape = inputs_embeds.size()[:-1]
382
- else:
383
- err_msg_prefix = "decoder_" if self.is_decoder else ""
384
- raise ValueError(f"You have to specify either {err_msg_prefix}input_ids or {err_msg_prefix}inputs_embeds")
385
-
386
- if self.gradient_checkpointing and self.training:
387
- if use_cache:
388
- logger.warning_once(
389
- "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
390
- )
391
- use_cache = False
392
-
393
- if inputs_embeds is None:
394
- if self.embed_tokens is None:
395
- raise ValueError("You have to initialize the model with valid token embeddings")
396
- inputs_embeds = self.embed_tokens(input_ids)
397
-
398
- batch_size, seq_length = input_shape
399
-
400
- if use_cache is True:
401
- if not self.is_decoder:
402
- raise ValueError(f"`use_cache` can only be set to `True` if {self} is used as a decoder")
403
-
404
- # initialize past_key_values
405
- return_legacy_cache = False
406
- return_self_attention_cache = False
407
- if self.is_decoder and (use_cache or past_key_values is not None):
408
- if isinstance(past_key_values, Cache) and not isinstance(past_key_values, EncoderDecoderCache):
409
- return_self_attention_cache = True
410
- past_key_values = EncoderDecoderCache(past_key_values, DynamicCache())
411
- elif not isinstance(past_key_values, EncoderDecoderCache):
412
- return_legacy_cache = True
413
- logger.warning_once(
414
- "Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. "
415
- "You should pass an instance of `EncoderDecoderCache` instead, e.g. "
416
- "`past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`."
417
- )
418
- past_key_values = EncoderDecoderCache.from_legacy_cache(past_key_values)
419
- elif past_key_values is None:
420
- past_key_values = EncoderDecoderCache(DynamicCache(), DynamicCache())
421
- elif not self.is_decoder:
422
- # do not pass cache object down the line for encoder stack
423
- # it messes indexing later in decoder-stack because cache object is modified in-place
424
- past_key_values = None
425
-
426
- past_key_values_length = past_key_values.get_seq_length() if past_key_values is not None else 0
427
- if cache_position is None:
428
- cache_position = torch.arange(
429
- past_key_values_length, past_key_values_length + seq_length, device=inputs_embeds.device
430
- )
431
-
432
- if attention_mask is None and not is_torchdynamo_compiling():
433
- # required mask seq length can be calculated via length of past cache
434
- mask_seq_length = past_key_values_length + seq_length
435
- attention_mask = torch.ones(batch_size, mask_seq_length, device=inputs_embeds.device)
436
-
437
- if self.config.is_decoder:
438
- causal_mask = self._update_causal_mask(
439
- attention_mask,
440
- inputs_embeds,
441
- cache_position,
442
- past_key_values.self_attention_cache if past_key_values is not None else None,
443
- output_attentions,
444
- )
445
- elif attention_mask is not None:
446
- causal_mask = attention_mask[:, None, None, :]
447
- causal_mask = causal_mask.to(dtype=inputs_embeds.dtype)
448
- causal_mask = (1.0 - causal_mask) * torch.finfo(inputs_embeds.dtype).min
449
- else:
450
- causal_mask = None
451
-
452
- # If a 2D or 3D attention mask is provided for the cross-attention
453
- # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
454
- if self.is_decoder and encoder_hidden_states is not None:
455
- encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
456
- encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
457
- if encoder_attention_mask is None:
458
- encoder_attention_mask = torch.ones(
459
- encoder_hidden_shape, device=inputs_embeds.device, dtype=torch.long
460
- )
461
- encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
462
- else:
463
- encoder_extended_attention_mask = None
464
-
465
- # Prepare head mask if needed
466
- head_mask = self.get_head_mask(head_mask, self.config.num_layers)
467
- cross_attn_head_mask = self.get_head_mask(cross_attn_head_mask, self.config.num_layers)
468
- all_hidden_states = () if output_hidden_states else None
469
- all_attentions = () if output_attentions else None
470
- all_cross_attentions = () if (output_attentions and self.is_decoder) else None
471
- position_bias = None
472
- encoder_decoder_position_bias = None
473
-
474
- hidden_states = self.dropout(inputs_embeds)
475
-
476
- for i, layer_module in enumerate(self.block):
477
- layer_head_mask = head_mask[i]
478
- cross_attn_layer_head_mask = cross_attn_head_mask[i]
479
- # Model parallel
480
- if self.model_parallel:
481
- torch.cuda.set_device(hidden_states.device)
482
- # Ensure that attention_mask is always on the same device as hidden_states
483
- if causal_mask is not None:
484
- causal_mask = causal_mask.to(hidden_states.device)
485
- if position_bias is not None:
486
- position_bias = position_bias.to(hidden_states.device)
487
- if encoder_hidden_states is not None:
488
- encoder_hidden_states = encoder_hidden_states.to(hidden_states.device)
489
- if encoder_extended_attention_mask is not None:
490
- encoder_extended_attention_mask = encoder_extended_attention_mask.to(hidden_states.device)
491
- if encoder_decoder_position_bias is not None:
492
- encoder_decoder_position_bias = encoder_decoder_position_bias.to(hidden_states.device)
493
- if layer_head_mask is not None:
494
- layer_head_mask = layer_head_mask.to(hidden_states.device)
495
- if cross_attn_layer_head_mask is not None:
496
- cross_attn_layer_head_mask = cross_attn_layer_head_mask.to(hidden_states.device)
497
- if output_hidden_states:
498
- all_hidden_states = all_hidden_states + (hidden_states,)
499
-
500
- if self.gradient_checkpointing and self.training:
501
- layer_outputs = self._gradient_checkpointing_func(
502
- layer_module.forward,
503
- hidden_states,
504
- causal_mask,
505
- position_bias,
506
- encoder_hidden_states,
507
- encoder_extended_attention_mask,
508
- encoder_decoder_position_bias,
509
- layer_head_mask,
510
- cross_attn_layer_head_mask,
511
- None, # past_key_value is always None with gradient checkpointing
512
- use_cache,
513
- output_attentions,
514
- return_dict,
515
- cache_position,
516
- )
517
- else:
518
- layer_outputs = layer_module(
519
- hidden_states,
520
- attention_mask=causal_mask,
521
- position_bias=position_bias,
522
- encoder_hidden_states=encoder_hidden_states,
523
- encoder_attention_mask=encoder_extended_attention_mask,
524
- encoder_decoder_position_bias=encoder_decoder_position_bias,
525
- layer_head_mask=layer_head_mask,
526
- cross_attn_layer_head_mask=cross_attn_layer_head_mask,
527
- past_key_value=past_key_values,
528
- use_cache=use_cache,
529
- output_attentions=output_attentions,
530
- return_dict=return_dict,
531
- cache_position=cache_position,
532
- )
533
-
534
- # layer_outputs is a tuple with:
535
- # hidden-states, key-value-states, (self-attention position bias), (self-attention weights), (cross-attention position bias), (cross-attention weights)
536
- if use_cache is False:
537
- layer_outputs = layer_outputs[:1] + (None,) + layer_outputs[1:]
538
-
539
- hidden_states, next_decoder_cache = layer_outputs[:2]
540
-
541
- # We share the position biases between the layers - the first layer store them
542
- # layer_outputs = hidden-states, key-value-states (self-attention position bias), (self-attention weights),
543
- # (cross-attention position bias), (cross-attention weights)
544
- position_bias = layer_outputs[2]
545
- if self.is_decoder and encoder_hidden_states is not None:
546
- encoder_decoder_position_bias = layer_outputs[4 if output_attentions else 3]
547
-
548
- if output_attentions:
549
- all_attentions = all_attentions + (layer_outputs[3],)
550
- if self.is_decoder:
551
- all_cross_attentions = all_cross_attentions + (layer_outputs[5],)
552
-
553
- # Model Parallel: If it's the last layer for that device, put things on the next device
554
- if self.model_parallel:
555
- for k, v in self.device_map.items():
556
- if i == v[-1] and "cuda:" + str(k) != self.last_device:
557
- hidden_states = hidden_states.to("cuda:" + str(k + 1))
558
-
559
- hidden_states = self.final_layer_norm(hidden_states)
560
- hidden_states = self.dropout(hidden_states)
561
-
562
- # Add last layer
563
- if output_hidden_states:
564
- all_hidden_states = all_hidden_states + (hidden_states,)
565
-
566
- next_cache = next_decoder_cache if use_cache else None
567
- if return_self_attention_cache:
568
- next_cache = past_key_values.self_attention_cache
569
- if return_legacy_cache:
570
- next_cache = past_key_values.to_legacy_cache()
571
-
572
- if not return_dict:
573
- return tuple(
574
- v
575
- for v in [
576
- hidden_states,
577
- next_cache,
578
- all_hidden_states,
579
- all_attentions,
580
- all_cross_attentions,
581
- ]
582
- if v is not None
583
- )
584
- return BaseModelOutputWithPastAndCrossAttentions(
585
- last_hidden_state=hidden_states,
586
- past_key_values=next_cache,
587
- hidden_states=all_hidden_states,
588
- attentions=all_attentions,
589
- cross_attentions=all_cross_attentions,
590
- )
591
-
592
  class M5Stack(T5Stack):
593
  def __init__(self, config, embed_tokens=None):
594
  super().__init__(config, embed_tokens)
 
 
 
1
  import torch
2
  import numpy as np
3
  import math
 
7
  import torch.nn as nn
8
  from transformers import PreTrainedModel, T5EncoderModel, T5ForConditionalGeneration, T5ForQuestionAnswering, T5ForTokenClassification, T5Model
9
  from torch import nn
10
+ from transformers.models.t5.modeling_t5 import T5Attention, T5DenseActDense, T5DenseGatedActDense, T5ClassificationHead, T5LayerNorm, T5Stack, T5Block, T5LayerSelfAttention, T5LayerFF
11
  from transformers.cache_utils import DynamicCache, EncoderDecoderCache
12
  from transformers.models.t5.configuration_t5 import T5Config
13
  from transformers.modeling_outputs import BaseModelOutputWithPastAndCrossAttentions, BaseModelOutput
14
  from transformers.utils import DUMMY_INPUTS, DUMMY_MASK, is_torch_fx_proxy, is_torchdynamo_compiling
15
+ from transformers.modeling_outputs import BaseModelOutputWithPastAndCrossAttentions, BaseModelOutput
16
  from transformers.utils.deprecation import deprecate_kwarg
17
  from .common import M5Pooler
18
  from .prepare_data import get_positional_encodings_and_align
 
19
 
20
  logger = logging.getLogger(__name__)
21
 
 
272
 
273
  return encoder_outputs
274
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
275
  class M5Stack(T5Stack):
276
  def __init__(self, config, embed_tokens=None):
277
  super().__init__(config, embed_tokens)