Adorg committed on
Commit
69d1977
·
1 Parent(s): 9e3186d

Upload modelling_medicap.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. modelling_medicap.py +303 -0
modelling_medicap.py ADDED
@@ -0,0 +1,303 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
from dataclasses import dataclass
from typing import Any, List, Optional, Tuple, Union

import torch
import transformers
from torch.nn import CrossEntropyLoss
from transformers import PreTrainedTokenizerFast, VisionEncoderDecoderModel
from transformers.configuration_utils import PretrainedConfig
from transformers.modeling_outputs import BaseModelOutput, Seq2SeqLMOutput
from transformers.modeling_utils import PreTrainedModel
from transformers.models.vision_encoder_decoder.configuration_vision_encoder_decoder import \
    VisionEncoderDecoderConfig
from transformers.utils import logging
14
+
15
+ logger = logging.get_logger(__name__)
16
+
17
+
18
class CvtWithProjectionHeadConfig(transformers.CvtConfig):
    """CvT configuration that additionally records the projection head size.

    Args:
        projection_size: Output feature size used by the projection head that
            is built from this configuration. Stored unchanged on the
            instance; defaults to ``None``.
        **kwargs: Forwarded untouched to ``transformers.CvtConfig``.
    """

    def __init__(self, projection_size: Optional[int] = None, **kwargs: Any) -> None:
        super().__init__(**kwargs)
        self.projection_size = projection_size
22
+
23
+
24
# ModelOutput subclasses are expected to be dataclasses; recent transformers
# releases raise a TypeError when an undecorated subclass is instantiated.
@dataclass
class ModelOutputWithProjectionEmbedding(transformers.modeling_outputs.ModelOutput):
    """Encoder output that carries the projected visual features.

    Attributes:
        last_hidden_state: Output of the projection head. The ``None`` default
            mirrors the convention of the output classes shipped with
            transformers.
    """

    last_hidden_state: torch.FloatTensor = None
26
+
27
+
28
class CvtProjectionHead(torch.nn.Module):
    """Layer normalisation followed by a bias-free linear projection.

    Both layers operate on the last dimension of the input, whose size is
    ``config.embed_dim[-1]``; the projection maps it to
    ``config.projection_size``.
    """

    def __init__(self, config) -> None:
        super().__init__()

        # Width of the final CvT stage; shared by both layers below.
        in_features = config.embed_dim[-1]

        # Reference:
        # https://github.com/huggingface/transformers/blob/68287689f2f0d8b7063c400230b3766987abf18d/src/transformers/models/cvt/modeling_cvt.py#L657
        self.layer_norm = torch.nn.LayerNorm(in_features, eps=config.layer_norm_eps)

        # The projection has no bias of its own because it directly follows a
        # layer normalisation that already has one.
        self.projection = torch.nn.Linear(in_features, config.projection_size, bias=False)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Normalise ``x`` over its last dimension, then project it."""
        return self.projection(self.layer_norm(x))
44
+
45
+
46
class CvtWithProjectionHead(transformers.CvtPreTrainedModel):
    """CvT backbone (built without its pooling layer) plus a projection head.

    The backbone's first output is flattened from dimension 2 onwards, its
    last two axes are swapped, and the result is passed through
    ``CvtProjectionHead``.
    """

    def __init__(self, config):
        super().__init__(config)

        self.cvt = transformers.CvtModel(config, add_pooling_layer=False)
        self.projection_head = CvtProjectionHead(config)

        # Initialize weights and apply final processing:
        self.post_init()

    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, ModelOutputWithProjectionEmbedding]:
        """Encode ``pixel_values`` and project the resulting features.

        Args:
            pixel_values: Image batch handed to the CvT backbone.
            output_hidden_states: Forwarded to the backbone.
            return_dict: Whether to return a ``ModelOutputWithProjectionEmbedding``
                instead of a tuple; falls back to ``self.config.use_return_dict``
                when ``None``.

        Returns:
            ``ModelOutputWithProjectionEmbedding`` whose ``last_hidden_state``
            is the projection, or the one-element tuple ``(projection,)`` when
            ``return_dict`` is false.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.cvt(
            pixel_values,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # Positional access works for both return types of the backbone; the
        # attribute form fails on the plain tuple returned when
        # return_dict is false.
        feature_map = outputs[0]

        # Collapse every dimension from index 2 onwards into one axis and move
        # it in front of the feature axis (assumes a (batch, channels, height,
        # width) feature map -- TODO confirm against CvtModel).
        projection = self.projection_head(
            torch.permute(torch.flatten(feature_map, 2), [0, 2, 1]),
        )

        if not return_dict:
            # A tuple keeps `encoder_outputs[0]` meaningful for callers and
            # matches the annotated return type.
            return (projection,)

        return ModelOutputWithProjectionEmbedding(
            last_hidden_state=projection,
        )
81
+
82
+
83
class MedICapEncoderDecoderModel(VisionEncoderDecoderModel):
    """Vision encoder-decoder that conditions a decoder-only language model on
    image features by prepending them to the token embeddings.

    No cross-attention is used: the encoder's first output is concatenated in
    front of the decoder input embeddings along the sequence dimension.
    """

    config_class = VisionEncoderDecoderConfig
    base_model_prefix = "vision_encoder_decoder"
    main_input_name = "pixel_values"
    supports_gradient_checkpointing = True

    def __init__(
        self,
        config: Optional[PretrainedConfig] = None,
        encoder: Optional[PreTrainedModel] = None,
        decoder: Optional[PreTrainedModel] = None,
    ):
        """Build the model from a configuration and/or ready-made sub-models.

        Args:
            config: Shared ``VisionEncoderDecoderConfig``. When ``None`` it is
                derived from the configurations of ``encoder`` and ``decoder``.
            encoder: Optional encoder; a ``CvtWithProjectionHead`` is created
                from ``config.encoder`` when omitted.
            decoder: Optional decoder; a ``transformers.GPT2LMHeadModel`` is
                created from ``config.decoder`` when omitted.

        Raises:
            ValueError: If neither a configuration nor both sub-models are
                given, or if ``config`` is not of type ``config_class``.
        """

        # Explicit None check: truthiness of a module is not a reliable
        # "was it provided" test.
        if decoder is not None:
            assert not decoder.config.add_cross_attention, '"add_cross_attention" must be False for the given decoder'
            assert decoder.config.is_decoder, '"is_decoder" must be True for the given decoder'

        if config is None and (encoder is None or decoder is None):
            raise ValueError("Either a configuration or an encoder and a decoder has to be provided.")
        if config is None:
            config = VisionEncoderDecoderConfig.from_encoder_decoder_configs(encoder.config, decoder.config)
        else:
            if not isinstance(config, self.config_class):
                raise ValueError(f"Config: {config} has to be of type {self.config_class}")

        config.tie_word_embeddings = False

        # initialize with config
        PreTrainedModel.__init__(self, config)

        # Encoder:
        if encoder is None:
            encoder = CvtWithProjectionHead(config=config.encoder)

        # Decoder:
        if decoder is None:
            decoder = transformers.GPT2LMHeadModel(config=config.decoder)

        self.encoder = encoder
        self.decoder = decoder

        if self.encoder.config.to_dict() != self.config.encoder.to_dict():
            logger.warning(
                f"Config of the encoder: {self.encoder.__class__} is overwritten by shared encoder config:"
                f" {self.config.encoder}"
            )
        if self.decoder.config.to_dict() != self.config.decoder.to_dict():
            logger.warning(
                f"Config of the decoder: {self.decoder.__class__} is overwritten by shared decoder config:"
                f" {self.config.decoder}"
            )

        # Both sub-models share the configuration objects held by self.config.
        self.encoder.config = self.config.encoder
        self.decoder.config = self.config.decoder

    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.BoolTensor] = None,
        encoder_outputs: Optional[Tuple[torch.FloatTensor]] = None,
        past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        **kwargs,
    ) -> Union[Tuple[torch.FloatTensor], Seq2SeqLMOutput]:
        """Run the encoder (if needed) and the decoder, optionally with a loss.

        When ``encoder_outputs`` is ``None`` the encoder is run on
        ``pixel_values`` and its first output is prepended to the decoder
        input embeddings; a given ``decoder_attention_mask`` is extended with
        ones for those prepended positions. When ``encoder_outputs`` is
        supplied nothing is prepended.

        Args:
            pixel_values: Images for the encoder; required when
                ``encoder_outputs`` is ``None``.
            decoder_input_ids: Token ids, embedded with the decoder's ``wte``
                table when ``decoder_inputs_embeds`` is ``None``.
            decoder_attention_mask: Mask for the decoder tokens.
            encoder_outputs: Pre-computed encoder outputs.
            past_key_values: Decoder cache, passed through to the decoder.
            decoder_inputs_embeds: Pre-computed decoder input embeddings.
            labels: Target token ids for the cross-entropy loss. If shorter
                than the logits along the sequence dimension, the loss is
                computed on the trailing logit positions only.
            use_cache, output_attentions, output_hidden_states: Passed to the
                decoder (``output_hidden_states`` also to the encoder).
            return_dict: Return a ``Seq2SeqLMOutput`` instead of a tuple;
                defaults to ``self.config.use_return_dict``.
            **kwargs: Arguments prefixed with ``decoder_`` go to the decoder
                (prefix stripped); all others go to the encoder.

        Raises:
            ValueError: If the encoder has to run but ``pixel_values`` is
                ``None``, or if neither ``decoder_input_ids`` nor
                ``decoder_inputs_embeds`` is given.
        """

        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        kwargs_encoder = {argument: value for argument, value in kwargs.items() if not argument.startswith("decoder_")}

        kwargs_decoder = {
            argument[len("decoder_"):]: value for argument, value in kwargs.items() if argument.startswith("decoder_")
        }

        if decoder_inputs_embeds is None:
            if decoder_input_ids is None:
                # Fail with a clear message instead of an opaque error from
                # the embedding lookup.
                raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
            decoder_inputs_embeds = self.decoder.transformer.wte(decoder_input_ids)

        if encoder_outputs is None:
            if pixel_values is None:
                raise ValueError("You have to specify pixel_values")

            encoder_outputs = self.encoder(
                pixel_values,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
                **kwargs_encoder,
            )  # CvT does not support output_attentions.

            # Prepend the visual features to the token embeddings along the
            # sequence dimension.
            decoder_inputs_embeds = torch.cat([encoder_outputs[0], decoder_inputs_embeds], dim=1)
            if decoder_attention_mask is not None:
                # Every prepended visual position is attended to. The prefix
                # is created on the mask's own device so the concatenation
                # cannot fail when the model's device differs from it.
                decoder_attention_mask = torch.cat(
                    [
                        torch.ones(
                            encoder_outputs[0].shape[:-1],
                            dtype=decoder_attention_mask.dtype,
                            device=decoder_attention_mask.device,
                        ),
                        decoder_attention_mask,
                    ],
                    dim=1,
                )

        decoder_outputs = self.decoder(
            attention_mask=decoder_attention_mask,
            inputs_embeds=decoder_inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            use_cache=use_cache,
            past_key_values=past_key_values,
            return_dict=return_dict,
            **kwargs_decoder,
        )

        # Loss:
        loss = None
        if labels is not None:
            logits = decoder_outputs.logits if return_dict else decoder_outputs[0]

            # The logits also cover the prepended visual positions, which have
            # no labels. Drop the leading surplus so logits and labels have
            # the same sequence length; nothing changes when they already do.
            num_unlabelled = logits.shape[1] - labels.shape[1]
            if num_unlabelled > 0:
                logits = logits[:, num_unlabelled:]

            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits.reshape(-1, self.decoder.config.vocab_size), labels.reshape(-1))

        if not return_dict:
            if loss is not None:
                return (loss,) + decoder_outputs + encoder_outputs
            else:
                return decoder_outputs + encoder_outputs

        return Seq2SeqLMOutput(
            loss=loss,
            logits=decoder_outputs.logits,
            past_key_values=decoder_outputs.past_key_values,
            decoder_hidden_states=decoder_outputs.hidden_states,
            decoder_attentions=decoder_outputs.attentions,
            cross_attentions=decoder_outputs.cross_attentions,
            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
        )

    def prepare_inputs_for_generation(
        self,
        input_ids,
        past_key_values=None,
        attention_mask=None,
        use_cache=None,
        encoder_outputs=None,
        **kwargs,
    ):
        """
        Modification of:
            https://github.com/huggingface/transformers/blob/main/src/transformers/models/encoder_decoder/modeling_encoder_decoder.py#L660

        This can help with managing input_embeds and input_ids:
            https://github.com/huggingface/transformers/issues/6535

        Without a cache, the decoder's own ``prepare_inputs_for_generation`` is
        given the encoder's first output as ``inputs_embeds`` and the returned
        embeddings become ``decoder_inputs_embeds``. With a cache, the returned
        ``input_ids`` become ``decoder_input_ids``.

        NOTE(review): on the cache-less first step only ``inputs_embeds`` is
        read back, so ``input_ids`` are presumably not embedded for that step
        -- confirm this matches how the model is trained.
        """
        input_dict = {'use_cache': use_cache, 'encoder_outputs': encoder_outputs, 'attention_mask': attention_mask}

        if past_key_values is None:
            decoder_inputs = self.decoder.prepare_inputs_for_generation(
                input_ids, inputs_embeds=encoder_outputs[0], past_key_values=past_key_values,
            )
            input_dict['decoder_inputs_embeds'] = decoder_inputs['inputs_embeds']
        else:
            decoder_inputs = self.decoder.prepare_inputs_for_generation(
                input_ids, past_key_values=past_key_values,
            )
            input_dict['decoder_input_ids'] = decoder_inputs['input_ids']
        input_dict['past_key_values'] = decoder_inputs['past_key_values']
        input_dict['decoder_attention_mask'] = decoder_inputs['attention_mask'] if 'attention_mask' in decoder_inputs else None

        return input_dict

    def tokenize_captions_teacher_forcing(
        self,
        captions: Union[str, List[str]],
        tokenizer: PreTrainedTokenizerFast,
        max_len: int,
    ):
        """
        Tokenizes the captions and creates the inputs and targets for teacher forcing.

        Argument/s:
            captions - the captions; a single string is treated as one caption.
            tokenizer - Hugging Face tokenizer.
            max_len - maximum number of tokens.

        Returns:
            batch_dict = {
                label_ids - the label token identifiers for the decoder.
                decoder_input_ids - the token identifiers for the input of the decoder.
                decoder_attention_mask - the attention mask for the decoder_input_ids.
            }
        """

        # A bare string would otherwise be iterated character by character:
        if isinstance(captions, str):
            captions = [captions]

        # Prepare the caption for the tokenizer by placing the special tokens:
        caption = [f'{tokenizer.bos_token}{i}{tokenizer.eos_token}' for i in captions]

        # Tokenize the caption:
        tokenized = tokenizer(
            caption,
            padding='longest',
            truncation=True,
            max_length=max_len + 1,  # +1 to account for the shift between input and target.
            return_tensors='pt',
            return_token_type_ids=False,
            add_special_tokens=False,  # The special tokens were already placed around each caption above.
        ).to(self.device)

        # Modify for language modelling:
        batch_dict = {

            # Labels for the decoder (the token ids without the first token, i.e. each label is the next token):
            'label_ids': tokenized['input_ids'][:, 1:].detach().clone(),

            # Remove last token identifier to match the sequence length of the labels:
            'decoder_input_ids': tokenized['input_ids'][:, :-1],

            # Attention mask for the decoder_input_ids (remove first token so that the eos_token_id is not considered):
            'decoder_attention_mask': tokenized['attention_mask'][:, 1:],
        }

        return batch_dict