Leonardo6 commited on
Commit
7cd3bbb
·
verified ·
1 Parent(s): 76c519d

Update modeling_vlm.py

Browse files
Files changed (1) hide show
  1. modeling_vlm.py +30 -5
modeling_vlm.py CHANGED
@@ -3,7 +3,7 @@ from typing import Any, override
3
  import torch
4
  import torch.nn as nn
5
  from torch import FloatTensor, LongTensor, Tensor
6
- from transformers import AutoModel, PreTrainedModel, LlamaForCausalLM, LlamaModel
7
 
8
  from .configuration_vlm import VLMConfig
9
  from .connectors import Connector, connector_map
@@ -11,6 +11,7 @@ from .connectors import Connector, connector_map
11
 
12
  class VLM(LlamaModel):
13
  config_class = VLMConfig
 
14
  @override
15
  def __init__(self, config):
16
  super().__init__(config)
@@ -41,6 +42,7 @@ class VLM(LlamaModel):
41
 
42
  class VLMForCausalLM(LlamaForCausalLM):
43
  config_class = VLMConfig
 
44
  @override
45
  def __init__(self, config):
46
  super().__init__(config)
@@ -132,7 +134,7 @@ class VLMForCausalLM(LlamaForCausalLM):
132
  ):
133
  images = kwargs.pop("images", None)
134
  image_sizes = kwargs.pop("image_sizes", None)
135
- inputs = super(self.__class__, self).prepare_inputs_for_generation(
136
  input_ids,
137
  past_key_values=past_key_values,
138
  inputs_embeds=inputs_embeds,
@@ -145,8 +147,31 @@ class VLMForCausalLM(LlamaForCausalLM):
145
  inputs["image_sizes"] = image_sizes
146
  return inputs
147
 
148
- def encode_images(self: Any, images: Tensor) -> tuple[Tensor, ...]:
149
- image_features = self.model.vision_model(images)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
150
  image_features = self.model.connector(image_features)
151
 
152
  return image_features
@@ -398,4 +423,4 @@ class VLMForCausalLM(LlamaForCausalLM):
398
  return None, position_ids, attention_mask, past_key_values, new_input_embeds, new_labels
399
 
400
 
401
- AutoModel.register(VLMConfig, VLMForCausalLM)
 
3
  import torch
4
  import torch.nn as nn
5
  from torch import FloatTensor, LongTensor, Tensor
6
+ from transformers import AutoModel, LlamaForCausalLM, LlamaModel, PreTrainedModel
7
 
8
  from .configuration_vlm import VLMConfig
9
  from .connectors import Connector, connector_map
 
11
 
12
  class VLM(LlamaModel):
13
  config_class = VLMConfig
14
+
15
  @override
16
  def __init__(self, config):
17
  super().__init__(config)
 
42
 
43
  class VLMForCausalLM(LlamaForCausalLM):
44
  config_class = VLMConfig
45
+
46
  @override
47
  def __init__(self, config):
48
  super().__init__(config)
 
134
  ):
135
  images = kwargs.pop("images", None)
136
  image_sizes = kwargs.pop("image_sizes", None)
137
+ inputs = super().prepare_inputs_for_generation(
138
  input_ids,
139
  past_key_values=past_key_values,
140
  inputs_embeds=inputs_embeds,
 
147
  inputs["image_sizes"] = image_sizes
148
  return inputs
149
 
150
+ def encode_images(self: Any, images: list[Tensor] | Tensor) -> list[Tensor] | Tensor:
151
+ if type(images) is list:
152
+ image_features: list[Tensor] | Tensor = []
153
+ for image in images:
154
+ outputs = self.model.vision_model(
155
+ image.unsqueeze(0),
156
+ output_hidden_states=True,
157
+ )
158
+ hidden_states: Tensor = outputs.hidden_states[self.output_layer].to(image.dtype)
159
+ if not self.config.vision_config.use_cls_token:
160
+ image_features.append(hidden_states[:, 1:])
161
+ else:
162
+ image_features.append(hidden_states)
163
+ else:
164
+ outputs = self.model.vision_model(
165
+ images,
166
+ output_hidden_states=True,
167
+ )
168
+ hidden_states = outputs.hidden_states[self.config.vision_config.output_layer].to(
169
+ images.dtype
170
+ )
171
+ if not self.config.vision_config.use_cls_token:
172
+ image_features = hidden_states[:, 1:]
173
+ else:
174
+ image_features = hidden_states
175
  image_features = self.model.connector(image_features)
176
 
177
  return image_features
 
423
  return None, position_ids, attention_mask, past_key_values, new_input_embeds, new_labels
424
 
425
 
426
# NOTE(review): this registers the causal-LM *head* model under the generic
# AutoModel factory, so AutoModel.from_pretrained would return the full LM —
# confirm AutoModelForCausalLM.register(VLMConfig, VLMForCausalLM) was not
# intended instead (and whether VLM should be the AutoModel registration).
+ AutoModel.register(VLMConfig, VLMForCausalLM)