Update modeling_vlm.py
modeling_vlm.py  CHANGED  (+30 −5)
```diff
@@ -3,7 +3,7 @@ from typing import Any, override
 import torch
 import torch.nn as nn
 from torch import FloatTensor, LongTensor, Tensor
-from transformers import AutoModel,
+from transformers import AutoModel, LlamaForCausalLM, LlamaModel, PreTrainedModel
 
 from .configuration_vlm import VLMConfig
 from .connectors import Connector, connector_map
@@ -11,6 +11,7 @@ from .connectors import Connector, connector_map
 
 class VLM(LlamaModel):
     config_class = VLMConfig
+
     @override
     def __init__(self, config):
         super().__init__(config)
@@ -41,6 +42,7 @@
 
 class VLMForCausalLM(LlamaForCausalLM):
     config_class = VLMConfig
+
     @override
     def __init__(self, config):
         super().__init__(config)
@@ -132,7 +134,7 @@
     ):
         images = kwargs.pop("images", None)
         image_sizes = kwargs.pop("image_sizes", None)
-        inputs = super(
+        inputs = super().prepare_inputs_for_generation(
             input_ids,
             past_key_values=past_key_values,
             inputs_embeds=inputs_embeds,
```
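The `prepare_inputs_for_generation` fix above follows the usual pattern for threading extra modalities through `generate()`: pop the custom kwargs so the parent never sees them, delegate to the parent implementation, then re-attach them to the returned dict. A minimal self-contained sketch of that pattern (the base class below is a stand-in for the real transformers generation mixin, not its actual implementation):

```python
from typing import Any


class GenerationBase:
    """Stand-in for the parent's prepare_inputs_for_generation (assumption)."""

    def prepare_inputs_for_generation(self, input_ids, **kwargs: Any) -> dict:
        return {"input_ids": input_ids, **kwargs}


class VLMSketch(GenerationBase):
    def prepare_inputs_for_generation(self, input_ids, **kwargs: Any) -> dict:
        # Pop the vision kwargs so the parent implementation never sees them.
        images = kwargs.pop("images", None)
        image_sizes = kwargs.pop("image_sizes", None)
        # Delegate the text-side bookkeeping to the parent class.
        inputs = super().prepare_inputs_for_generation(input_ids, **kwargs)
        # Re-attach the vision kwargs so generate() forwards them to forward().
        if images is not None:
            inputs["images"] = images
            inputs["image_sizes"] = image_sizes
        return inputs
```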
```diff
@@ -145,8 +147,31 @@
         inputs["image_sizes"] = image_sizes
         return inputs
 
-    def encode_images(self: Any, images: Tensor) ->
-
+    def encode_images(self: Any, images: list[Tensor] | Tensor) -> list[Tensor] | Tensor:
+        if type(images) is list:
+            image_features: list[Tensor] | Tensor = []
+            for image in images:
+                outputs = self.model.vision_model(
+                    image.unsqueeze(0),
+                    output_hidden_states=True,
+                )
+                hidden_states: Tensor = outputs.hidden_states[self.output_layer].to(image.dtype)
+                if not self.config.vision_config.use_cls_token:
+                    image_features.append(hidden_states[:, 1:])
+                else:
+                    image_features.append(hidden_states)
+        else:
+            outputs = self.model.vision_model(
+                images,
+                output_hidden_states=True,
+            )
+            hidden_states = outputs.hidden_states[self.config.vision_config.output_layer].to(
+                images.dtype
+            )
+            if not self.config.vision_config.use_cls_token:
+                image_features = hidden_states[:, 1:]
+            else:
+                image_features = hidden_states
         image_features = self.model.connector(image_features)
 
         return image_features
```
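The new `encode_images` dispatches on input type: a list is treated as variably sized images and encoded one at a time (each unsqueezed to a batch of one), while a plain tensor is assumed pre-batched and encoded in a single vision forward pass; both branches drop the leading CLS token when `use_cls_token` is false, then project the features through the connector. (Note the two branches read the layer index from different places: `self.output_layer` versus `self.config.vision_config.output_layer`.) A toy restatement of the dispatch with a fake encoder, all shapes purely illustrative:

```python
import torch
from torch import Tensor


def fake_vision_model(x: Tensor) -> Tensor:
    # Stand-in for self.model.vision_model: [B, C, H, W] -> [B, 1 + patches, dim].
    return torch.randn(x.shape[0], 1 + 16, 32)


def encode_images_sketch(
    images: list[Tensor] | Tensor, use_cls_token: bool = False
) -> list[Tensor] | Tensor:
    def strip_cls(h: Tensor) -> Tensor:
        # Drop the leading CLS token unless the config keeps it.
        return h if use_cls_token else h[:, 1:]

    if type(images) is list:
        # Variably sized images: encode each one as a batch of one.
        return [strip_cls(fake_vision_model(img.unsqueeze(0))) for img in images]
    # Pre-batched tensor: one forward pass for the whole batch.
    return strip_cls(fake_vision_model(images))


feats = encode_images_sketch([torch.randn(3, 336, 336), torch.randn(3, 448, 448)])
print([f.shape for f in feats])  # [torch.Size([1, 16, 32]), torch.Size([1, 16, 32])]
```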
```diff
@@ -398,4 +423,4 @@
         return None, position_ids, attention_mask, past_key_values, new_input_embeds, new_labels
 
 
-AutoModel.register(VLMConfig, VLMForCausalLM)
+AutoModel.register(VLMConfig, VLMForCausalLM)
```
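Finally, the module-level registration ties `VLMConfig` to `VLMForCausalLM` so the Auto classes can resolve the custom model. A sketch of the effect, assuming the package-relative imports from the diff (the `modeling_vlm` path is hypothetical); registering under `AutoModel` rather than `AutoModelForCausalLM` means plain `AutoModel` lookups resolve to the causal-LM class:

```python
from transformers import AutoModel

from .configuration_vlm import VLMConfig   # package-relative, as in the diff
from .modeling_vlm import VLMForCausalLM   # hypothetical module path

AutoModel.register(VLMConfig, VLMForCausalLM)

# After registration, a config instance is enough to build the right class.
config = VLMConfig()                     # assumes workable defaults
model = AutoModel.from_config(config)    # resolves to VLMForCausalLM
assert isinstance(model, VLMForCausalLM)
```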