SafaaAI committed · Commit 7d8f0c7 (verified) · 1 Parent(s): 4b3ef56

Upload 2 files

Files changed (2):
  1. configuration.py +116 -0
  2. modeling_tinyllava_phi.py +624 -0
configuration.py ADDED
@@ -0,0 +1,116 @@
+from transformers import PretrainedConfig
+from transformers import CONFIG_MAPPING
+from transformers import AutoConfig
+
+IGNORE_INDEX = -100
+IMAGE_TOKEN_INDEX = -200
+DEFAULT_IMAGE_TOKEN = "<image>"
+
+
+class TinyLlavaConfig(PretrainedConfig):
+
+    model_type = "tinyllava"
+    def __init__(
+        self,
+        llm_model_name_or_path = '',
+        tokenizer_name_or_path = None,
+        vision_model_name_or_path = '',
+        vision_model_name_or_path2 = '',
+        connector_type = None,
+        text_config=None,
+        hidden_size=2048,
+        vocab_size=32000,
+        ignore_index=-100,
+        image_token_index=32000,
+        pad_token = None,
+        pad_token_id = None,
+        tokenizer_padding_side = 'right',
+        tokenizer_model_max_length = 2048,
+        vision_config = None,
+        vision_hidden_size = None,
+        vision_feature_layer = -2,
+        vision_feature_select_strategy = 'patch',
+        image_aspect_ratio = 'square',
+        resampler_hidden_size = None,
+        num_queries = None,
+        num_resampler_layers = None,
+        use_cache = False,
+        cache_dir = None,
+        tokenizer_use_fast = False,
+        tune_type_llm = 'frozen',
+        tune_type_connector = 'frozen',
+        tune_type_vision_tower = 'frozen',
+        tune_vision_tower_from_layer = -1,
+        **kwargs
+    ):
+        self.llm_model_name_or_path = llm_model_name_or_path
+        self.tokenizer_name_or_path = tokenizer_name_or_path or self.llm_model_name_or_path
+        self.vision_model_name_or_path = vision_model_name_or_path
+        self.vision_model_name_or_path2 = vision_model_name_or_path2
+        self.connector_type = connector_type
+        self.tune_type_llm = tune_type_llm
+        self.tune_type_connector = tune_type_connector
+        self.tune_type_vision_tower = tune_type_vision_tower
+        self.tune_vision_tower_from_layer = tune_vision_tower_from_layer
+
+        self.ignore_index = IGNORE_INDEX
+        self.image_token_index = IMAGE_TOKEN_INDEX
+        self.pad_token = pad_token
+        self.pad_token_id = pad_token_id
+        self.tokenizer_padding_side = tokenizer_padding_side
+        self.tokenizer_model_max_length = tokenizer_model_max_length
+        self.vision_feature_layer = vision_feature_layer
+        self.vision_feature_select_strategy = vision_feature_select_strategy
+        self.image_aspect_ratio = image_aspect_ratio
+        self.resampler_hidden_size = resampler_hidden_size
+        self.num_queries = num_queries
+        self.num_resampler_layers = num_resampler_layers
+        self.use_cache = use_cache
+        self.cache_dir = cache_dir
+        self.tokenizer_use_fast = tokenizer_use_fast
+        self._load_text_config(text_config)
+        self._load_vision_config(vision_config)
+
+        super().__init__(**kwargs)
+
+
+    def _load_text_config(self, text_config=None):
+        if self.llm_model_name_or_path is None or self.llm_model_name_or_path == '':
+            self.text_config = CONFIG_MAPPING['llama']()
+
+        else:
+            self.text_config = AutoConfig.from_pretrained(self.llm_model_name_or_path, trust_remote_code=True)
+            if text_config is not None:
+                self.text_config = self.text_config.from_dict(text_config)
+
+        self.hidden_size = getattr(self.text_config, 'hidden_size', getattr(self.text_config, 'model_dim', None))
+        self.vocab_size = getattr(self.text_config, 'vocab_size', None)
+
+
+    def _load_vision_config(self, vision_config=None):
+        if self.vision_model_name_or_path is None or self.vision_model_name_or_path == '':
+            self.vision_config = CONFIG_MAPPING['clip_vision_model'](
+                intermediate_size=4096,
+                hidden_size=1024,
+                patch_size=14,
+                image_size=336,
+                num_hidden_layers=24,
+                num_attention_heads=16,
+                vocab_size=32000,
+                projection_dim=768,
+            )
+
+        else:
+            self.vision_config = AutoConfig.from_pretrained(self.vision_model_name_or_path.split(':')[-1])
+            self.vision_config = getattr(self.vision_config, 'vision_config', self.vision_config)
+            if vision_config is not None:
+                self.vision_config = self.vision_config.from_dict(vision_config)
+
+        self.vision_config.model_name_or_path = self.vision_model_name_or_path.split(':')[-1]
+        self.vision_config.model_name_or_path2 = self.vision_model_name_or_path2.split(':')[-1]
+        self.vision_hidden_size = getattr(self.vision_config, 'hidden_size', None)
+
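Editor's note (a minimal usage sketch, not part of the uploaded file): TinyLlavaConfig derives hidden_size, vocab_size and vision_hidden_size from the text and vision sub-configs it loads, so a minimal instantiation only needs the backbone names. The checkpoint ids below are illustrative placeholders, not values taken from this commit.

    # Hypothetical sketch: constructing the config directly from the two backbone names.
    from configuration import TinyLlavaConfig   # adjust the import path to where the file lives

    cfg = TinyLlavaConfig(
        llm_model_name_or_path="microsoft/phi-2",                      # assumed LLM backbone
        vision_model_name_or_path="google/siglip-so400m-patch14-384",  # assumed vision tower
        connector_type="mlp2x_gelu",                                   # shape expected by the Connector in modeling_tinyllava_phi.py
    )
    # Filled in by _load_text_config / _load_vision_config from the downloaded sub-configs.
    print(cfg.hidden_size, cfg.vision_hidden_size)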
modeling_tinyllava_phi.py ADDED
@@ -0,0 +1,624 @@
+# For licensing see accompanying LICENSE file.
+# Copyright (C) 2024 TinyLLaVA. All Rights Reserved.
+import time
+
+import dataclasses
+from enum import auto, Enum
+from typing import List, Tuple, Optional, Union
+import requests
+from PIL import Image
+from io import BytesIO
+import base64
+import re
+
+import torch
+import torch.utils.checkpoint
+from torch import nn
+from torch.nn import functional as F
+
+from transformers.utils import logging
+from transformers import PreTrainedModel
+from transformers.modeling_outputs import CausalLMOutputWithPast
+from transformers.generation.utils import GenerateOutput
+from transformers import CLIPVisionModel, CLIPImageProcessor, SiglipVisionModel, SiglipImageProcessor
+
+from .configuration import TinyLlavaConfig, IGNORE_INDEX, IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN
+
+from transformers import AutoConfig, AutoModelForCausalLM, PhiForCausalLM
+
+
+
+logger = logging.get_logger(__name__)
+
+# Model Constants
+IGNORE_INDEX = -100
+IMAGE_TOKEN_INDEX = -200
+DEFAULT_IMAGE_TOKEN = "<image>"
+DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>"
+DEFAULT_IM_START_TOKEN = "<im_start>"
+DEFAULT_IM_END_TOKEN = "<im_end>"
+IMAGE_PLACEHOLDER = "<image-placeholder>"
+
+CONTROLLER_HEART_BEAT_EXPIRATION = 30
+WORKER_HEART_BEAT_INTERVAL = 15
+LOGDIR = "."
+
+
+class SeparatorStyle(Enum):
+    """Different separator style."""
+    SINGLE = auto()
+    TWO = auto()
+    MPT = auto()
+    PLAIN = auto()
+    LLAMA_2 = auto()
+    TINY_LLAMA = auto()
+    QWEN_2 = auto()
+
+
+@dataclasses.dataclass
+class Conversation:
+    """A class that keeps all conversation history."""
+    system: str
+    roles: List[str]
+    messages: List[List[str]]
+    offset: int
+    sep_style: SeparatorStyle = SeparatorStyle.SINGLE
+    sep: str = "###"
+    sep2: str = None
+    version: str = "Unknown"
+
+    skip_next: bool = False
+
+    def get_prompt(self):
+        messages = self.messages
+        if len(messages) > 0 and type(messages[0][1]) is tuple:
+            messages = self.messages.copy()
+            init_role, init_msg = messages[0].copy()
+            init_msg = init_msg[0].replace("<image>", "").strip()
+            if 'mmtag' in self.version:
+                messages[0] = (init_role, init_msg)
+                messages.insert(0, (self.roles[0], "<Image><image></Image>"))
+                messages.insert(1, (self.roles[1], "Received."))
+            else:
+                messages[0] = (init_role, "<image>\n" + init_msg)
+
+        if self.sep_style == SeparatorStyle.TWO:
+            seps = [self.sep, self.sep2]
+            ret = self.system + seps[0]
+            for i, (role, message) in enumerate(messages):
+                if message:
+                    if type(message) is tuple:
+                        message, _, _ = message
+                    ret += role + ": " + message + seps[i % 2]
+                else:
+                    ret += role + ":"
+        else:
+            raise ValueError(f"Invalid style: {self.sep_style}")
+
+        return ret
+
+    def append_message(self, role, message):
+        self.messages.append([role, message])
+
+    def copy(self):
+        return Conversation(
+            system=self.system,
+            roles=self.roles,
+            messages=[[x, y] for x, y in self.messages],
+            offset=self.offset,
+            sep_style=self.sep_style,
+            sep=self.sep,
+            sep2=self.sep2,
+            version=self.version)
+
+
+
+
+conv_phi_v0 = Conversation(
+    system="A chat between a curious user and an artificial intelligence assistant. "
+           "The assistant gives helpful, detailed, and polite answers to the user's questions.",
+    roles=("USER", "ASSISTANT"),
+    version="phi",
+    messages=(),
+    offset=0,
+    sep_style=SeparatorStyle.TWO,
+    sep=" ",
+    sep2="<|endoftext|>",
+)
+
+
+def load_image_from_base64(image):
+    return Image.open(BytesIO(base64.b64decode(image)))
+
+
+def expand2square(pil_img, background_color):
+    width, height = pil_img.size
+    if width == height:
+        return pil_img
+    elif width > height:
+        result = Image.new(pil_img.mode, (width, width), background_color)
+        result.paste(pil_img, (0, (width - height) // 2))
+        return result
+    else:
+        result = Image.new(pil_img.mode, (height, height), background_color)
+        result.paste(pil_img, ((height - width) // 2, 0))
+        return result
+
+
+def process_images(images, image_processor, model_cfg):
+    image_aspect_ratio = getattr(model_cfg, "image_aspect_ratio", None)
+    new_images = []
+    if image_aspect_ratio == 'pad':
+        for image in images:
+            image = expand2square(image, tuple(int(x*255) for x in image_processor.image_mean))
+            image = image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0]
+            new_images.append(image)
+    else:
+        return image_processor(images, return_tensors='pt')['pixel_values']
+    if all(x.shape == new_images[0].shape for x in new_images):
+        new_images = torch.stack(new_images, dim=0)
+    return new_images
+
+
+def tokenizer_image_token(prompt, tokenizer, image_token_index=IMAGE_TOKEN_INDEX, return_tensors=None):
+    prompt_chunks = [tokenizer(chunk).input_ids for chunk in prompt.split('<image>')]
+
+    def insert_separator(X, sep):
+        return [ele for sublist in zip(X, [sep]*len(X)) for ele in sublist][:-1]
+
+    input_ids = []
+    offset = 0
+    if len(prompt_chunks) > 0 and len(prompt_chunks[0]) > 0 and prompt_chunks[0][0] == tokenizer.bos_token_id:
+        offset = 1
+        input_ids.append(prompt_chunks[0][0])
+
+    for x in insert_separator(prompt_chunks, [image_token_index] * (offset + 1)):
+        input_ids.extend(x[offset:])
+
+    if return_tensors is not None:
+        if return_tensors == 'pt':
+            return torch.tensor(input_ids, dtype=torch.long)
+        raise ValueError(f'Unsupported tensor type: {return_tensors}')
+    return input_ids
+
+def load_image(image_file):
+    if image_file.startswith("http") or image_file.startswith("https"):
+        response = requests.get(image_file)
+        image = Image.open(BytesIO(response.content)).convert("RGB")
+    else:
+        image = Image.open(image_file).convert("RGB")
+    return image
+
+ACT_TYPE = {
+    'relu': nn.ReLU,
+    'gelu': nn.GELU
+}
+
+class Connector(nn.Module):
+    def __init__(self, config=None):
+        super().__init__()
+        mlp_gelu_match = re.match(r'^mlp(\d+)x_gelu$', config.connector_type)
+        act_type = config.connector_type.split('_')[-1]
+        mlp_depth = int(mlp_gelu_match.group(1))
+        modules = [nn.Linear(config.vision_hidden_size, config.hidden_size)]
+        for _ in range(1, mlp_depth):
+            modules.append(ACT_TYPE[act_type]())
+            modules.append(nn.Linear(config.hidden_size, config.hidden_size))
+
+        self._connector = nn.Sequential(*modules)
+
+    def forward(self, x):
+        return self._connector(x)
+
+class VisionTower(nn.Module):
+    def __init__(self, cfg, model_name_or_path = 'clip'):
+        super().__init__()
+        if 'clip' in model_name_or_path:
+            self._vision_tower = CLIPVisionModel(cfg)
+            self._image_processor = CLIPImageProcessor.from_pretrained(cfg.model_name_or_path)
+        else:
+            self._vision_tower = SiglipVisionModel(cfg)
+            self._image_processor = SiglipImageProcessor.from_pretrained(cfg.model_name_or_path)
+
+        self.config = cfg
+
+    def forward(self, x, **kwargs):
+        image_features = self._vision_tower(x, output_hidden_states=True)
+        image_features = image_features.hidden_states[kwargs.get('vision_feature_layer', -2)]
+
+        if kwargs.get('vision_feature_select_strategy', 'patch') == 'patch':
+            image_features = image_features[:, 1:]
+        elif kwargs.get('vision_feature_select_strategy', 'patch') == 'cls_patch':
+            image_features = image_features
+        else:
+            raise ValueError(f"Unexpected select feature: {kwargs.get('vision_feature_select_strategy')}")
+
+        return image_features
+
+    @property
+    def vision_tower(self):
+        return self._vision_tower
+
+    @vision_tower.setter
+    def vision_tower(self, vision_tower):
+        self._vision_tower = vision_tower
+
+def get_value_from_kwargs(kwargs, name):
+    if name in kwargs:
+        return kwargs.pop(name)
+    else:
+        return None
+
+
+class TinyLlavaPreTrainedModel(PreTrainedModel):
+    config_class = TinyLlavaConfig
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["LlavaVisionAttention"]
+    _skip_keys_device_placement = "past_key_values"
+    _supports_flash_attn_2 = True
+
+    def _init_weights(self, module):
+        std = (
+            self.config.initializer_range
+            if hasattr(self.config, "initializer_range")
+            else self.config.text_config.initializer_range
+        )
+
+        if hasattr(module, "class_embedding"):
+            module.class_embedding.data.normal_(mean=0.0, std=std)
+
+        if isinstance(module, (nn.Linear, nn.Conv2d)):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+
+    @property
+    def _supports_sdpa(self):
+        return self.language_model._supports_sdpa
+
+
+class TinyLlavaForConditionalGeneration(TinyLlavaPreTrainedModel):
+    def __init__(self, config: TinyLlavaConfig):
+
+        super().__init__(config)
+
+        self.language_model = PhiForCausalLM(config.text_config)
+        self.vision_tower = VisionTower(config.vision_config, config.vision_model_name_or_path)
+        self.connector = Connector(config)
+        self.post_init()
+
+
+    def get_input_embeddings(self):
+        return self.language_model.get_input_embeddings()
+
+    def set_input_embeddings(self, value):
+        self.language_model.set_input_embeddings(value)
+
+    def get_output_embeddings(self):
+        return self.language_model.get_output_embeddings()
+
+    def set_output_embeddings(self, new_embeddings):
+        self.language_model.set_output_embeddings(new_embeddings)
+
+    def set_decoder(self, decoder):
+        self.language_model.set_decoder(decoder)
+
+    def get_decoder(self):
+        return self.language_model.get_decoder()
+
+    def tie_weights(self):
+        return self.language_model.tie_weights()
+
+    def resize_token_embeddings(self, new_num_tokens: Optional[int] = None, pad_to_multiple_of=None) -> nn.Embedding:
+        model_embeds = self.language_model.resize_token_embeddings(new_num_tokens, pad_to_multiple_of)
+        # update vocab size
+        self.config.text_config.vocab_size = model_embeds.num_embeddings
+        self.config.vocab_size = model_embeds.num_embeddings
+        self.vocab_size = model_embeds.num_embeddings
+        return model_embeds
+
+
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        images: Optional[torch.FloatTensor] = None,
+        image_sizes: Optional[List[List[int]]] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, CausalLMOutputWithPast]:
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        if inputs_embeds is None:
+            (
+                input_ids,
+                position_ids,
+                attention_mask,
+                past_key_values,
+                inputs_embeds,
+                labels
+            ) = self.prepare_inputs_labels_for_multimodal(
+                input_ids,
+                position_ids,
+                attention_mask,
+                past_key_values,
+                labels,
+                images,
+                image_sizes
+            )
+        return self.language_model.forward(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            labels=labels,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict
+        )
+
+    @torch.no_grad()
+    def generate(
+        self,
+        inputs: Optional[torch.Tensor] = None,
+        images: Optional[torch.Tensor] = None,
+        image_sizes: Optional[torch.Tensor] = None,
+        **kwargs,
+    ) -> Union[GenerateOutput, torch.LongTensor]:
+        position_ids = kwargs.pop("position_ids", None)
+        attention_mask = kwargs.pop("attention_mask", None)
+        if "inputs_embeds" in kwargs:
+            raise NotImplementedError("`inputs_embeds` is not supported")
+
+        if images is not None:
+            (
+                inputs,
+                position_ids,
+                attention_mask,
+                _,
+                inputs_embeds,
+                _
+            ) = self.prepare_inputs_labels_for_multimodal(
+                inputs,
+                position_ids,
+                attention_mask,
+                None,
+                None,
+                images,
+                image_sizes=image_sizes
+            )
+        else:
+            inputs_embeds = self.language_model.get_input_embeddings()(inputs)
+
+        return self.language_model.generate(
+            position_ids=position_ids,
+            attention_mask=attention_mask,
+            inputs_embeds=inputs_embeds,
+            **kwargs
+        )
+
+    def encode_images(self, images):
+        kwargs = {}
+        kwargs['vision_feature_layer'] = self.config.vision_feature_layer
+        kwargs['vision_feature_select_strategy'] = self.config.vision_feature_select_strategy
+        images = images.to(device=self.device, dtype=self.dtype)
+        image_features = self.vision_tower(images, **kwargs)
+        image_features = self.connector(image_features)
+        return image_features
+
+
+
+    def prepare_inputs_for_generation(self, input_ids, past_key_values=None,
+                                      inputs_embeds=None, **kwargs):
+        images = kwargs.pop("images", None)
+        image_sizes = kwargs.pop("image_sizes", None)
+        inputs = self.language_model.prepare_inputs_for_generation(
+            input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, **kwargs
+        )
+        if images is not None:
+            inputs['images'] = images
+        if image_sizes is not None:
+            inputs['image_sizes'] = image_sizes
+        return inputs
+
+    def prepare_inputs_labels_for_multimodal(
+        self, input_ids, position_ids, attention_mask, past_key_values, labels,
+        images, image_sizes=None
+    ):
+        vision_tower = self.vision_tower
+        if vision_tower is None or images is None or input_ids.shape[1] == 1:
+            return input_ids, position_ids, attention_mask, past_key_values, None, labels
+
+
+        image_features = self.encode_images(images)
+
+        # TODO: image start / end is not implemented here to support pretraining.
+        if getattr(self.config, 'tune_mm_mlp_adapter', False):
+            raise NotImplementedError
+
+        # Let's just add dummy tensors if they do not exist,
+        # it is a headache to deal with None all the time.
+        # But it is not ideal, and if you have a better idea,
+        # please open an issue / submit a PR, thanks.
+        _labels = labels
+        _position_ids = position_ids
+        _attention_mask = attention_mask
+        if attention_mask is None:
+            attention_mask = torch.ones_like(input_ids, dtype=torch.bool)
+        else:
+            attention_mask = attention_mask.bool()
+        if position_ids is None:
+            position_ids = torch.arange(0, input_ids.shape[1], dtype=torch.long, device=input_ids.device)
+        if labels is None:
+            labels = torch.full_like(input_ids, IGNORE_INDEX)
+
+        # remove the padding using attention_mask -- FIXME
+        _input_ids = input_ids
+        input_ids = [cur_input_ids[cur_attention_mask] for cur_input_ids, cur_attention_mask in zip(input_ids, attention_mask)]
+        labels = [cur_labels[cur_attention_mask] for cur_labels, cur_attention_mask in zip(labels, attention_mask)]
+
+        new_input_embeds = []
+        new_labels = []
+        cur_image_idx = 0
+        for batch_idx, cur_input_ids in enumerate(input_ids):
+            num_images = (cur_input_ids == IMAGE_TOKEN_INDEX).sum()
+            if num_images == 0:
+                cur_image_features = image_features[cur_image_idx]
+                cur_input_embeds_1 = self.language_model.get_input_embeddings()(cur_input_ids)
+                cur_input_embeds = torch.cat([cur_input_embeds_1, cur_image_features[0:0]], dim=0)
+                new_input_embeds.append(cur_input_embeds)
+                new_labels.append(labels[batch_idx])
+                cur_image_idx += 1
+                continue
+
+            image_token_indices = [-1] + torch.where(cur_input_ids == IMAGE_TOKEN_INDEX)[0].tolist() + [cur_input_ids.shape[0]]
+            cur_input_ids_noim = []
+            cur_labels = labels[batch_idx]
+            cur_labels_noim = []
+            for i in range(len(image_token_indices) - 1):
+                cur_input_ids_noim.append(cur_input_ids[image_token_indices[i]+1:image_token_indices[i+1]])
+                cur_labels_noim.append(cur_labels[image_token_indices[i]+1:image_token_indices[i+1]])
+            split_sizes = [x.shape[0] for x in cur_labels_noim]
+            cur_input_embeds = self.language_model.get_input_embeddings()(torch.cat(cur_input_ids_noim))
+            cur_input_embeds_no_im = torch.split(cur_input_embeds, split_sizes, dim=0)
+            cur_new_input_embeds = []
+            cur_new_labels = []
+
+            for i in range(num_images + 1):
+                cur_new_input_embeds.append(cur_input_embeds_no_im[i])
+                cur_new_labels.append(cur_labels_noim[i])
+                if i < num_images:
+                    cur_image_features = image_features[cur_image_idx]
+                    cur_image_idx += 1
+                    cur_new_input_embeds.append(cur_image_features)
+                    cur_new_labels.append(torch.full((cur_image_features.shape[0],), IGNORE_INDEX, device=cur_labels.device, dtype=cur_labels.dtype))
+
+            cur_new_input_embeds = [x.to(self.device) for x in cur_new_input_embeds]
+
+            cur_new_input_embeds = torch.cat(cur_new_input_embeds)
+            cur_new_labels = torch.cat(cur_new_labels)
+
+            new_input_embeds.append(cur_new_input_embeds)
+            new_labels.append(cur_new_labels)
+
+        # Truncate sequences to max length as image embeddings can make the sequence longer
+        tokenizer_model_max_length = getattr(self.config, 'tokenizer_model_max_length', None)
+        if tokenizer_model_max_length is not None:
+            new_input_embeds = [x[:tokenizer_model_max_length] for x in new_input_embeds]
+            new_labels = [x[:tokenizer_model_max_length] for x in new_labels]
+
+        # Combine them
+        max_len = max(x.shape[0] for x in new_input_embeds)
+        batch_size = len(new_input_embeds)
+
+        new_input_embeds_padded = []
+        new_labels_padded = torch.full((batch_size, max_len), IGNORE_INDEX, dtype=new_labels[0].dtype, device=new_labels[0].device)
+        attention_mask = torch.zeros((batch_size, max_len), dtype=attention_mask.dtype, device=attention_mask.device)
+        position_ids = torch.zeros((batch_size, max_len), dtype=position_ids.dtype, device=position_ids.device)
+
+        for i, (cur_new_embed, cur_new_labels) in enumerate(zip(new_input_embeds, new_labels)):
+            cur_len = cur_new_embed.shape[0]
+            if getattr(self.config, 'tokenizer_padding_side', 'right') == "left":
+                new_input_embeds_padded.append(torch.cat((
+                    torch.zeros((max_len - cur_len, cur_new_embed.shape[1]), dtype=cur_new_embed.dtype, device=cur_new_embed.device),
+                    cur_new_embed
+                ), dim=0))
+                if cur_len > 0:
+                    new_labels_padded[i, -cur_len:] = cur_new_labels
+                    attention_mask[i, -cur_len:] = True
+                    position_ids[i, -cur_len:] = torch.arange(0, cur_len, dtype=position_ids.dtype, device=position_ids.device)
+            else:
+                new_input_embeds_padded.append(torch.cat((
+                    cur_new_embed,
+                    torch.zeros((max_len - cur_len, cur_new_embed.shape[1]), dtype=cur_new_embed.dtype, device=cur_new_embed.device)
+                ), dim=0))
+                if cur_len > 0:
+                    new_labels_padded[i, :cur_len] = cur_new_labels
+                    attention_mask[i, :cur_len] = True
+                    position_ids[i, :cur_len] = torch.arange(0, cur_len, dtype=position_ids.dtype, device=position_ids.device)
+
+        new_input_embeds = torch.stack(new_input_embeds_padded, dim=0)
+
+        if _labels is None:
+            new_labels = None
+        else:
+            new_labels = new_labels_padded
+
+        if _attention_mask is None:
+            attention_mask = None
+        else:
+            attention_mask = attention_mask.to(dtype=_attention_mask.dtype)
+
+        if _position_ids is None:
+            position_ids = None
+
+        return None, position_ids, attention_mask, past_key_values, new_input_embeds, new_labels
+
+    def chat(
+        self,
+        prompt: str,
+        tokenizer = None,
+        image: str = None,
+        max_new_tokens: int = 512,
+        num_beams = 1,
+        top_p=None,
+        temperature=0
+    ):
+        image_processor = self.vision_tower._image_processor
+
+        if image is not None:
+            prompt = DEFAULT_IMAGE_TOKEN + '\n' + prompt
+        conv = conv_phi_v0.copy()
+        conv.append_message(conv.roles[0], prompt)
+        conv.append_message(conv.roles[1], None)
+        prompt = conv.get_prompt()
+        if image is not None:
+            image = load_image(image)
+            image_tensor = process_images(image, image_processor, self.config).to(self.device)
+
+        input_ids = (
+            tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt")
+            .unsqueeze(0).to(self.device)
+        )
+        # Generate
+        stime = time.time()
+
+        with torch.inference_mode():
+            output_ids = self.generate(
+                input_ids,
+                images=image_tensor,
+                do_sample=True if temperature > 0 else False,
+                temperature=temperature,
+                top_p=top_p,
+                num_beams=num_beams,
+                pad_token_id=tokenizer.pad_token_id,
+                max_new_tokens=max_new_tokens,
+                use_cache=True,
+                # stopping_criteria=[stopping_criteria],
+            )
+
+        # print('inference over')
+        generation_time = time.time() - stime
+        outputs = tokenizer.batch_decode(
+            output_ids, skip_special_tokens=True
+        )[0]
+
+        outputs = outputs.strip()
+
+        return outputs, generation_time
+
+
+AutoConfig.register("tinyllava", TinyLlavaConfig)
+AutoModelForCausalLM.register(TinyLlavaConfig, TinyLlavaForConditionalGeneration)
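Editor's note (a minimal usage sketch, not part of the uploaded files): the two register calls above make the model loadable through the Auto classes when a checkpoint repo ships these files as remote code, and inference can then go through the chat() helper defined in this file. The repo id, image URL, device, and dtype below are placeholders, not values taken from this commit.

    # Hypothetical sketch: loading a TinyLLaVA-Phi checkpoint that bundles these files.
    import torch
    from transformers import AutoTokenizer, AutoModelForCausalLM

    model_id = "YOUR_NAMESPACE/YOUR_TINYLLAVA_CHECKPOINT"  # placeholder repo id
    model = AutoModelForCausalLM.from_pretrained(
        model_id, trust_remote_code=True, torch_dtype=torch.float16
    ).to("cuda")
    tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=False)

    # chat() builds the phi-style prompt, preprocesses the image, and returns (text, seconds).
    answer, seconds = model.chat(
        prompt="Describe this image.",
        tokenizer=tokenizer,
        image="https://example.com/sample.jpg",  # URL or local path (placeholder)
        max_new_tokens=128,
    )
    print(answer)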