Files changed (3)
  1. conversation.py +389 -0
  2. modeling_intern_vit.py +463 -0
  3. modeling_internvl_chat.py +382 -0
conversation.py ADDED
@@ -0,0 +1,389 @@
"""
Conversation prompt templates.

We kindly request that you import fastchat instead of copying this file if you wish to use it.
If you have changes in mind, please contribute back so the community can benefit collectively
and continue to maintain these valuable templates.

Modified from https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py
"""

import dataclasses
from enum import IntEnum, auto
from typing import Dict, List, Tuple, Union


class SeparatorStyle(IntEnum):
    """Separator styles."""

    ADD_COLON_SINGLE = auto()
    ADD_COLON_TWO = auto()
    ADD_COLON_SPACE_SINGLE = auto()
    NO_COLON_SINGLE = auto()
    NO_COLON_TWO = auto()
    ADD_NEW_LINE_SINGLE = auto()
    LLAMA2 = auto()
    CHATGLM = auto()
    CHATML = auto()
    CHATINTERN = auto()
    DOLLY = auto()
    RWKV = auto()
    PHOENIX = auto()
    ROBIN = auto()
    FALCON_CHAT = auto()
    CHATGLM3 = auto()
    INTERNVL_ZH = auto()
    MPT = auto()


@dataclasses.dataclass
class Conversation:
    """A class that manages prompt templates and keeps all conversation history."""

    # The name of this template
    name: str
    # The template of the system prompt
    system_template: str = '{system_message}'
    # The system message
    system_message: str = ''
    # The names of two roles
    roles: Tuple[str] = ('USER', 'ASSISTANT')
    # All messages. Each item is (role, message).
    messages: List[List[str]] = ()
    # The number of few shot examples
    offset: int = 0
    # The separator style and configurations
    sep_style: SeparatorStyle = SeparatorStyle.ADD_COLON_SINGLE
    sep: str = '\n'
    sep2: str = None
    # Stop criteria (the default one is EOS token)
    stop_str: Union[str, List[str]] = None
    # Stops generation if meeting any token in this list
    stop_token_ids: List[int] = None

    def get_prompt(self) -> str:
        """Get the prompt for generation."""
        system_prompt = self.system_template.format(system_message=self.system_message)
        if self.sep_style == SeparatorStyle.ADD_COLON_SINGLE:
            ret = system_prompt + self.sep
            for role, message in self.messages:
                if message:
                    ret += role + ': ' + message + self.sep
                else:
                    ret += role + ':'
            return ret
        elif self.sep_style == SeparatorStyle.ADD_COLON_TWO:
            seps = [self.sep, self.sep2]
            ret = system_prompt + seps[0]
            for i, (role, message) in enumerate(self.messages):
                if message:
                    ret += role + ': ' + message + seps[i % 2]
                else:
                    ret += role + ':'
            return ret
        elif self.sep_style == SeparatorStyle.ADD_COLON_SPACE_SINGLE:
            ret = system_prompt + self.sep
            for role, message in self.messages:
                if message:
                    ret += role + ': ' + message + self.sep
                else:
                    ret += role + ': '  # must end with a space
            return ret
        elif self.sep_style == SeparatorStyle.ADD_NEW_LINE_SINGLE:
            ret = '' if system_prompt == '' else system_prompt + self.sep
            for role, message in self.messages:
                if message:
                    ret += role + '\n' + message + self.sep
                else:
                    ret += role + '\n'
            return ret
        elif self.sep_style == SeparatorStyle.NO_COLON_SINGLE:
            ret = system_prompt
            for role, message in self.messages:
                if message:
                    ret += role + message + self.sep
                else:
                    ret += role
            return ret
        elif self.sep_style == SeparatorStyle.NO_COLON_TWO:
            seps = [self.sep, self.sep2]
            ret = system_prompt
            for i, (role, message) in enumerate(self.messages):
                if message:
                    ret += role + message + seps[i % 2]
                else:
                    ret += role
            return ret
        elif self.sep_style == SeparatorStyle.RWKV:
            ret = system_prompt
            for i, (role, message) in enumerate(self.messages):
                if message:
                    ret += (
                        role
                        + ': '
                        + message.replace('\r\n', '\n').replace('\n\n', '\n')
                    )
                    ret += '\n\n'
                else:
                    ret += role + ':'
            return ret
        elif self.sep_style == SeparatorStyle.LLAMA2:
            seps = [self.sep, self.sep2]
            if self.system_message:
                ret = system_prompt
            else:
                ret = '[INST] '
            for i, (role, message) in enumerate(self.messages):
                tag = self.roles[i % 2]
                if message:
                    if i == 0:
                        ret += message + ' '
                    else:
                        ret += tag + ' ' + message + seps[i % 2]
                else:
                    ret += tag
            return ret
        elif self.sep_style == SeparatorStyle.CHATGLM:
            # source: https://huggingface.co/THUDM/chatglm-6b/blob/1d240ba371910e9282298d4592532d7f0f3e9f3e/modeling_chatglm.py#L1302-L1308
            # source2: https://huggingface.co/THUDM/chatglm2-6b/blob/e186c891cf64310ac66ef10a87e6635fa6c2a579/modeling_chatglm.py#L926
            round_add_n = 1 if self.name == 'chatglm2' else 0
            if system_prompt:
                ret = system_prompt + self.sep
            else:
                ret = ''

            for i, (role, message) in enumerate(self.messages):
                if i % 2 == 0:
                    ret += f'[Round {i // 2 + round_add_n}]{self.sep}'

                if message:
                    ret += f'{role}:{message}{self.sep}'
                else:
                    ret += f'{role}:'
            return ret
        elif self.sep_style == SeparatorStyle.CHATML:
            ret = '' if system_prompt == '' else system_prompt + self.sep + '\n'
            for role, message in self.messages:
                if message:
                    ret += role + '\n' + message + self.sep + '\n'
                else:
                    ret += role + '\n'
            return ret
        elif self.sep_style == SeparatorStyle.CHATGLM3:
            ret = ''
            if self.system_message:
                ret += system_prompt
            for role, message in self.messages:
                if message:
                    ret += role + '\n' + ' ' + message
                else:
                    ret += role
            return ret
        elif self.sep_style == SeparatorStyle.CHATINTERN:
            # source: https://huggingface.co/internlm/internlm-chat-7b-8k/blob/bd546fa984b4b0b86958f56bf37f94aa75ab8831/modeling_internlm.py#L771
            seps = [self.sep, self.sep2]
            ret = system_prompt
            for i, (role, message) in enumerate(self.messages):
                if message:
                    ret += role + ':' + message + seps[i % 2] + '\n'
                else:
                    ret += role + ':'
            return ret
        elif self.sep_style == SeparatorStyle.DOLLY:
            seps = [self.sep, self.sep2]
            ret = system_prompt
            for i, (role, message) in enumerate(self.messages):
                if message:
                    ret += role + ':\n' + message + seps[i % 2]
                    if i % 2 == 1:
                        ret += '\n\n'
                else:
                    ret += role + ':\n'
            return ret
        elif self.sep_style == SeparatorStyle.PHOENIX:
            ret = system_prompt
            for role, message in self.messages:
                if message:
                    ret += role + ': ' + '<s>' + message + '</s>'
                else:
                    ret += role + ': ' + '<s>'
            return ret
        elif self.sep_style == SeparatorStyle.ROBIN:
            ret = system_prompt + self.sep
            for role, message in self.messages:
                if message:
                    ret += role + ':\n' + message + self.sep
                else:
                    ret += role + ':\n'
            return ret
        elif self.sep_style == SeparatorStyle.FALCON_CHAT:
            ret = ''
            if self.system_message:
                ret += system_prompt + self.sep
            for role, message in self.messages:
                if message:
                    ret += role + ': ' + message + self.sep
                else:
                    ret += role + ':'
            return ret
        elif self.sep_style == SeparatorStyle.INTERNVL_ZH:
            seps = [self.sep, self.sep2]
            ret = self.system_message + seps[0]
            for i, (role, message) in enumerate(self.messages):
                if message:
                    ret += role + ': ' + message + seps[i % 2]
                else:
                    ret += role + ':'
            return ret
        elif self.sep_style == SeparatorStyle.MPT:
            ret = system_prompt + self.sep
            for role, message in self.messages:
                if message:
                    if type(message) is tuple:
                        message, _, _ = message
                    ret += role + message + self.sep
                else:
                    ret += role
            return ret
        else:
            raise ValueError(f'Invalid style: {self.sep_style}')

    def set_system_message(self, system_message: str):
        """Set the system message."""
        self.system_message = system_message

    def append_message(self, role: str, message: str):
        """Append a new message."""
        self.messages.append([role, message])

    def update_last_message(self, message: str):
        """Update the last output.

        The last message is typically set to be None when constructing the prompt,
        so we need to update it in-place after getting the response from a model.
        """
        self.messages[-1][1] = message

    def to_gradio_chatbot(self):
        """Convert the conversation to gradio chatbot format."""
        ret = []
        for i, (role, msg) in enumerate(self.messages[self.offset:]):
            if i % 2 == 0:
                ret.append([msg, None])
            else:
                ret[-1][-1] = msg
        return ret

    def to_openai_api_messages(self):
        """Convert the conversation to OpenAI chat completion format."""
        ret = [{'role': 'system', 'content': self.system_message}]

        for i, (_, msg) in enumerate(self.messages[self.offset:]):
            if i % 2 == 0:
                ret.append({'role': 'user', 'content': msg})
            else:
                if msg is not None:
                    ret.append({'role': 'assistant', 'content': msg})
        return ret

    def copy(self):
        return Conversation(
            name=self.name,
            system_template=self.system_template,
            system_message=self.system_message,
            roles=self.roles,
            messages=[[x, y] for x, y in self.messages],
            offset=self.offset,
            sep_style=self.sep_style,
            sep=self.sep,
            sep2=self.sep2,
            stop_str=self.stop_str,
            stop_token_ids=self.stop_token_ids,
        )

    def dict(self):
        return {
            'template_name': self.name,
            'system_message': self.system_message,
            'roles': self.roles,
            'messages': self.messages,
            'offset': self.offset,
        }


# A global registry for all conversation templates
conv_templates: Dict[str, Conversation] = {}


def register_conv_template(template: Conversation, override: bool = False):
    """Register a new conversation template."""
    if not override:
        assert (
            template.name not in conv_templates
        ), f'{template.name} has been registered.'

    conv_templates[template.name] = template


def get_conv_template(name: str) -> Conversation:
    """Get a conversation template."""
    return conv_templates[name].copy()


# Both Hermes-2 and internlm2-chat are chatml-format conversation templates. The difference
# is that during training, the preprocessing function for the Hermes-2 template doesn't add
# <s> at the beginning of the tokenized sequence, while the internlm2-chat template does.
# Therefore, they are completely equivalent during inference.
register_conv_template(
    Conversation(
        name='Hermes-2',
        system_template='<|im_start|>system\n{system_message}',
        # note: The new system prompt was not used here to avoid changes in benchmark performance.
        # system_message='我是书生·万象,英文名是InternVL,是由上海人工智能实验室、清华大学及多家合作单位联合开发的多模态大语言模型。',
        system_message='你是由上海人工智能实验室联合商汤科技开发的书生多模态大模型,英文名叫InternVL, 是一个有用无害的人工智能助手。',
        roles=('<|im_start|>user\n', '<|im_start|>assistant\n'),
        sep_style=SeparatorStyle.MPT,
        sep='<|im_end|>',
        stop_str='<|endoftext|>',
    )
)


register_conv_template(
    Conversation(
        name='internlm2-chat',
        system_template='<|im_start|>system\n{system_message}',
        # note: The new system prompt was not used here to avoid changes in benchmark performance.
        # system_message='我是书生·万象,英文名是InternVL,是由上海人工智能实验室、清华大学及多家合作单位联合开发的多模态大语言模型。',
        system_message='你是由上海人工智能实验室联合商汤科技开发的书生多模态大模型,英文名叫InternVL, 是一个有用无害的人工智能助手。',
        roles=('<|im_start|>user\n', '<|im_start|>assistant\n'),
        sep_style=SeparatorStyle.MPT,
        sep='<|im_end|>',
    )
)


register_conv_template(
    Conversation(
        name='phi3-chat',
        system_template='<|system|>\n{system_message}',
        # note: The new system prompt was not used here to avoid changes in benchmark performance.
        # system_message='我是书生·万象,英文名是InternVL,是由上海人工智能实验室、清华大学及多家合作单位联合开发的多模态大语言模型。',
        system_message='你是由上海人工智能实验室联合商汤科技开发的书生多模态大模型,英文名叫InternVL, 是一个有用无害的人工智能助手。',
        roles=('<|user|>\n', '<|assistant|>\n'),
        sep_style=SeparatorStyle.MPT,
        sep='<|end|>',
    )
)


register_conv_template(
    Conversation(
        name='internvl2_5',
        system_template='<|im_start|>system\n{system_message}',
        system_message='你是书生·万象,英文名是InternVL,是由上海人工智能实验室、清华大学及多家合作单位联合开发的多模态大语言模型。',
        roles=('<|im_start|>user\n', '<|im_start|>assistant\n'),
        sep_style=SeparatorStyle.MPT,
        sep='<|im_end|>\n',
    )
)
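
A minimal usage sketch of the registry above (illustrative only; it assumes this file is importable as a module named `conversation`, which is how `modeling_internvl_chat.py` below imports it):

# Build a single-turn prompt with the internvl2_5 template registered above.
from conversation import get_conv_template

template = get_conv_template('internvl2_5')        # returns a copy, so `messages` is a fresh list
template.append_message(template.roles[0], 'Describe the image.')
template.append_message(template.roles[1], None)   # leave the assistant turn open for generation
prompt = template.get_prompt()
# With the MPT separator style, `prompt` ends with '<|im_start|>assistant\n',
# i.e. it is ready to be completed by the model.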
modeling_intern_vit.py ADDED
@@ -0,0 +1,463 @@
# --------------------------------------------------------
# InternVL
# Copyright (c) 2024 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------

from typing import Optional, Tuple, Union

import torch
import torch.nn.functional as F
import torch.utils.checkpoint
from einops import rearrange
from timm.layers import DropPath
from torch import nn
from transformers.activations import ACT2FN
from transformers.modeling_outputs import (BaseModelOutput,
                                           BaseModelOutputWithPooling)
from transformers.modeling_utils import PreTrainedModel
from transformers.utils import logging

from .configuration_intern_vit import InternVisionConfig

try:
    from flash_attn.bert_padding import pad_input, unpad_input
    from flash_attn.flash_attn_interface import \
        flash_attn_varlen_qkvpacked_func
    has_flash_attn = True
except:
    print('FlashAttention2 is not installed.')
    has_flash_attn = False

logger = logging.get_logger(__name__)


class FlashAttention(nn.Module):
    """Implement the scaled dot product attention with softmax.

    Arguments
    ---------
        softmax_scale: The temperature to use for the softmax attention.
                       (default: 1/sqrt(d_keys) where d_keys is computed at runtime)
        attention_dropout: The dropout rate to apply to the attention
                           (default: 0.0)
    """

    def __init__(self, softmax_scale=None, attention_dropout=0.0, device=None, dtype=None):
        super().__init__()
        self.softmax_scale = softmax_scale
        self.dropout_p = attention_dropout

    def forward(self, qkv, key_padding_mask=None, causal=False, cu_seqlens=None,
                max_s=None, need_weights=False):
        """Implements the multihead softmax attention.

        Arguments
        ---------
            qkv: The tensor containing the query, key, and value.
                 (B, S, 3, H, D) if key_padding_mask is None
                 if unpadded: (nnz, 3, h, d)
            key_padding_mask: a bool tensor of shape (B, S)
        """
        assert not need_weights
        assert qkv.dtype in [torch.float16, torch.bfloat16]
        assert qkv.is_cuda

        if cu_seqlens is None:
            batch_size = qkv.shape[0]
            seqlen = qkv.shape[1]
            if key_padding_mask is None:
                qkv = rearrange(qkv, 'b s ... -> (b s) ...')
                max_s = seqlen
                cu_seqlens = torch.arange(
                    0, (batch_size + 1) * seqlen, step=seqlen,
                    dtype=torch.int32, device=qkv.device
                )
                output = flash_attn_varlen_qkvpacked_func(
                    qkv, cu_seqlens, max_s,
                    self.dropout_p if self.training else 0.0,
                    softmax_scale=self.softmax_scale, causal=causal
                )
                output = rearrange(output, '(b s) ... -> b s ...', b=batch_size)
            else:
                nheads = qkv.shape[-2]
                x = rearrange(qkv, 'b s three h d -> b s (three h d)')
                x_unpad, indices, cu_seqlens, max_s = unpad_input(x, key_padding_mask)
                x_unpad = rearrange(x_unpad, 'nnz (three h d) -> nnz three h d',
                                    three=3, h=nheads)
                output_unpad = flash_attn_varlen_qkvpacked_func(
                    x_unpad, cu_seqlens, max_s,
                    self.dropout_p if self.training else 0.0,
                    softmax_scale=self.softmax_scale, causal=causal
                )
                output = rearrange(
                    pad_input(
                        rearrange(output_unpad, 'nnz h d -> nnz (h d)'),
                        indices, batch_size, seqlen
                    ),
                    'b s (h d) -> b s h d', h=nheads
                )
        else:
            assert max_s is not None
            output = flash_attn_varlen_qkvpacked_func(
                qkv, cu_seqlens, max_s,
                self.dropout_p if self.training else 0.0,
                softmax_scale=self.softmax_scale, causal=causal
            )

        return output, None


class InternRMSNorm(nn.Module):
    def __init__(self, hidden_size, eps=1e-6):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        input_dtype = hidden_states.dtype
        hidden_states = hidden_states.to(torch.float32)
        variance = hidden_states.pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
        return self.weight * hidden_states.to(input_dtype)


try:
    from apex.normalization import FusedRMSNorm

    InternRMSNorm = FusedRMSNorm  # noqa

    logger.info('Discovered apex.normalization.FusedRMSNorm - will use it instead of InternRMSNorm')
except ImportError:
    # using the normal InternRMSNorm
    pass
except Exception:
    logger.warning('discovered apex but it failed to load, falling back to InternRMSNorm')
    pass


NORM2FN = {
    'rms_norm': InternRMSNorm,
    'layer_norm': nn.LayerNorm,
}


class InternVisionEmbeddings(nn.Module):
    def __init__(self, config: InternVisionConfig):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.image_size = config.image_size
        self.patch_size = config.patch_size

        self.class_embedding = nn.Parameter(
            torch.randn(1, 1, self.embed_dim),
        )

        self.patch_embedding = nn.Conv2d(
            in_channels=3,
            out_channels=self.embed_dim,
            kernel_size=self.patch_size,
            stride=self.patch_size
        )

        self.num_patches = (self.image_size // self.patch_size) ** 2
        self.num_positions = self.num_patches + 1

        self.position_embedding = nn.Parameter(
            torch.randn(1, self.num_positions, self.embed_dim)
        )

    def _get_pos_embed(self, pos_embed, H, W):
        target_dtype = pos_embed.dtype
        pos_embed = pos_embed.float().reshape(
            1, self.image_size // self.patch_size,
            self.image_size // self.patch_size, -1
        ).permute(0, 3, 1, 2)
        pos_embed = F.interpolate(
            pos_embed, size=(H, W), mode='bicubic', align_corners=False
        ).reshape(1, -1, H * W).permute(0, 2, 1).to(target_dtype)
        return pos_embed

    def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
        target_dtype = self.patch_embedding.weight.dtype
        patch_embeds = self.patch_embedding(pixel_values)  # [*, channel, width, height]
        batch_size, _, height, width = patch_embeds.shape
        patch_embeds = patch_embeds.flatten(2).transpose(1, 2)
        class_embeds = self.class_embedding.expand(batch_size, 1, -1).to(target_dtype)
        embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
        position_embedding = torch.cat([
            self.position_embedding[:, :1, :],
            self._get_pos_embed(self.position_embedding[:, 1:, :], height, width)
        ], dim=1)
        embeddings = embeddings + position_embedding.to(target_dtype)
        return embeddings


class InternAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper."""

    def __init__(self, config: InternVisionConfig):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.use_flash_attn = config.use_flash_attn and has_flash_attn
        if config.use_flash_attn and not has_flash_attn:
            print('Warning: Flash Attention is not available, use_flash_attn is set to False.')
        self.head_dim = self.embed_dim // self.num_heads
        if self.head_dim * self.num_heads != self.embed_dim:
            raise ValueError(
                f'embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} '
                f'and `num_heads`: {self.num_heads}).'
            )

        self.scale = self.head_dim ** -0.5
        self.qkv = nn.Linear(self.embed_dim, 3 * self.embed_dim, bias=config.qkv_bias)
        self.attn_drop = nn.Dropout(config.attention_dropout)
        self.proj_drop = nn.Dropout(config.dropout)

        self.qk_normalization = config.qk_normalization

        if self.qk_normalization:
            self.q_norm = InternRMSNorm(self.embed_dim, eps=config.layer_norm_eps)
            self.k_norm = InternRMSNorm(self.embed_dim, eps=config.layer_norm_eps)

        if self.use_flash_attn:
            self.inner_attn = FlashAttention(attention_dropout=config.attention_dropout)
        self.proj = nn.Linear(self.embed_dim, self.embed_dim)

    def _naive_attn(self, x):
        B, N, C = x.shape
        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
        q, k, v = qkv.unbind(0)

        if self.qk_normalization:
            B_, H_, N_, D_ = q.shape
            q = self.q_norm(q.transpose(1, 2).flatten(-2, -1)).view(B_, N_, H_, D_).transpose(1, 2)
            k = self.k_norm(k.transpose(1, 2).flatten(-2, -1)).view(B_, N_, H_, D_).transpose(1, 2)

        attn = (q * self.scale) @ k.transpose(-2, -1)
        attn = attn.softmax(dim=-1)
        attn = self.attn_drop(attn)

        x = (attn @ v).transpose(1, 2).reshape(B, N, C)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x

    def _flash_attn(self, x, key_padding_mask=None, need_weights=False):
        qkv = self.qkv(x)
        qkv = rearrange(qkv, 'b s (three h d) -> b s three h d', three=3, h=self.num_heads)

        if self.qk_normalization:
            q, k, v = qkv.unbind(2)
            q = self.q_norm(q.flatten(-2, -1)).view(q.shape)
            k = self.k_norm(k.flatten(-2, -1)).view(k.shape)
            qkv = torch.stack([q, k, v], dim=2)

        context, _ = self.inner_attn(
            qkv, key_padding_mask=key_padding_mask,
            need_weights=need_weights, causal=False
        )
        outs = self.proj(rearrange(context, 'b s h d -> b s (h d)'))
        outs = self.proj_drop(outs)
        return outs

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        x = self._naive_attn(hidden_states) if not self.use_flash_attn else self._flash_attn(hidden_states)
        return x


class InternMLP(nn.Module):
    def __init__(self, config: InternVisionConfig):
        super().__init__()
        self.config = config
        self.act = ACT2FN[config.hidden_act]
        self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
        self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.fc1(hidden_states)
        hidden_states = self.act(hidden_states)
        hidden_states = self.fc2(hidden_states)
        return hidden_states


class InternVisionEncoderLayer(nn.Module):
    def __init__(self, config: InternVisionConfig, drop_path_rate: float):
        super().__init__()
        self.embed_dim = config.hidden_size
        self.intermediate_size = config.intermediate_size
        self.norm_type = config.norm_type

        self.attn = InternAttention(config)
        self.mlp = InternMLP(config)
        self.norm1 = NORM2FN[self.norm_type](self.embed_dim, eps=config.layer_norm_eps)
        self.norm2 = NORM2FN[self.norm_type](self.embed_dim, eps=config.layer_norm_eps)

        self.ls1 = nn.Parameter(config.initializer_factor * torch.ones(self.embed_dim))
        self.ls2 = nn.Parameter(config.initializer_factor * torch.ones(self.embed_dim))
        self.drop_path1 = DropPath(drop_path_rate) if drop_path_rate > 0. else nn.Identity()
        self.drop_path2 = DropPath(drop_path_rate) if drop_path_rate > 0. else nn.Identity()

    def forward(
            self,
            hidden_states: torch.Tensor,
    ) -> Tuple[torch.FloatTensor, Optional[torch.FloatTensor], Optional[Tuple[torch.FloatTensor]]]:
        """
        Args:
            hidden_states (`Tuple[torch.FloatTensor, Optional[torch.FloatTensor]]`):
                input to the layer of shape `(batch, seq_len, embed_dim)`
        """
        hidden_states = hidden_states + self.drop_path1(
            self.attn(self.norm1(hidden_states).to(hidden_states.dtype)) * self.ls1
        )
        hidden_states = hidden_states + self.drop_path2(
            self.mlp(self.norm2(hidden_states).to(hidden_states.dtype)) * self.ls2
        )
        return hidden_states


class InternVisionEncoder(nn.Module):
    """
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers.
    Each layer is a [`InternVisionEncoderLayer`].

    Args:
        config (`InternVisionConfig`):
            The corresponding vision configuration for the `InternVisionEncoder`.
    """

    def __init__(self, config: InternVisionConfig):
        super().__init__()
        self.config = config
        # stochastic depth decay rule
        dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, config.num_hidden_layers)]
        self.layers = nn.ModuleList([
            InternVisionEncoderLayer(config, dpr[idx]) for idx in range(config.num_hidden_layers)
        ])
        self.gradient_checkpointing = True

    def forward(
            self,
            inputs_embeds,
            output_hidden_states: Optional[bool] = None,
            return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutput]:
        r"""
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Embedded representation of the inputs. Should be float, not int tokens.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states`
                under returned tensors for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        """
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None
            else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        encoder_states = () if output_hidden_states else None
        hidden_states = inputs_embeds

        for idx, encoder_layer in enumerate(self.layers):
            if output_hidden_states:
                encoder_states = encoder_states + (hidden_states,)
            if self.gradient_checkpointing and self.training:
                layer_outputs = torch.utils.checkpoint.checkpoint(
                    encoder_layer,
                    hidden_states
                )
            else:
                layer_outputs = encoder_layer(hidden_states)
            hidden_states = layer_outputs

        if output_hidden_states:
            encoder_states = encoder_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, encoder_states] if v is not None)
        return BaseModelOutput(
            last_hidden_state=hidden_states,
            hidden_states=encoder_states
        )


class InternVisionModel(PreTrainedModel):
    main_input_name = 'pixel_values'
    _supports_flash_attn_2 = True
    supports_gradient_checkpointing = True
    config_class = InternVisionConfig
    _no_split_modules = ['InternVisionEncoderLayer']

    def __init__(self, config: InternVisionConfig):
        super().__init__(config)
        self.config = config

        self.embeddings = InternVisionEmbeddings(config)
        self.encoder = InternVisionEncoder(config)

    def resize_pos_embeddings(self, old_size, new_size, patch_size):
        pos_emb = self.embeddings.position_embedding
        _, num_positions, embed_dim = pos_emb.shape
        cls_emb = pos_emb[:, :1, :]
        pos_emb = pos_emb[:, 1:, :].reshape(
            1, old_size // patch_size, old_size // patch_size, -1
        ).permute(0, 3, 1, 2)
        pos_emb = F.interpolate(
            pos_emb.float(), size=new_size // patch_size, mode='bicubic', align_corners=False
        )
        pos_emb = pos_emb.to(cls_emb.dtype).reshape(1, embed_dim, -1).permute(0, 2, 1)
        pos_emb = torch.cat([cls_emb, pos_emb], dim=1)
        self.embeddings.position_embedding = nn.Parameter(pos_emb)
        self.embeddings.image_size = new_size
        logger.info('Resized position embeddings from {} to {}'.format(old_size, new_size))

    def get_input_embeddings(self):
        return self.embeddings

    def forward(
            self,
            pixel_values: Optional[torch.FloatTensor] = None,
            output_hidden_states: Optional[bool] = None,
            return_dict: Optional[bool] = None,
            pixel_embeds: Optional[torch.FloatTensor] = None,
    ) -> Union[Tuple, BaseModelOutputWithPooling]:
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None
            else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if pixel_values is None and pixel_embeds is None:
            raise ValueError('You have to specify pixel_values or pixel_embeds')

        if pixel_embeds is not None:
            hidden_states = pixel_embeds
        else:
            if len(pixel_values.shape) == 4:
                hidden_states = self.embeddings(pixel_values)
            else:
                raise ValueError(f'wrong pixel_values size: {pixel_values.shape}')

        encoder_outputs = self.encoder(
            inputs_embeds=hidden_states,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        last_hidden_state = encoder_outputs.last_hidden_state
        pooled_output = last_hidden_state[:, 0, :]

        if not return_dict:
            return (last_hidden_state, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )
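
A self-contained sketch of the position-embedding trick used in `InternVisionEmbeddings._get_pos_embed` and `resize_pos_embeddings` above: the learned grid of patch positions is trained at one resolution and bicubically interpolated to the runtime grid, while the CLS position is carried over unchanged. The sizes below are illustrative only, not the model's real dimensions:

# Standalone illustration of interpolating ViT position embeddings to a new grid.
import torch
import torch.nn.functional as F

embed_dim, train_grid, runtime_grid = 64, 16, 32          # assumed toy sizes
pos_embed = torch.randn(1, 1 + train_grid ** 2, embed_dim)  # [CLS] + 16x16 patch positions
cls_pos, patch_pos = pos_embed[:, :1, :], pos_embed[:, 1:, :]

# (1, N, C) -> (1, C, H, W), resize with bicubic, then back to (1, N', C)
patch_pos = patch_pos.reshape(1, train_grid, train_grid, embed_dim).permute(0, 3, 1, 2)
patch_pos = F.interpolate(patch_pos, size=(runtime_grid, runtime_grid),
                          mode='bicubic', align_corners=False)
patch_pos = patch_pos.reshape(1, embed_dim, -1).permute(0, 2, 1)

resized = torch.cat([cls_pos, patch_pos], dim=1)
print(resized.shape)  # torch.Size([1, 1025, 64]) == [CLS] + 32x32 patch positions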
modeling_internvl_chat.py ADDED
@@ -0,0 +1,382 @@
# --------------------------------------------------------
# InternVL
# Copyright (c) 2024 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------

import warnings
from typing import List, Optional, Tuple, Union

import torch
import torch.utils.checkpoint
import transformers
from torch import nn
from torch.nn import CrossEntropyLoss
from transformers import (AutoModel, GenerationConfig, LlamaForCausalLM,
                          Qwen2ForCausalLM)
from transformers.modeling_outputs import CausalLMOutputWithPast
from transformers.modeling_utils import PreTrainedModel
from transformers.utils import ModelOutput, logging

from .configuration_internvl_chat import InternVLChatConfig
from .conversation import get_conv_template
from .modeling_intern_vit import InternVisionModel, has_flash_attn

logger = logging.get_logger(__name__)


def version_cmp(v1, v2, op='eq'):
    import operator
    from packaging import version
    op_func = getattr(operator, op)
    return op_func(version.parse(v1), version.parse(v2))


class InternVLChatModel(PreTrainedModel):
    config_class = InternVLChatConfig
    main_input_name = 'pixel_values'
    base_model_prefix = 'language_model'
    _supports_flash_attn_2 = True
    supports_gradient_checkpointing = True
    _no_split_modules = ['InternVisionModel', 'LlamaDecoderLayer', 'Qwen2DecoderLayer']
    _tied_weights_keys = []

    @property
    def all_tied_weights_keys(self):
        # In newer Transformers, PreTrainedModel.all_tied_weights_keys is a property that
        # can raise AttributeError internally when submodules haven't declared
        # _tied_weights_keys, causing nn.Module.__getattr__ to re-raise it as
        # "object has no attribute 'all_tied_weights_keys'".
        # Override to always return an empty dict: this model has no tied weights.
        return {}

    def __init__(self, config: InternVLChatConfig, vision_model=None, language_model=None, use_flash_attn=True):
        super().__init__(config)

        assert version_cmp(transformers.__version__, '4.37.0', 'ge')
        image_size = config.force_image_size or config.vision_config.image_size
        patch_size = config.vision_config.patch_size
        self.patch_size = patch_size
        self.select_layer = config.select_layer
        self.template = config.template
        self.num_image_token = int((image_size // patch_size) ** 2 * (config.downsample_ratio ** 2))
        self.downsample_ratio = config.downsample_ratio
        self.ps_version = config.ps_version
        use_flash_attn = use_flash_attn if has_flash_attn else False
        config.vision_config.use_flash_attn = True if use_flash_attn else False
        config.llm_config._attn_implementation = 'flash_attention_2' if use_flash_attn else 'eager'

        logger.info(f'num_image_token: {self.num_image_token}')
        logger.info(f'ps_version: {self.ps_version}')

        if vision_model is not None:
            self.vision_model = vision_model
        else:
            self.vision_model = InternVisionModel(config.vision_config)

        if language_model is not None:
            self.language_model = language_model
        else:
            if config.llm_config.architectures[0] == 'LlamaForCausalLM':
                self.language_model = LlamaForCausalLM(config.llm_config)
            elif config.llm_config.architectures[0] == 'Qwen2ForCausalLM':
                self.language_model = Qwen2ForCausalLM(config.llm_config)
            else:
                raise NotImplementedError(f'{config.llm_config.architectures[0]} is not implemented.')

        vit_hidden_size = config.vision_config.hidden_size
        llm_hidden_size = config.llm_config.hidden_size

        self.mlp1 = nn.Sequential(
            nn.LayerNorm(vit_hidden_size * int(1 / self.downsample_ratio) ** 2),
            nn.Linear(vit_hidden_size * int(1 / self.downsample_ratio) ** 2, llm_hidden_size),
            nn.GELU(),
            nn.Linear(llm_hidden_size, llm_hidden_size)
        )

        self.img_context_token_id = None
        self.conv_template = get_conv_template(self.template)
        self.system_message = self.conv_template.system_message

    def forward(
            self,
            pixel_values: torch.FloatTensor,
            input_ids: torch.LongTensor = None,
            attention_mask: Optional[torch.Tensor] = None,
            position_ids: Optional[torch.LongTensor] = None,
            image_flags: Optional[torch.LongTensor] = None,
            past_key_values: Optional[List[torch.FloatTensor]] = None,
            labels: Optional[torch.LongTensor] = None,
            use_cache: Optional[bool] = None,
            output_attentions: Optional[bool] = None,
            output_hidden_states: Optional[bool] = None,
            return_dict: Optional[bool] = None,
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        image_flags = image_flags.squeeze(-1)
        input_embeds = self.language_model.get_input_embeddings()(input_ids).clone()

        vit_embeds = self.extract_feature(pixel_values)
        vit_embeds = vit_embeds[image_flags == 1]
        vit_batch_size = pixel_values.shape[0]

        B, N, C = input_embeds.shape
        input_embeds = input_embeds.reshape(B * N, C)

        if torch.distributed.is_initialized() and torch.distributed.get_rank() == 0:
            print(f'dynamic ViT batch size: {vit_batch_size}, images per sample: '
                  f'{vit_batch_size / B}, dynamic token length: {N}')

        input_ids = input_ids.reshape(B * N)
        selected = (input_ids == self.img_context_token_id)
        try:
            input_embeds[selected] = input_embeds[selected] * 0.0 + vit_embeds.reshape(-1, C)
        except Exception as e:
            vit_embeds = vit_embeds.reshape(-1, C)
            print(f'warning: {e}, input_embeds[selected].shape={input_embeds[selected].shape}, '
                  f'vit_embeds.shape={vit_embeds.shape}')
            n_token = min(selected.sum(), vit_embeds.size(0))
            input_embeds[selected][:n_token] = (
                input_embeds[selected][:n_token] * 0.0 + vit_embeds[:n_token]
            )

        input_embeds = input_embeds.reshape(B, N, C)

        outputs = self.language_model(
            inputs_embeds=input_embeds,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        logits = outputs.logits

        loss = None
        if labels is not None:
            # Shift so that tokens < n predict n
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            # Flatten the tokens
            loss_fct = CrossEntropyLoss()
            shift_logits = shift_logits.view(-1, self.language_model.config.vocab_size)
            shift_labels = shift_labels.view(-1)
            # Enable model parallelism
            shift_labels = shift_labels.to(shift_logits.device)
            loss = loss_fct(shift_logits, shift_labels)

        if not return_dict:
            output = (logits,) + outputs[1:]
            return (loss,) + output if loss is not None else output

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def pixel_shuffle(self, x, scale_factor=0.5):
        n, w, h, c = x.size()
        # N, W, H, C --> N, W, H * scale, C // scale
        x = x.view(n, w, int(h * scale_factor), int(c / scale_factor))
        # N, W, H * scale, C // scale --> N, H * scale, W, C // scale
        x = x.permute(0, 2, 1, 3).contiguous()
        # N, H * scale, W, C // scale --> N, H * scale, W * scale, C // (scale ** 2)
        x = x.view(n, int(h * scale_factor), int(w * scale_factor),
                   int(c / (scale_factor * scale_factor)))
        if self.ps_version == 'v1':
            warnings.warn("In ps_version 'v1', the height and width have not been swapped back, "
                          'which results in a transposed image.')
        else:
            x = x.permute(0, 2, 1, 3).contiguous()
        return x

    def extract_feature(self, pixel_values):
        if self.select_layer == -1:
            vit_embeds = self.vision_model(
                pixel_values=pixel_values,
                output_hidden_states=False,
                return_dict=True
            ).last_hidden_state
        else:
            vit_embeds = self.vision_model(
                pixel_values=pixel_values,
                output_hidden_states=True,
                return_dict=True
            ).hidden_states[self.select_layer]
        vit_embeds = vit_embeds[:, 1:, :]

        h = w = int(vit_embeds.shape[1] ** 0.5)
        vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], h, w, -1)
        vit_embeds = self.pixel_shuffle(vit_embeds, scale_factor=self.downsample_ratio)
        vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], -1, vit_embeds.shape[-1])
        vit_embeds = self.mlp1(vit_embeds)
        return vit_embeds

    def batch_chat(self, tokenizer, pixel_values, questions, generation_config,
                   num_patches_list=None, history=None, return_history=False,
                   IMG_START_TOKEN='<img>', IMG_END_TOKEN='</img>',
                   IMG_CONTEXT_TOKEN='<IMG_CONTEXT>', verbose=False, image_counts=None):
        if history is not None or return_history:
            print('Now multi-turn chat is not supported in batch_chat.')
            raise NotImplementedError

        if image_counts is not None:
            num_patches_list = image_counts
            print('Warning: `image_counts` is deprecated. Please use `num_patches_list` instead.')

        img_context_token_id = tokenizer.convert_tokens_to_ids(IMG_CONTEXT_TOKEN)
        self.img_context_token_id = img_context_token_id

        if verbose and pixel_values is not None:
            image_bs = pixel_values.shape[0]
            print(f'dynamic ViT batch size: {image_bs}')

        queries = []
        for idx, num_patches in enumerate(num_patches_list):
            question = questions[idx]
            if pixel_values is not None and '<image>' not in question:
                question = '<image>\n' + question
            template = get_conv_template(self.template)
            template.system_message = self.system_message
            template.append_message(template.roles[0], question)
            template.append_message(template.roles[1], None)
            query = template.get_prompt()

            image_tokens = (IMG_START_TOKEN
                            + IMG_CONTEXT_TOKEN * self.num_image_token * num_patches
                            + IMG_END_TOKEN)
            query = query.replace('<image>', image_tokens, 1)
            queries.append(query)

        tokenizer.padding_side = 'left'
        model_inputs = tokenizer(queries, return_tensors='pt', padding=True)
        input_ids = model_inputs['input_ids'].to(self.device)
        attention_mask = model_inputs['attention_mask'].to(self.device)
        eos_token_id = tokenizer.convert_tokens_to_ids(template.sep.strip())
        generation_config['eos_token_id'] = eos_token_id
        generation_output = self.generate(
            pixel_values=pixel_values,
            input_ids=input_ids,
            attention_mask=attention_mask,
            **generation_config
        )
        responses = tokenizer.batch_decode(generation_output, skip_special_tokens=True)
        responses = [response.split(template.sep.strip())[0].strip() for response in responses]
        return responses

    def chat(self, tokenizer, pixel_values, question, generation_config,
             history=None, return_history=False, num_patches_list=None,
             IMG_START_TOKEN='<img>', IMG_END_TOKEN='</img>',
             IMG_CONTEXT_TOKEN='<IMG_CONTEXT>', verbose=False):

        if history is None and pixel_values is not None and '<image>' not in question:
            question = '<image>\n' + question

        if num_patches_list is None:
            num_patches_list = [pixel_values.shape[0]] if pixel_values is not None else []
        assert pixel_values is None or len(pixel_values) == sum(num_patches_list)

        img_context_token_id = tokenizer.convert_tokens_to_ids(IMG_CONTEXT_TOKEN)
        self.img_context_token_id = img_context_token_id

        template = get_conv_template(self.template)
        template.system_message = self.system_message
        eos_token_id = tokenizer.convert_tokens_to_ids(template.sep.strip())

        history = [] if history is None else history
        for (old_question, old_answer) in history:
            template.append_message(template.roles[0], old_question)
            template.append_message(template.roles[1], old_answer)
        template.append_message(template.roles[0], question)
        template.append_message(template.roles[1], None)
        query = template.get_prompt()

        if verbose and pixel_values is not None:
            image_bs = pixel_values.shape[0]
            print(f'dynamic ViT batch size: {image_bs}')

        for num_patches in num_patches_list:
            image_tokens = (IMG_START_TOKEN
                            + IMG_CONTEXT_TOKEN * self.num_image_token * num_patches
                            + IMG_END_TOKEN)
            query = query.replace('<image>', image_tokens, 1)

        model_inputs = tokenizer(query, return_tensors='pt')
        input_ids = model_inputs['input_ids'].to(self.device)
        attention_mask = model_inputs['attention_mask'].to(self.device)
        generation_config['eos_token_id'] = eos_token_id
        generation_output = self.generate(
            pixel_values=pixel_values,
            input_ids=input_ids,
            attention_mask=attention_mask,
            **generation_config
        )
        response = tokenizer.batch_decode(generation_output, skip_special_tokens=True)[0]
        response = response.split(template.sep.strip())[0].strip()
        history.append((question, response))
        if return_history:
            return response, history
        else:
            query_to_print = query.replace(IMG_CONTEXT_TOKEN, '')
            query_to_print = query_to_print.replace(f'{IMG_START_TOKEN}{IMG_END_TOKEN}', '<image>')
            if verbose:
                print(query_to_print, response)
            return response

    @torch.no_grad()
    def generate(
            self,
            pixel_values: Optional[torch.FloatTensor] = None,
            input_ids: Optional[torch.FloatTensor] = None,
            attention_mask: Optional[torch.LongTensor] = None,
            visual_features: Optional[torch.FloatTensor] = None,
            generation_config: Optional[GenerationConfig] = None,
            output_hidden_states: Optional[bool] = None,
            **generate_kwargs,
    ) -> torch.LongTensor:

        assert self.img_context_token_id is not None
        if pixel_values is not None:
            if visual_features is not None:
                vit_embeds = visual_features
            else:
                vit_embeds = self.extract_feature(pixel_values)
            input_embeds = self.language_model.get_input_embeddings()(input_ids)
            B, N, C = input_embeds.shape
            input_embeds = input_embeds.reshape(B * N, C)

            input_ids = input_ids.reshape(B * N)
            selected = (input_ids == self.img_context_token_id)
            assert selected.sum() != 0
            input_embeds[selected] = vit_embeds.reshape(-1, C).to(input_embeds.device)

            input_embeds = input_embeds.reshape(B, N, C)
        else:
            input_embeds = self.language_model.get_input_embeddings()(input_ids)

        outputs = self.language_model.generate(
            inputs_embeds=input_embeds,
            attention_mask=attention_mask,
            generation_config=generation_config,
            output_hidden_states=output_hidden_states,
            use_cache=True,
            **generate_kwargs,
        )

        return outputs

    @property
    def lm_head(self):
        return self.language_model.get_output_embeddings()

    def get_input_embeddings(self):
        return self.language_model.get_input_embeddings()

    def get_output_embeddings(self):
        return self.language_model.get_output_embeddings()
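
For reference, a hedged end-to-end sketch of `InternVLChatModel.chat`. It assumes the usual remote-code loading path for this repository and that `pixel_values` is a preprocessed `(num_tiles, 3, 448, 448)` tensor; the checkpoint id is a placeholder, and the random tensor below only exercises the code path (real use requires the image preprocessing shown in the model card, which is not part of this change):

import torch
from transformers import AutoModel, AutoTokenizer

path = 'OpenGVLab/InternVL2_5-8B'  # placeholder checkpoint id; substitute the actual repo
model = AutoModel.from_pretrained(
    path, torch_dtype=torch.bfloat16, trust_remote_code=True
).eval().cuda()
tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True, use_fast=False)

# Stand-in for a real preprocessed image: one 448x448 tile in the model's dtype/device.
pixel_values = torch.randn(1, 3, 448, 448, dtype=torch.bfloat16).cuda()

generation_config = dict(max_new_tokens=256, do_sample=False)
question = 'Please describe the image in detail.'  # chat() prepends the '<image>' placeholder itself
response = model.chat(tokenizer, pixel_values, question, generation_config)
print(response)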