root committed on
Commit 4e18cb4 · 1 Parent(s): a2af15a

update code

configuration_yuanvl.py CHANGED
@@ -1,6 +1,6 @@
 # --------------------------------------------------------
 # InternVL
-# Copyright (c) 2024 OpenGVLab
+# Copyright (c) 2024 YuanLabAI
 # Licensed under The MIT License [see LICENSE for details]
 # --------------------------------------------------------
 
conversation.py CHANGED
@@ -391,7 +391,7 @@ register_conv_template(
     Conversation(
         name='yuan-chat',
         system_template='<|im_start|>system\n{system_message}',
-        system_message='你是IEI-源多模态模型,英文名是YuanVL,是由浪潮信息开发的多模态大语言模型。',
+        system_message='你是Yuan3.0 Flash多模态大模型,由YuanLab.ai 团队开发的多模态大语言模型。',
         roles=('<|im_start|>user\n', '<|im_start|>assistant\n'),
         sep_style=SeparatorStyle.MPT,
         sep='<|im_end|>\n',
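Note: the old system message reads "You are the IEI-Yuan multimodal model, English name YuanVL, a multimodal large language model developed by Inspur Information"; the new one reads "You are the Yuan3.0 Flash multimodal large model, a multimodal large language model developed by the YuanLab.ai team." For reference, a minimal sketch of how an MPT-style template with these fields assembles a prompt; render_yuan_chat is a hypothetical helper, not the repo's actual Conversation.get_prompt:

    # Illustrative only: mirrors the Conversation fields registered above.
    def render_yuan_chat(system_message: str, turns: list) -> str:
        system_template = '<|im_start|>system\n{system_message}'
        roles = ('<|im_start|>user\n', '<|im_start|>assistant\n')
        sep = '<|im_end|>\n'  # SeparatorStyle.MPT closes every block with sep
        prompt = system_template.format(system_message=system_message) + sep
        for role_idx, text in turns:
            prompt += roles[role_idx] + text + sep
        return prompt + roles[1]  # leave an open assistant turn to complete

    print(render_yuan_chat('You are Yuan3.0 Flash...', [(0, 'Describe this image.')]))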
modeling_yuanlm2.py CHANGED
@@ -1,5 +1,5 @@
 # coding=utf-8
-# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+# Copyright 2022 YuanLabAI and the HuggingFace Inc. team. All rights reserved.
 #
 # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
 # and OPT implementations in this library. It has been modified from its
@@ -35,7 +35,6 @@ from einops import rearrange
 #from apex.normalization import MixedFusedRMSNorm as RMSNorm
 #from flash_attn import flash_attn_func
 from transformer_engine.pytorch import RMSNorm
-import pdb
 import copy
 try:
     import grouped_gemm as gg
@@ -52,22 +51,6 @@ logger = logging.get_logger(__name__)
 
 _CONFIG_FOR_DOC = "YuanConfig"
 
-"""
-class YuanRotaryEmbedding(nn.Module):
-    def __init__(self, dim, base=10000, dtype=torch.float32, device=None, scaling_factor=1.0, rope_type='default'):
-        super().__init__()
-        inv_freq = (1.0 / (base ** (torch.arange(0, dim, 2).float() / dim))).to(dtype)#.to('cuda:1')
-        self.register_buffer('inv_freq', inv_freq)
-
-    def forward(self, max_seq_len, offset=0):
-        self.inv_freq = self.inv_freq.to(torch.float32)
-        seq = torch.arange(max_seq_len, device=self.inv_freq.device) + offset
-        freqs = einsum('i , j -> i j', seq.type_as(self.inv_freq), self.inv_freq)
-        # first part even vector components, second part odd vector components,
-        # 2 * dim in dimension size
-        emb = torch.cat((freqs, freqs), dim=-1)
-        # emb [seq_length, .., dim]
-        return emb[:, None, None, :]"""
 
 class YuanRotaryEmbedding(nn.Module):
     def __init__(self, dim, base=10000, dtype=torch.float32, rotary_interleaved=False, seq_len_interpolation_factor=None):
@@ -125,17 +108,10 @@ class YuanRotaryEmbedding(nn.Module):
         )
         # emb [seq_length, .., dim]
         emb = emb[:, None, None, :]
-        #emb = emb[:, None, :]
         return emb
 
 
 def _rotate_half(x, rotary_interleaved):
-    """huggingface version
-    change sign so the last dimension becomes [-odd, +even]
-
-    x1, x2 = torch.chunk(x, 2, dim=-1)
-    return torch.cat((-x2, x1), dim=-1)
-    """
     if not rotary_interleaved:
         x1, x2 = torch.chunk(x, 2, dim=-1)
         return torch.cat((-x2, x1), dim=-1)
@@ -162,24 +138,6 @@ def apply_rotary_pos_emb(t, freqs, position_ids, rotary_interleaved=False):
 
     t = (t * cos_) + (_rotate_half(t, rotary_interleaved) * sin_)
     return torch.cat((t, t_pass), dim=-1)
-    """huggingface version
-    input tensor t is of shape [seq_length, ..., dim]
-    rotary positional embeding tensor freqs is of shape [seq_length, ..., dim]
-    check https://kexue.fm/archives/8265 for detailed formulas
-
-    dtype = t.dtype
-    rot_dim = freqs.shape[-1]
-    t_pass = t[..., rot_dim:]
-    if position_ids.shape[1] > 1:
-        freqs = freqs[position_ids]
-        freqs = freqs.view(t.shape[1],freqs.shape[1],freqs.shape[2],freqs.shape[4]).transpose(0,1)
-    # ideally t_pass is empty so rotary pos embedding is applied to all tensor t
-    t = t[..., :rot_dim]
-    # first part is cosine component
-    # second part is sine component, need to change signs with _rotate_half method
-    t = (t * freqs.cos()) + (_rotate_half(t) * freqs.sin())
-    t = t.to(dtype)
-    """
 
     return torch.cat((t, t_pass), dim=-1)
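The hunks above keep the Megatron-style rotary path (_rotate_half plus apply_rotary_pos_emb) and drop the commented-out HuggingFace variant. A self-contained sketch of the non-interleaved rotation those functions implement, with shapes simplified to [seq, dim]:

    import torch

    def rotate_half(x: torch.Tensor) -> torch.Tensor:
        # Non-interleaved convention, as in _rotate_half above:
        # [x1, x2] -> [-x2, x1], so t*cos + rotate_half(t)*sin rotates each
        # (x1_i, x2_i) channel pair by a position-dependent angle.
        x1, x2 = torch.chunk(x, 2, dim=-1)
        return torch.cat((-x2, x1), dim=-1)

    dim, seq = 8, 4
    inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2).float() / dim))  # [dim/2]
    freqs = torch.outer(torch.arange(seq).float(), inv_freq)             # [seq, dim/2]
    emb = torch.cat((freqs, freqs), dim=-1)                              # [seq, dim]

    t = torch.randn(seq, dim)
    t_rot = t * emb.cos() + rotate_half(t) * emb.sin()
    print(t_rot.shape)  # torch.Size([4, 8])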
@@ -269,53 +227,6 @@ class LocalizedFiltering(torch.nn.Module):
         lf_output = self.output_layernorm(output2 + residual)
 
         return lf_output
-        '''#IEIyuan huggingface version
-        if before_hidden_states == None:
-            inputs = inputs.transpose(0,1)
-            seq_len, bsz, embed_dim = inputs.size()
-            if embed_dim != self.embed_dim:
-                raise ValueError(
-                    f"Unexpected embedding dimension received: input is {embed_dim}, model expects {self.embed_dim}"
-                )
-            residual = inputs
-            inputs = inputs.view(seq_len, 1, bsz, embed_dim).permute(2, 3, 0, 1)
-            inputs = torch.cat((torch.zeros(bsz, embed_dim, 1, 1, dtype=inputs.dtype, device=inputs.device), inputs), dim=2).contiguous()
-            output1 = self.conv1(inputs)
-
-            output1 = torch.cat((torch.zeros(bsz, embed_dim // 2, 1, 1, dtype=inputs.dtype, device=inputs.device), output1), dim=2).contiguous()
-            output2 = self.conv2(output1).permute(2, 3, 0, 1).contiguous()
-            output2 = output2.view(seq_len, bsz, embed_dim)
-            assert output2.shape == residual.shape
-            norm_input = (output2 + residual)#.to('cuda:0')
-            torch.cuda.set_device(norm_input.device)
-            lf_output = self.output_layernorm(norm_input)
-            lf_output = lf_output#.to('cuda:1')
-            lf_output = lf_output.transpose(0,1)
-            return lf_output
-        else:
-            inputs = inputs.transpose(0,1)
-            before_hidden_states = before_hidden_states.transpose(0,1)
-            seq_len, bsz, embed_dim = inputs.size()
-            if embed_dim != self.embed_dim:
-                raise ValueError(
-                    f"Unexpected embedding dimension received: input is {embed_dim}, model expects {self.embed_dim}"
-                )
-            residual = inputs
-            inputs = inputs.view(seq_len, 1, bsz, embed_dim).permute(2, 3, 0, 1)
-            before_hidden_states = before_hidden_states.view(2, 1, bsz, embed_dim).permute(2, 3, 0, 1)
-            inputs = torch.cat((before_hidden_states, inputs), dim=2).contiguous()
-            output1 = self.conv1(inputs)
-            output2 = self.conv2(output1).permute(2, 3, 0, 1).contiguous()
-            output2 = output2.view(seq_len, bsz, embed_dim)
-            assert output2.shape == residual.shape
-
-            norm_input = (output2 + residual)#.to('cuda:0')
-            torch.cuda.set_device(norm_input.device)
-            lf_output = self.output_layernorm(norm_input)
-            lf_output = lf_output#.to('cuda:1')
-            lf_output = lf_output.transpose(0,1)
-            return lf_output
-        '''
 
 
     def forward(
@@ -427,8 +338,6 @@ class FlashSelfAttention(torch.nn.Module):
             # only on first autoregressive step q,k,v have same seqlen
             is_causal = seqlen_q == seqlen_k
             cu_seqlens_k = torch.arange(0, (batch_size + 1) * seqlen_k, step=seqlen_k, dtype=torch.int32, device=q.device)
-            #cu_seqlens_q = [cu_seqlens_q[0], cu_seqlens_q[-1]]
-            #cu_seqlens_k = [cu_seqlens_k[0], cu_seqlens_k[-1]]
            dropout_p = 0
 
             output = flash_attn_unpadded_func(q, k, v, cu_seqlens_q, cu_seqlens_k, seqlen_q, seqlen_k, dropout_p, softmax_scale=self.softmax_scale, causal=is_causal)
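The cu_seqlens_k kept above encodes cumulative sequence boundaries for the varlen flash-attention interface; for a fixed-length batch they are simply multiples of seqlen_k. A small CPU-only illustration with made-up batch sizes:

    import torch

    batch_size, seqlen_k = 3, 5
    cu_seqlens_k = torch.arange(0, (batch_size + 1) * seqlen_k, step=seqlen_k,
                                dtype=torch.int32)
    print(cu_seqlens_k)  # tensor([ 0,  5, 10, 15], dtype=torch.int32)
    # Sequence i occupies rows cu_seqlens_k[i]:cu_seqlens_k[i+1] of the
    # flattened (total_tokens, num_heads, head_dim) tensor passed to the kernel.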
@@ -561,8 +470,6 @@ class YuanAttention(nn.Module):
         self.lf_gate = LocalizedFiltering(self.hidden_size, self.lf_conv2d_group, self.lf_conv2d_num_pad)
         self.get_query_key = nn.Linear(self.hidden_size, 2 * self.attention_projection_size, bias=False)
         self.core_attention = FlashSelfAttention(causal=True, attention_dropout=config.attn_dropout, softmax_scale=self.softmax_scale)
-        #self.core_attention_flash = DotProductAttention(num_attention_heads=self.num_heads,
-        #                                                kv_channels=self.head_dim)
 
     def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
         return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
@@ -582,7 +489,6 @@ class YuanAttention(nn.Module):
         q_len, bsz, _ = hidden_states.size()
         hidden_states = hidden_states#.to('cuda:1')
         is_first_step = False
-        import pdb
         if use_cache:
             if past_key_value is None:
                 before_hidden_states = None
@@ -605,7 +511,6 @@ class YuanAttention(nn.Module):
         else:
             hidden_states = self.lf_gate(hidden_states, before_hidden_states)
         mixed_qk_layer = self.get_query_key(hidden_states)
-        #mixed_qk_layer = torch.matmul(hidden_states, qk_tensor)
         new_tensor_shape = mixed_qk_layer.size()[:-1] + (self.num_heads, 2 * self.head_dim)
         mixed_qk_layer = mixed_qk_layer.view(*new_tensor_shape)
         (query_states, key_states) = torch.split(mixed_qk_layer, self.head_dim, dim=-1)
@@ -619,7 +524,6 @@ class YuanAttention(nn.Module):
         if rotary_pos_emb is not None:
             if position_ids.shape[1] == 1:
                 q_seq_start = position_ids[0,-1]
-                #seq_start = past_key_value[0].shape[0]
                 q_seq_end = q_seq_start + 1
                 k_seq_end = q_seq_end
             else:
@@ -633,26 +537,16 @@ class YuanAttention(nn.Module):
         else:
             rotary_pos_emb = ((rotary_pos_emb,) * 2)
         q_pos_emb, k_pos_emb = rotary_pos_emb
-        #q_pos_emb = q_pos_emb[q_seq_start:q_seq_end]
-        #k_pos_emb = k_pos_emb[:k_seq_end]
-        #import pdb
-        #pdb.set_trace()
         if past_key_value is not None:
             # reuse k, v, self_attention
             key_states = torch.cat([past_key_value[0], key_states], dim=0)
             value_states = torch.cat([past_key_value[1], value_states], dim=0)
         past_key_value = (key_states, value_states, inference_hidden_states_memory) if use_cache else None
-        #query_states = apply_rotary_pos_emb(query_states.permute(1, 0, 2, 3), q_pos_emb, position_ids)
-        #key_states = apply_rotary_pos_emb(key_states.permute(1, 0, 2, 3), k_pos_emb, position_ids)
         query_states = apply_rotary_pos_emb(query_states, q_pos_emb, position_ids)
         key_states = apply_rotary_pos_emb(key_states, k_pos_emb, position_ids_k)
 
         attn_weights = None
-        #query_states = query_states.transpose(0,1)
-        #key_states = key_states.transpose(0,1)
-        #value_states = value_states
         attn_output = self.core_attention(query_states, key_states, value_states)
-        #attn_output = self.core_attention(query_states, key_states, value_states, attention_mask)
         q_len, bsz, _, _ = attn_output.shape
         attn_output = attn_output.reshape(q_len, bsz, -1)
 
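Note the cache layout this hunk relies on: keys and values are concatenated along dim=0, i.e. the cache is sequence-first [seq, bsz, num_heads, head_dim], which is why YuanModel below reads the past length from past_key_values[0][0].shape[0]. A toy sketch of that append step, with invented shapes:

    import torch

    bsz, heads, head_dim = 2, 4, 16
    key_cache = torch.randn(7, bsz, heads, head_dim)  # 7 cached positions
    new_key = torch.randn(1, bsz, heads, head_dim)    # current decode step
    key_cache = torch.cat([key_cache, new_key], dim=0)
    print(key_cache.shape[0])  # 8 -> past_key_values_length on the next step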
@@ -743,7 +637,6 @@ class GroupedMLP(nn.Module):
             return torch.nn.functional.silu(x[0]) * x[1]
 
         self.activation_func = glu
-        #self.ffn_hidden_size = config.moe_config['ffn_hidden_size']
         self.ffn_hidden_size = config.ffn_hidden_size
         fc1_output_size_per_partition = self.ffn_hidden_size * 2
         fc2_input_size = self.ffn_hidden_size
@@ -753,10 +646,6 @@ class GroupedMLP(nn.Module):
     def forward(self, permuted_hidden_states, tokens_per_expert):
         torch.cuda.set_device(permuted_hidden_states.device)
         permuted_hidden_states = permuted_hidden_states#.to('cuda:0')
-        #fc1_output = gg.ops.gmm(permuted_hidden_states, self.weight1, tokens_per_expert.cpu(), trans_b=False)
-
-        #intermediate_parallel = self.activation_func(fc1_output)
-        #fc2_output = gg.ops.gmm(intermediate_parallel, self.weight2, tokens_per_expert.cpu(), trans_b=False)
 
         fc2_outputs = []
         start_idx = 0
@@ -764,13 +653,10 @@ class GroupedMLP(nn.Module):
             if tokens_per_expert[i] == 0:
                 continue
             end_idx = start_idx + tokens_per_expert[i]
-            #fc1_output = torch.matmul(permuted_hidden_states[start_idx:end_idx], self.w1[i])
             # Use custom attributes for each expert's Linear layers
 
             fc1_output = self.w1[i](permuted_hidden_states[start_idx:end_idx])
-            #print("shape1:", self.w1[i].shape, "shape2:", permuted_hidden_states[start_idx:end_idx].shape)
             intermediate_parallel = self.activation_func(fc1_output)
-            #fc2_output = torch.matmul(intermediate_parallel, self.w2[i])
             fc2_output = self.w2[i](intermediate_parallel)
             fc2_outputs.append(fc2_output)
             start_idx = end_idx
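With the commented-out gg.ops.gmm path removed, the forward pass loops over experts whose tokens have been permuted into contiguous slices. A runnable miniature of that pattern, with made-up sizes and the same SwiGLU-style glu as above:

    import torch
    import torch.nn as nn

    hidden, ffn_hidden, num_experts = 64, 128, 4  # illustrative sizes
    w1 = nn.ModuleList(nn.Linear(hidden, 2 * ffn_hidden, bias=False) for _ in range(num_experts))
    w2 = nn.ModuleList(nn.Linear(ffn_hidden, hidden, bias=False) for _ in range(num_experts))

    def glu(x):
        x = torch.chunk(x, 2, dim=-1)                  # gate / value halves
        return torch.nn.functional.silu(x[0]) * x[1]

    permuted = torch.randn(10, hidden)                 # tokens grouped by expert
    tokens_per_expert = torch.tensor([3, 0, 5, 2])     # sums to 10

    outs, start = [], 0
    for i in range(num_experts):
        if tokens_per_expert[i] == 0:                  # skip experts with no tokens
            continue
        end = start + tokens_per_expert[i]
        outs.append(w2[i](glu(w1[i](permuted[start:end]))))
        start = end
    print(torch.cat(outs).shape)                       # torch.Size([10, 64])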
@@ -788,7 +674,6 @@ class YuanMoeLayer(nn.Module):
 
         expert_indices_offset = (0)
 
-        #self.gate = ParallelAttention_router(config)
         self.router = ParallelAttention_router(config)
         self.token_dispatcher = MoEDroplessTokenDispatcher(self.num_experts, config=self.config)
         self.experts = GroupedMLP(self.num_experts, self.config)
@@ -800,7 +685,6 @@ class YuanMoeLayer(nn.Module):
 
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         batch_size, sequence_length, hidden_dim = hidden_states.shape
-        #logits = self.gate(hidden_states)
         logits = self.router(hidden_states)
         scores, indices = self.routing(logits)
         scores = scores.to(hidden_states.dtype)
@@ -853,7 +737,6 @@ class YuanDecoderLayer(nn.Module):
                 (see `past_key_values`).
             past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
         """
-        import pdb
         residual = hidden_states#.to('cuda:1')
         torch.cuda.set_device(hidden_states.device)
         hidden_states = self.input_layernorm(hidden_states) #.to('cuda:0')).to('cuda:1')
@@ -870,9 +753,6 @@ class YuanDecoderLayer(nn.Module):
             use_cache=use_cache,
         )
 
-        import pdb
-        #print(hidden_states)
-        #pdb.set_trace()
         hidden_states = residual + hidden_states.permute(1, 0, 2)
 
         # Fully Connected
@@ -1156,8 +1036,6 @@ class YuanModel(YuanPreTrainedModel):
         past_key_values_length = 0
 
         if past_key_values is not None:
-            #past_key_values_length = past_key_values[0][0].shape[2]
-            #modify
             past_key_values_length = past_key_values[0][0].shape[0]
         seq_length_with_past = seq_length_with_past + past_key_values_length
 
@@ -1170,13 +1048,8 @@ class YuanModel(YuanPreTrainedModel):
             position_ids_k = torch.cat((position_ids, position_ids_k), dim=1)
             position_ids = position_ids[:,-1]+past_key_values[0][0].shape[0]-position_ids.shape[1]+1
             position_ids = position_ids.unsqueeze(0)
-            #print(position_ids_k,position_ids)
-            #print(position_ids_k.shape,position_ids.shape)
         else:
             position_ids_k = position_ids
-            #print(position_ids)
-            #import pdb
-            #pdb.set_trace()
 
         if position_ids is None:
             device = input_ids.device if input_ids is not None else inputs_embeds.device
@@ -1285,20 +1158,8 @@ class YuanModel(YuanPreTrainedModel):
 class YuanForCausalLM(YuanPreTrainedModel):
     def __init__(self, config):
         super().__init__(config)
-        '''
-        self.eod_token = config.eod_token
-        self.sep_token = config.sep_token
-        self.use_loss_mask = config.use_loss_mask
-        self.model = YuanModel(config)
-
-        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
-
-        # Initialize weights and apply final processing
-        self.post_init()
-        '''
         self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
         self.model = YuanModel(config)
-        #self.output = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
         self.post_init()
 
     def get_input_embeddings(self):
 
modeling_yuanvl_chat.py CHANGED
@@ -1,6 +1,6 @@
 # --------------------------------------------------------
 # YuanVL
-# Copyright (c) 2024 OpenGVLab
+# Copyright (c) 2024 YuanLabAI
 # Licensed under The MIT License [see LICENSE for details]
 # --------------------------------------------------------
 
@@ -353,11 +353,6 @@ class YuanVLChatModel(PreTrainedModel):
         # vit_embeds: (imbs * num_images, h*w, vit_dim)
         vit_embeds = vit_embeds[:, 1:, :]
 
-        '''h = w = int(vit_embeds.shape[1]**0.5)
-        # vit_embeds: (imbs * num_images, vit_dim, h, w)
-        vit_embeds = vit_embeds.view(vit_embeds.shape[0], h, w, -1)
-        vit_embeds = self.pixel_shuffle(vit_embeds, scale_factor=self.downsample_ratio)
-        '''
         pn, phw, pc = vit_embeds.shape
         ph = pw = int(phw**0.5)
         vit_embeds = vit_embeds.view(pn, ph, pw, pc).permute(0, 3, 1, 2)
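The deleted block was an older reshape path; the kept code folds the CLS-stripped patch sequence back into a channels-first grid before downsampling. A sketch of that flow; the final pixel_unshuffle is a stand-in assumption for the model's own pixel_shuffle(..., scale_factor=self.downsample_ratio), not the file's exact code:

    import torch

    pn, phw, pc = 2, 1024, 768                    # e.g. a 32x32 patch grid
    vit_embeds = torch.randn(pn, phw + 1, pc)     # +1 for the CLS token
    vit_embeds = vit_embeds[:, 1:, :]             # drop CLS, as above

    ph = pw = int(vit_embeds.shape[1] ** 0.5)
    grid = vit_embeds.view(pn, ph, pw, pc).permute(0, 3, 1, 2)   # (pn, pc, ph, pw)

    # Space-to-depth downsample: merges each 2x2 block of patches into channels.
    merged = torch.nn.functional.pixel_unshuffle(grid, 2)
    print(merged.shape)  # torch.Size([2, 3072, 16, 16])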
 