danielhanchen committed on
Commit
ced432c
·
verified ·
1 Parent(s): 1ca8320

Upload folder using huggingface_hub

Browse files
modeling_deepseekocr2.py CHANGED
@@ -1,36 +1,29 @@
1
- import os
2
- import math
3
- import re
4
- from tqdm import tqdm
5
- from abc import ABC
6
  from typing import List, Optional, Tuple, Union
7
-
8
- from addict import Dict
9
  from PIL import Image, ImageOps, ImageDraw, ImageFont
10
- import numpy as np
11
-
12
  import torch
13
  import torch.nn as nn
14
  from torch.nn import CrossEntropyLoss
15
  from torchvision import transforms
16
-
17
- from transformers.cache_utils import Cache
18
- from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
19
- from transformers import DeepseekV2Model, DeepseekV2ForCausalLM
20
- from transformers import DeepseekV2Config
21
- from transformers.models.deepseek_v2.modeling_deepseek_v2 import (
22
- DeepseekV2Attention,
23
- DeepseekV2MLP,
24
- DeepseekV2MoE,
25
- DeepseekV2RMSNorm,
26
- DeepseekV2DecoderLayer,
27
- )
28
- from transformers.models.llama.modeling_llama import LlamaAttention, LlamaRotaryEmbedding
29
- from transformers import TextStreamer
30
  from .deepencoderv2 import build_sam_vit_b, build_qwen2_decoder_as_encoder, MlpProjector
 
 
31
  from .conversation import get_conv_template
 
 
 
 
 
 
 
32
 
33
- torch_dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
34
 
35
  def load_image(image_path):
36
 
@@ -355,22 +348,6 @@ class NoEOSTextStreamer(TextStreamer):
355
  text = text.replace(eos_text, "\n")
356
  print(text, flush=True, end="")
357
 
358
- def decoder_layer_init(self, config: DeepseekV2Config, layer_idx: int):
359
- nn.Module.__init__(self)
360
- self.hidden_size = config.hidden_size
361
-
362
- if config.use_mla:
363
- self.self_attn = DeepseekV2Attention(config=config, layer_idx=layer_idx)
364
- else:
365
- config.head_dim = config.hidden_size // config.num_attention_heads
366
- self.self_attn = LlamaAttention(config, layer_idx)
367
- self.mlp = DeepseekV2MoE(config) if layer_idx >= config.first_k_dense_replace else DeepseekV2MLP(config)
368
-
369
- self.input_layernorm = DeepseekV2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
370
- self.post_attention_layernorm = DeepseekV2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
371
-
372
-
373
- DeepseekV2DecoderLayer.__init__ = decoder_layer_init
374
 
375
  class DeepseekOCR2Config(DeepseekV2Config):
376
  model_type = "DeepseekOCR2"
@@ -389,7 +366,8 @@ class DeepseekOCR2Model(DeepseekV2Model):
389
  embed_std = 1 / torch.sqrt(torch.tensor(n_embed, dtype=torch.float32))
390
  # self.image_newline = nn.Parameter(torch.randn(n_embed) * embed_std)
391
  self.view_seperator = nn.Parameter(torch.randn(n_embed) * embed_std)
392
- self.rotary_emb = LlamaRotaryEmbedding(config=config)
 
393
 
394
 
395
  def forward(
@@ -408,15 +386,21 @@ class DeepseekOCR2Model(DeepseekV2Model):
408
  return_dict: Optional[bool] = None,
409
  ) -> Union[Tuple, BaseModelOutputWithPast]:
410
 
 
 
 
411
  if inputs_embeds is None:
412
  # inputs_embeds = self.embed_tokens(input_ids)
413
  inputs_embeds = self.get_input_embeddings()(input_ids)
414
- inputs_embeds = inputs_embeds.clone()
 
415
 
416
  sam_model = getattr(self, 'sam_model', None)
417
  # sam_model = self.sam_model
418
  qwen2_model = getattr(self, 'qwen2_model', None)
419
 
 
 
420
  if sam_model is not None and (input_ids.shape[1] != 1 or self.training) and torch.sum(images[0][1]).item() != 0:
421
 
422
  idx = 0
@@ -449,10 +433,10 @@ class DeepseekOCR2Model(DeepseekV2Model):
449
  global_features = global_features_2
450
  global_features = self.projector(global_features)
451
 
452
- # print('=====================')
453
- # print('BASE: ', global_features.shape)
454
- # print('PATCHES: ', local_features.shape)
455
- # print('=====================')
456
 
457
  _, hw, n_dim = global_features.shape
458
  # h = w = int(hw ** 0.5)
@@ -481,10 +465,10 @@ class DeepseekOCR2Model(DeepseekV2Model):
481
  global_features_2 = qwen2_model(global_features_1)
482
  global_features = global_features_2
483
  global_features = self.projector(global_features)
484
- # print('=====================')
485
- # print('BASE: ', global_features.shape)
486
- # print('NO PATCHES')
487
- # print('=====================')
488
  _, hw, n_dim = global_features.shape
489
  # h = w = int(hw ** 0.5)
490
 
@@ -508,16 +492,10 @@ class DeepseekOCR2Model(DeepseekV2Model):
508
  images_in_this_batch = torch.cat(images_in_this_batch, dim=0)
509
  # exit()
510
 
511
- # inputs_embeds[idx].masked_scatter_(images_seq_mask[idx].unsqueeze(-1).cuda(), images_in_this_batch)
512
- images_in_this_batch = images_in_this_batch.to(
513
- device=inputs_embeds.device, dtype=inputs_embeds.dtype
514
- )
515
- mask = images_seq_mask[idx].unsqueeze(-1).to(inputs_embeds.device) # bool [T, 1]
516
- updated_row = inputs_embeds[idx].masked_scatter(mask, images_in_this_batch)
517
- inputs_embeds[idx] = updated_row
518
 
519
  idx += 1
520
-
521
 
522
  return super(DeepseekOCR2Model, self).forward(
523
  input_ids=None, attention_mask=attention_mask, past_key_values=past_key_values,
@@ -634,8 +612,8 @@ class DeepseekOCR2ForCausalLM(DeepseekV2ForCausalLM):
634
  if past_key_values is not None:
635
  if isinstance(past_key_values, Cache):
636
  cache_length = past_key_values.get_seq_length()
637
- past_length = past_key_values.get_seq_length()
638
- max_cache_length = None
639
  else:
640
  cache_length = past_length = past_key_values[0][0].shape[2]
641
  max_cache_length = None
@@ -811,9 +789,9 @@ class DeepseekOCR2ForCausalLM(DeepseekV2ForCausalLM):
811
 
812
 
813
 
814
- images_list.append(image_transform(global_view).to(torch_dtype))
815
 
816
- # global_view_tensor = image_transform(global_view).to(torch_dtype)
817
 
818
  width_crop_num, height_crop_num = crop_ratio
819
 
@@ -824,7 +802,7 @@ class DeepseekOCR2ForCausalLM(DeepseekV2ForCausalLM):
824
  """process the local views"""
825
 
826
  for i in range(len(images_crop_raw)):
827
- images_crop_list.append(image_transform(images_crop_raw[i]).to(torch_dtype))
828
 
829
  if image_size == 768:
830
  valid_img_tokens += len(images_crop_list) * 144
@@ -858,7 +836,7 @@ class DeepseekOCR2ForCausalLM(DeepseekV2ForCausalLM):
858
  # else:
859
  global_view = ImageOps.pad(image, (image_size, image_size),
860
  color=tuple(int(x * 255) for x in image_transform.mean))
861
- images_list.append(image_transform(global_view).to(torch_dtype))
862
 
863
  if base_size == 1024:
864
  valid_img_tokens += int(256 * ratio)
@@ -925,7 +903,7 @@ class DeepseekOCR2ForCausalLM(DeepseekV2ForCausalLM):
925
 
926
  if not eval_mode:
927
  streamer = NoEOSTextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=False)
928
- with torch.autocast("cuda", dtype=torch_dtype):
929
  with torch.no_grad():
930
  output_ids = self.generate(
931
  input_ids.unsqueeze(0).cuda(),
@@ -943,7 +921,7 @@ class DeepseekOCR2ForCausalLM(DeepseekV2ForCausalLM):
943
  )
944
 
945
  else:
946
- with torch.autocast("cuda", dtype=torch_dtype):
947
  with torch.no_grad():
948
  output_ids = self.generate(
949
  input_ids.unsqueeze(0).cuda(),
 
1
+ from .modeling_deepseekv2 import DeepseekV2Model, DeepseekV2ForCausalLM
2
+ from .configuration_deepseek_v2 import DeepseekV2Config
3
+ from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
 
 
4
  from typing import List, Optional, Tuple, Union
5
+ from transformers.cache_utils import Cache
6
+ import requests
7
  from PIL import Image, ImageOps, ImageDraw, ImageFont
8
+ from io import BytesIO
 
9
  import torch
10
  import torch.nn as nn
11
  from torch.nn import CrossEntropyLoss
12
  from torchvision import transforms
13
+ # from torchvision.transforms.functional import InterpolationMode
14
+ import os
 
 
 
 
 
 
 
 
 
 
 
 
15
  from .deepencoderv2 import build_sam_vit_b, build_qwen2_decoder_as_encoder, MlpProjector
16
+ from addict import Dict
17
+ from transformers import TextStreamer
18
  from .conversation import get_conv_template
19
+ from abc import ABC
20
+ import math
21
+ import re
22
+ from tqdm import tqdm
23
+ import numpy as np
24
+ # import time
25
+
26
 
 
27
 
28
  def load_image(image_path):
29
 
 
348
  text = text.replace(eos_text, "\n")
349
  print(text, flush=True, end="")
350
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
351
 
352
  class DeepseekOCR2Config(DeepseekV2Config):
353
  model_type = "DeepseekOCR2"
 
366
  embed_std = 1 / torch.sqrt(torch.tensor(n_embed, dtype=torch.float32))
367
  # self.image_newline = nn.Parameter(torch.randn(n_embed) * embed_std)
368
  self.view_seperator = nn.Parameter(torch.randn(n_embed) * embed_std)
369
+
370
+
371
 
372
 
373
  def forward(
 
386
  return_dict: Optional[bool] = None,
387
  ) -> Union[Tuple, BaseModelOutputWithPast]:
388
 
389
+
390
+
391
+
392
  if inputs_embeds is None:
393
  # inputs_embeds = self.embed_tokens(input_ids)
394
  inputs_embeds = self.get_input_embeddings()(input_ids)
395
+
396
+
397
 
398
  sam_model = getattr(self, 'sam_model', None)
399
  # sam_model = self.sam_model
400
  qwen2_model = getattr(self, 'qwen2_model', None)
401
 
402
+
403
+
404
  if sam_model is not None and (input_ids.shape[1] != 1 or self.training) and torch.sum(images[0][1]).item() != 0:
405
 
406
  idx = 0
 
433
  global_features = global_features_2
434
  global_features = self.projector(global_features)
435
 
436
+ print('=====================')
437
+ print('BASE: ', global_features.shape)
438
+ print('PATCHES: ', local_features.shape)
439
+ print('=====================')
440
 
441
  _, hw, n_dim = global_features.shape
442
  # h = w = int(hw ** 0.5)
 
465
  global_features_2 = qwen2_model(global_features_1)
466
  global_features = global_features_2
467
  global_features = self.projector(global_features)
468
+ print('=====================')
469
+ print('BASE: ', global_features.shape)
470
+ print('NO PATCHES')
471
+ print('=====================')
472
  _, hw, n_dim = global_features.shape
473
  # h = w = int(hw ** 0.5)
474
 
 
492
  images_in_this_batch = torch.cat(images_in_this_batch, dim=0)
493
  # exit()
494
 
495
+ inputs_embeds[idx].masked_scatter_(images_seq_mask[idx].unsqueeze(-1).cuda(), images_in_this_batch)
 
 
 
 
 
 
496
 
497
  idx += 1
498
+
499
 
500
  return super(DeepseekOCR2Model, self).forward(
501
  input_ids=None, attention_mask=attention_mask, past_key_values=past_key_values,
 
612
  if past_key_values is not None:
613
  if isinstance(past_key_values, Cache):
614
  cache_length = past_key_values.get_seq_length()
615
+ past_length = past_key_values.seen_tokens
616
+ max_cache_length = past_key_values.get_max_length()
617
  else:
618
  cache_length = past_length = past_key_values[0][0].shape[2]
619
  max_cache_length = None
 
789
 
790
 
791
 
792
+ images_list.append(image_transform(global_view).to(torch.bfloat16))
793
 
794
+ # global_view_tensor = image_transform(global_view).to(torch.bfloat16)
795
 
796
  width_crop_num, height_crop_num = crop_ratio
797
 
 
802
  """process the local views"""
803
 
804
  for i in range(len(images_crop_raw)):
805
+ images_crop_list.append(image_transform(images_crop_raw[i]).to(torch.bfloat16))
806
 
807
  if image_size == 768:
808
  valid_img_tokens += len(images_crop_list) * 144
 
836
  # else:
837
  global_view = ImageOps.pad(image, (image_size, image_size),
838
  color=tuple(int(x * 255) for x in image_transform.mean))
839
+ images_list.append(image_transform(global_view).to(torch.bfloat16))
840
 
841
  if base_size == 1024:
842
  valid_img_tokens += int(256 * ratio)
 
903
 
904
  if not eval_mode:
905
  streamer = NoEOSTextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=False)
906
+ with torch.autocast("cuda", dtype=torch.bfloat16):
907
  with torch.no_grad():
908
  output_ids = self.generate(
909
  input_ids.unsqueeze(0).cuda(),
 
921
  )
922
 
923
  else:
924
+ with torch.autocast("cuda", dtype=torch.bfloat16):
925
  with torch.no_grad():
926
  output_ids = self.generate(
927
  input_ids.unsqueeze(0).cuda(),
modeling_deepseekv2.py CHANGED
@@ -34,14 +34,10 @@ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
34
  from transformers.activations import ACT2FN
35
  from transformers.cache_utils import Cache, DynamicCache
36
  from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask
37
- try:
38
- from transformers.models.llama.modeling_llama import LlamaAttention
39
- except:
40
- LlamaAttention = None
41
- try:
42
- from transformers.models.llama.modeling_llama import LlamaFlashAttention2
43
- except:
44
- LlamaFlashAttention2 = None
45
  from transformers.modeling_outputs import (
46
  BaseModelOutputWithPast,
47
  CausalLMOutputWithPast,
 
34
  from transformers.activations import ACT2FN
35
  from transformers.cache_utils import Cache, DynamicCache
36
  from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask
37
+ from transformers.models.llama.modeling_llama import (
38
+ LlamaAttention,
39
+ LlamaFlashAttention2
40
+ )
 
 
 
 
41
  from transformers.modeling_outputs import (
42
  BaseModelOutputWithPast,
43
  CausalLMOutputWithPast,
special_tokens_map.json CHANGED
@@ -1,7 +1,19 @@
1
  {
2
  "additional_special_tokens": [
3
- "<|User|>",
4
- "<|Assistant|>"
 
 
 
 
 
 
 
 
 
 
 
 
5
  ],
6
  "bos_token": {
7
  "content": "<|begin▁of▁sentence|>",
 
1
  {
2
  "additional_special_tokens": [
3
+ {
4
+ "content": "<|User|>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false
9
+ },
10
+ {
11
+ "content": "<|Assistant|>",
12
+ "lstrip": false,
13
+ "normalized": false,
14
+ "rstrip": false,
15
+ "single_word": false
16
+ }
17
  ],
18
  "bos_token": {
19
  "content": "<|begin▁of▁sentence|>",
tokenizer_config.json CHANGED
@@ -6658,4 +6658,4 @@
6658
  "tokenizer_class": "LlamaTokenizerFast",
6659
  "unk_token": null,
6660
  "use_default_system_prompt": false
6661
- }
 
6658
  "tokenizer_class": "LlamaTokenizerFast",
6659
  "unk_token": null,
6660
  "use_default_system_prompt": false
6661
+ }