Upload folder using huggingface_hub

Browse files

Files changed (4)

modeling_deepseekocr2.py +44 -66
modeling_deepseekv2.py +4 -8
special_tokens_map.json +14 -2
tokenizer_config.json +1 -1

modeling_deepseekocr2.py CHANGED Viewed

@@ -1,36 +1,29 @@
-import os
-import math
-import re
-from tqdm import tqdm
-from abc import ABC
 from typing import List, Optional, Tuple, Union
-from addict import Dict
 from PIL import Image, ImageOps, ImageDraw, ImageFont
-import numpy as np
 import torch
 import torch.nn as nn
 from torch.nn import CrossEntropyLoss
 from torchvision import transforms
-from transformers.cache_utils import Cache
-from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
-from transformers import DeepseekV2Model, DeepseekV2ForCausalLM
-from transformers import DeepseekV2Config
-from transformers.models.deepseek_v2.modeling_deepseek_v2 import (
-    DeepseekV2Attention,
-    DeepseekV2MLP,
-    DeepseekV2MoE,
-    DeepseekV2RMSNorm,
-    DeepseekV2DecoderLayer,
-)
-from transformers.models.llama.modeling_llama import LlamaAttention, LlamaRotaryEmbedding
-from transformers import TextStreamer
 from .deepencoderv2 import build_sam_vit_b, build_qwen2_decoder_as_encoder, MlpProjector
 from .conversation import get_conv_template
-torch_dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
 def load_image(image_path):
@@ -355,22 +348,6 @@ class NoEOSTextStreamer(TextStreamer):
         text = text.replace(eos_text, "\n")
         print(text, flush=True, end="")
-def decoder_layer_init(self, config: DeepseekV2Config, layer_idx: int):
-    nn.Module.__init__(self)
-    self.hidden_size = config.hidden_size
-    if config.use_mla:
-        self.self_attn = DeepseekV2Attention(config=config, layer_idx=layer_idx)
-    else:
-        config.head_dim = config.hidden_size // config.num_attention_heads
-        self.self_attn = LlamaAttention(config, layer_idx)
-    self.mlp = DeepseekV2MoE(config) if layer_idx >= config.first_k_dense_replace else DeepseekV2MLP(config)
-    self.input_layernorm = DeepseekV2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
-    self.post_attention_layernorm = DeepseekV2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
-DeepseekV2DecoderLayer.__init__ = decoder_layer_init
 class DeepseekOCR2Config(DeepseekV2Config):
     model_type = "DeepseekOCR2"
@@ -389,7 +366,8 @@ class DeepseekOCR2Model(DeepseekV2Model):
         embed_std = 1 / torch.sqrt(torch.tensor(n_embed, dtype=torch.float32))
         # self.image_newline = nn.Parameter(torch.randn(n_embed) * embed_std)
         self.view_seperator = nn.Parameter(torch.randn(n_embed) * embed_std)
-        self.rotary_emb = LlamaRotaryEmbedding(config=config)
     def forward(
@@ -408,15 +386,21 @@ class DeepseekOCR2Model(DeepseekV2Model):
         return_dict: Optional[bool] = None,
     ) -> Union[Tuple, BaseModelOutputWithPast]:
         if inputs_embeds is None:
             # inputs_embeds = self.embed_tokens(input_ids)
             inputs_embeds = self.get_input_embeddings()(input_ids)
-        inputs_embeds = inputs_embeds.clone()
         sam_model = getattr(self, 'sam_model', None)
         # sam_model = self.sam_model
         qwen2_model = getattr(self, 'qwen2_model', None)
         if sam_model is not None and (input_ids.shape[1] != 1 or self.training) and torch.sum(images[0][1]).item() != 0:
             idx = 0
@@ -449,10 +433,10 @@ class DeepseekOCR2Model(DeepseekV2Model):
                         global_features = global_features_2
                         global_features = self.projector(global_features)
-                        # print('=====================')
-                        # print('BASE: ', global_features.shape)
-                        # print('PATCHES: ', local_features.shape)
-                        # print('=====================')
                         _, hw, n_dim = global_features.shape
                         # h = w = int(hw ** 0.5)
@@ -481,10 +465,10 @@ class DeepseekOCR2Model(DeepseekV2Model):
                         global_features_2 = qwen2_model(global_features_1)
                         global_features = global_features_2
                         global_features = self.projector(global_features)
-                        # print('=====================')
-                        # print('BASE: ', global_features.shape)
-                        # print('NO PATCHES')
-                        # print('=====================')
                         _, hw, n_dim = global_features.shape
                         # h = w = int(hw ** 0.5)
@@ -508,16 +492,10 @@ class DeepseekOCR2Model(DeepseekV2Model):
                     images_in_this_batch = torch.cat(images_in_this_batch, dim=0)
                     # exit()
-                    # inputs_embeds[idx].masked_scatter_(images_seq_mask[idx].unsqueeze(-1).cuda(), images_in_this_batch)
-                    images_in_this_batch = images_in_this_batch.to(
-                        device=inputs_embeds.device, dtype=inputs_embeds.dtype
-                    )
-                    mask = images_seq_mask[idx].unsqueeze(-1).to(inputs_embeds.device)   # bool [T, 1]
-                    updated_row = inputs_embeds[idx].masked_scatter(mask, images_in_this_batch)
-                    inputs_embeds[idx] = updated_row
                 idx += 1
         return super(DeepseekOCR2Model, self).forward(
             input_ids=None, attention_mask=attention_mask, past_key_values=past_key_values,
@@ -634,8 +612,8 @@ class DeepseekOCR2ForCausalLM(DeepseekV2ForCausalLM):
         if past_key_values is not None:
             if isinstance(past_key_values, Cache):
                 cache_length = past_key_values.get_seq_length()
-                past_length = past_key_values.get_seq_length()
-                max_cache_length = None
             else:
                 cache_length = past_length = past_key_values[0][0].shape[2]
                 max_cache_length = None
@@ -811,9 +789,9 @@ class DeepseekOCR2ForCausalLM(DeepseekV2ForCausalLM):
-                images_list.append(image_transform(global_view).to(torch_dtype))
-                # global_view_tensor = image_transform(global_view).to(torch_dtype)
                 width_crop_num, height_crop_num = crop_ratio
@@ -824,7 +802,7 @@ class DeepseekOCR2ForCausalLM(DeepseekV2ForCausalLM):
                     """process the local views"""
                     for i in range(len(images_crop_raw)):
-                        images_crop_list.append(image_transform(images_crop_raw[i]).to(torch_dtype))
                 if image_size == 768:
                     valid_img_tokens += len(images_crop_list) * 144
@@ -858,7 +836,7 @@ class DeepseekOCR2ForCausalLM(DeepseekV2ForCausalLM):
                 # else:
                 global_view = ImageOps.pad(image, (image_size, image_size),
                                         color=tuple(int(x * 255) for x in image_transform.mean))
-                images_list.append(image_transform(global_view).to(torch_dtype))
                 if base_size == 1024:
                     valid_img_tokens += int(256 * ratio)
@@ -925,7 +903,7 @@ class DeepseekOCR2ForCausalLM(DeepseekV2ForCausalLM):
         if not eval_mode:
             streamer = NoEOSTextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=False)
-            with torch.autocast("cuda", dtype=torch_dtype):
                 with torch.no_grad():
                     output_ids = self.generate(
                         input_ids.unsqueeze(0).cuda(),
@@ -943,7 +921,7 @@ class DeepseekOCR2ForCausalLM(DeepseekV2ForCausalLM):
                         )
         else:
-            with torch.autocast("cuda", dtype=torch_dtype):
                 with torch.no_grad():
                     output_ids = self.generate(
                         input_ids.unsqueeze(0).cuda(),

+from .modeling_deepseekv2 import DeepseekV2Model, DeepseekV2ForCausalLM
+from .configuration_deepseek_v2 import DeepseekV2Config
+from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
 from typing import List, Optional, Tuple, Union
+from transformers.cache_utils import Cache
+import requests
 from PIL import Image, ImageOps, ImageDraw, ImageFont
+from io import BytesIO
 import torch
 import torch.nn as nn
 from torch.nn import CrossEntropyLoss
 from torchvision import transforms
+# from torchvision.transforms.functional import InterpolationMode
+import os
 from .deepencoderv2 import build_sam_vit_b, build_qwen2_decoder_as_encoder, MlpProjector
+from addict import Dict
+from transformers import TextStreamer
 from .conversation import get_conv_template
+from abc import ABC
+import math
+import re
+from tqdm import tqdm
+import numpy as np
+# import time
 def load_image(image_path):
         text = text.replace(eos_text, "\n")
         print(text, flush=True, end="")
 class DeepseekOCR2Config(DeepseekV2Config):
     model_type = "DeepseekOCR2"
         embed_std = 1 / torch.sqrt(torch.tensor(n_embed, dtype=torch.float32))
         # self.image_newline = nn.Parameter(torch.randn(n_embed) * embed_std)
         self.view_seperator = nn.Parameter(torch.randn(n_embed) * embed_std)
     def forward(
         return_dict: Optional[bool] = None,
     ) -> Union[Tuple, BaseModelOutputWithPast]:
         if inputs_embeds is None:
             # inputs_embeds = self.embed_tokens(input_ids)
             inputs_embeds = self.get_input_embeddings()(input_ids)
         sam_model = getattr(self, 'sam_model', None)
         # sam_model = self.sam_model
         qwen2_model = getattr(self, 'qwen2_model', None)
         if sam_model is not None and (input_ids.shape[1] != 1 or self.training) and torch.sum(images[0][1]).item() != 0:
             idx = 0
                         global_features = global_features_2
                         global_features = self.projector(global_features)
+                        print('=====================')
+                        print('BASE: ', global_features.shape)
+                        print('PATCHES: ', local_features.shape)
+                        print('=====================')
                         _, hw, n_dim = global_features.shape
                         # h = w = int(hw ** 0.5)
                         global_features_2 = qwen2_model(global_features_1)
                         global_features = global_features_2
                         global_features = self.projector(global_features)
+                        print('=====================')
+                        print('BASE: ', global_features.shape)
+                        print('NO PATCHES')
+                        print('=====================')
                         _, hw, n_dim = global_features.shape
                         # h = w = int(hw ** 0.5)
                     images_in_this_batch = torch.cat(images_in_this_batch, dim=0)
                     # exit()
+                    inputs_embeds[idx].masked_scatter_(images_seq_mask[idx].unsqueeze(-1).cuda(), images_in_this_batch)
                 idx += 1
         return super(DeepseekOCR2Model, self).forward(
             input_ids=None, attention_mask=attention_mask, past_key_values=past_key_values,
         if past_key_values is not None:
             if isinstance(past_key_values, Cache):
                 cache_length = past_key_values.get_seq_length()
+                past_length = past_key_values.seen_tokens
+                max_cache_length = past_key_values.get_max_length()
             else:
                 cache_length = past_length = past_key_values[0][0].shape[2]
                 max_cache_length = None
+                images_list.append(image_transform(global_view).to(torch.bfloat16))
+                # global_view_tensor = image_transform(global_view).to(torch.bfloat16)
                 width_crop_num, height_crop_num = crop_ratio
                     """process the local views"""
                     for i in range(len(images_crop_raw)):
+                        images_crop_list.append(image_transform(images_crop_raw[i]).to(torch.bfloat16))
                 if image_size == 768:
                     valid_img_tokens += len(images_crop_list) * 144
                 # else:
                 global_view = ImageOps.pad(image, (image_size, image_size),
                                         color=tuple(int(x * 255) for x in image_transform.mean))
+                images_list.append(image_transform(global_view).to(torch.bfloat16))
                 if base_size == 1024:
                     valid_img_tokens += int(256 * ratio)
         if not eval_mode:
             streamer = NoEOSTextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=False)
+            with torch.autocast("cuda", dtype=torch.bfloat16):
                 with torch.no_grad():
                     output_ids = self.generate(
                         input_ids.unsqueeze(0).cuda(),
                         )
         else:
+            with torch.autocast("cuda", dtype=torch.bfloat16):
                 with torch.no_grad():
                     output_ids = self.generate(
                         input_ids.unsqueeze(0).cuda(),

modeling_deepseekv2.py CHANGED Viewed

@@ -34,14 +34,10 @@ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
 from transformers.activations import ACT2FN
 from transformers.cache_utils import Cache, DynamicCache
 from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask
-try:
-    from transformers.models.llama.modeling_llama import LlamaAttention
-except:
-    LlamaAttention = None
-try:
-    from transformers.models.llama.modeling_llama import LlamaFlashAttention2
-except:
-    LlamaFlashAttention2 = None
 from transformers.modeling_outputs import (
     BaseModelOutputWithPast,
     CausalLMOutputWithPast,

 from transformers.activations import ACT2FN
 from transformers.cache_utils import Cache, DynamicCache
 from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask
+from transformers.models.llama.modeling_llama import (
+    LlamaAttention,
+    LlamaFlashAttention2
+)
 from transformers.modeling_outputs import (
     BaseModelOutputWithPast,
     CausalLMOutputWithPast,

special_tokens_map.json CHANGED Viewed

@@ -1,7 +1,19 @@
 {
   "additional_special_tokens": [
-    "<|User|>",
-    "<|Assistant|>"
   ],
   "bos_token": {
     "content": "<｜begin▁of▁sentence｜>",

 {
   "additional_special_tokens": [
+    {
+      "content": "<|User|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<|Assistant|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    }
   ],
   "bos_token": {
     "content": "<｜begin▁of▁sentence｜>",

tokenizer_config.json CHANGED Viewed

@@ -6658,4 +6658,4 @@
   "tokenizer_class": "LlamaTokenizerFast",
   "unk_token": null,
   "use_default_system_prompt": false
-}

   "tokenizer_class": "LlamaTokenizerFast",
   "unk_token": null,
   "use_default_system_prompt": false
+}