Upload folder using huggingface_hub
Browse files
- modeling_deepseekocr2.py +44 -66
- modeling_deepseekv2.py +4 -8
- special_tokens_map.json +14 -2
- tokenizer_config.json +1 -1
modeling_deepseekocr2.py
CHANGED
|
@@ -1,36 +1,29 @@
|
|
| 1 |
-
import
|
| 2 |
-
import
|
| 3 |
-
import
|
| 4 |
-
from tqdm import tqdm
|
| 5 |
-
from abc import ABC
|
| 6 |
from typing import List, Optional, Tuple, Union
|
| 7 |
-
|
| 8 |
-
|
| 9 |
from PIL import Image, ImageOps, ImageDraw, ImageFont
|
| 10 |
-
|
| 11 |
-
|
| 12 |
import torch
|
| 13 |
import torch.nn as nn
|
| 14 |
from torch.nn import CrossEntropyLoss
|
| 15 |
from torchvision import transforms
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
|
| 19 |
-
from transformers import DeepseekV2Model, DeepseekV2ForCausalLM
|
| 20 |
-
from transformers import DeepseekV2Config
|
| 21 |
-
from transformers.models.deepseek_v2.modeling_deepseek_v2 import (
|
| 22 |
-
DeepseekV2Attention,
|
| 23 |
-
DeepseekV2MLP,
|
| 24 |
-
DeepseekV2MoE,
|
| 25 |
-
DeepseekV2RMSNorm,
|
| 26 |
-
DeepseekV2DecoderLayer,
|
| 27 |
-
)
|
| 28 |
-
from transformers.models.llama.modeling_llama import LlamaAttention, LlamaRotaryEmbedding
|
| 29 |
-
from transformers import TextStreamer
|
| 30 |
from .deepencoderv2 import build_sam_vit_b, build_qwen2_decoder_as_encoder, MlpProjector
|
|
|
|
|
|
|
| 31 |
from .conversation import get_conv_template
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
|
| 33 |
-
torch_dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
|
| 34 |
|
| 35 |
def load_image(image_path):
|
| 36 |
|
|
@@ -355,22 +348,6 @@ class NoEOSTextStreamer(TextStreamer):
|
|
| 355 |
text = text.replace(eos_text, "\n")
|
| 356 |
print(text, flush=True, end="")
|
| 357 |
|
| 358 |
-
def decoder_layer_init(self, config: DeepseekV2Config, layer_idx: int):
|
| 359 |
-
nn.Module.__init__(self)
|
| 360 |
-
self.hidden_size = config.hidden_size
|
| 361 |
-
|
| 362 |
-
if config.use_mla:
|
| 363 |
-
self.self_attn = DeepseekV2Attention(config=config, layer_idx=layer_idx)
|
| 364 |
-
else:
|
| 365 |
-
config.head_dim = config.hidden_size // config.num_attention_heads
|
| 366 |
-
self.self_attn = LlamaAttention(config, layer_idx)
|
| 367 |
-
self.mlp = DeepseekV2MoE(config) if layer_idx >= config.first_k_dense_replace else DeepseekV2MLP(config)
|
| 368 |
-
|
| 369 |
-
self.input_layernorm = DeepseekV2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
|
| 370 |
-
self.post_attention_layernorm = DeepseekV2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
|
| 371 |
-
|
| 372 |
-
|
| 373 |
-
DeepseekV2DecoderLayer.__init__ = decoder_layer_init
|
| 374 |
|
| 375 |
class DeepseekOCR2Config(DeepseekV2Config):
|
| 376 |
model_type = "DeepseekOCR2"
|
|
@@ -389,7 +366,8 @@ class DeepseekOCR2Model(DeepseekV2Model):
|
|
| 389 |
embed_std = 1 / torch.sqrt(torch.tensor(n_embed, dtype=torch.float32))
|
| 390 |
# self.image_newline = nn.Parameter(torch.randn(n_embed) * embed_std)
|
| 391 |
self.view_seperator = nn.Parameter(torch.randn(n_embed) * embed_std)
|
| 392 |
-
|
|
|
|
| 393 |
|
| 394 |
|
| 395 |
def forward(
|
|
@@ -408,15 +386,21 @@ class DeepseekOCR2Model(DeepseekV2Model):
|
|
| 408 |
return_dict: Optional[bool] = None,
|
| 409 |
) -> Union[Tuple, BaseModelOutputWithPast]:
|
| 410 |
|
|
|
|
|
|
|
|
|
|
| 411 |
if inputs_embeds is None:
|
| 412 |
# inputs_embeds = self.embed_tokens(input_ids)
|
| 413 |
inputs_embeds = self.get_input_embeddings()(input_ids)
|
| 414 |
-
|
|
|
|
| 415 |
|
| 416 |
sam_model = getattr(self, 'sam_model', None)
|
| 417 |
# sam_model = self.sam_model
|
| 418 |
qwen2_model = getattr(self, 'qwen2_model', None)
|
| 419 |
|
|
|
|
|
|
|
| 420 |
if sam_model is not None and (input_ids.shape[1] != 1 or self.training) and torch.sum(images[0][1]).item() != 0:
|
| 421 |
|
| 422 |
idx = 0
|
|
@@ -449,10 +433,10 @@ class DeepseekOCR2Model(DeepseekV2Model):
|
|
| 449 |
global_features = global_features_2
|
| 450 |
global_features = self.projector(global_features)
|
| 451 |
|
| 452 |
-
|
| 453 |
-
|
| 454 |
-
|
| 455 |
-
|
| 456 |
|
| 457 |
_, hw, n_dim = global_features.shape
|
| 458 |
# h = w = int(hw ** 0.5)
|
|
@@ -481,10 +465,10 @@ class DeepseekOCR2Model(DeepseekV2Model):
|
|
| 481 |
global_features_2 = qwen2_model(global_features_1)
|
| 482 |
global_features = global_features_2
|
| 483 |
global_features = self.projector(global_features)
|
| 484 |
-
|
| 485 |
-
|
| 486 |
-
|
| 487 |
-
|
| 488 |
_, hw, n_dim = global_features.shape
|
| 489 |
# h = w = int(hw ** 0.5)
|
| 490 |
|
|
@@ -508,16 +492,10 @@ class DeepseekOCR2Model(DeepseekV2Model):
|
|
| 508 |
images_in_this_batch = torch.cat(images_in_this_batch, dim=0)
|
| 509 |
# exit()
|
| 510 |
|
| 511 |
-
|
| 512 |
-
images_in_this_batch = images_in_this_batch.to(
|
| 513 |
-
device=inputs_embeds.device, dtype=inputs_embeds.dtype
|
| 514 |
-
)
|
| 515 |
-
mask = images_seq_mask[idx].unsqueeze(-1).to(inputs_embeds.device) # bool [T, 1]
|
| 516 |
-
updated_row = inputs_embeds[idx].masked_scatter(mask, images_in_this_batch)
|
| 517 |
-
inputs_embeds[idx] = updated_row
|
| 518 |
|
| 519 |
idx += 1
|
| 520 |
-
|
| 521 |
|
| 522 |
return super(DeepseekOCR2Model, self).forward(
|
| 523 |
input_ids=None, attention_mask=attention_mask, past_key_values=past_key_values,
|
|
@@ -634,8 +612,8 @@ class DeepseekOCR2ForCausalLM(DeepseekV2ForCausalLM):
|
|
| 634 |
if past_key_values is not None:
|
| 635 |
if isinstance(past_key_values, Cache):
|
| 636 |
cache_length = past_key_values.get_seq_length()
|
| 637 |
-
past_length = past_key_values.
|
| 638 |
-
max_cache_length =
|
| 639 |
else:
|
| 640 |
cache_length = past_length = past_key_values[0][0].shape[2]
|
| 641 |
max_cache_length = None
|
|
@@ -811,9 +789,9 @@ class DeepseekOCR2ForCausalLM(DeepseekV2ForCausalLM):
|
|
| 811 |
|
| 812 |
|
| 813 |
|
| 814 |
-
images_list.append(image_transform(global_view).to(
|
| 815 |
|
| 816 |
-
# global_view_tensor = image_transform(global_view).to(
|
| 817 |
|
| 818 |
width_crop_num, height_crop_num = crop_ratio
|
| 819 |
|
|
@@ -824,7 +802,7 @@ class DeepseekOCR2ForCausalLM(DeepseekV2ForCausalLM):
|
|
| 824 |
"""process the local views"""
|
| 825 |
|
| 826 |
for i in range(len(images_crop_raw)):
|
| 827 |
-
images_crop_list.append(image_transform(images_crop_raw[i]).to(
|
| 828 |
|
| 829 |
if image_size == 768:
|
| 830 |
valid_img_tokens += len(images_crop_list) * 144
|
|
@@ -858,7 +836,7 @@ class DeepseekOCR2ForCausalLM(DeepseekV2ForCausalLM):
|
|
| 858 |
# else:
|
| 859 |
global_view = ImageOps.pad(image, (image_size, image_size),
|
| 860 |
color=tuple(int(x * 255) for x in image_transform.mean))
|
| 861 |
-
images_list.append(image_transform(global_view).to(
|
| 862 |
|
| 863 |
if base_size == 1024:
|
| 864 |
valid_img_tokens += int(256 * ratio)
|
|
@@ -925,7 +903,7 @@ class DeepseekOCR2ForCausalLM(DeepseekV2ForCausalLM):
|
|
| 925 |
|
| 926 |
if not eval_mode:
|
| 927 |
streamer = NoEOSTextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=False)
|
| 928 |
-
with torch.autocast("cuda", dtype=
|
| 929 |
with torch.no_grad():
|
| 930 |
output_ids = self.generate(
|
| 931 |
input_ids.unsqueeze(0).cuda(),
|
|
@@ -943,7 +921,7 @@ class DeepseekOCR2ForCausalLM(DeepseekV2ForCausalLM):
|
|
| 943 |
)
|
| 944 |
|
| 945 |
else:
|
| 946 |
-
with torch.autocast("cuda", dtype=
|
| 947 |
with torch.no_grad():
|
| 948 |
output_ids = self.generate(
|
| 949 |
input_ids.unsqueeze(0).cuda(),
|
|
|
|
| 1 |
+
from .modeling_deepseekv2 import DeepseekV2Model, DeepseekV2ForCausalLM
|
| 2 |
+
from .configuration_deepseek_v2 import DeepseekV2Config
|
| 3 |
+
from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
|
|
|
|
|
|
|
| 4 |
from typing import List, Optional, Tuple, Union
|
| 5 |
+
from transformers.cache_utils import Cache
|
| 6 |
+
import requests
|
| 7 |
from PIL import Image, ImageOps, ImageDraw, ImageFont
|
| 8 |
+
from io import BytesIO
|
|
|
|
| 9 |
import torch
|
| 10 |
import torch.nn as nn
|
| 11 |
from torch.nn import CrossEntropyLoss
|
| 12 |
from torchvision import transforms
|
| 13 |
+
# from torchvision.transforms.functional import InterpolationMode
|
| 14 |
+
import os
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
from .deepencoderv2 import build_sam_vit_b, build_qwen2_decoder_as_encoder, MlpProjector
|
| 16 |
+
from addict import Dict
|
| 17 |
+
from transformers import TextStreamer
|
| 18 |
from .conversation import get_conv_template
|
| 19 |
+
from abc import ABC
|
| 20 |
+
import math
|
| 21 |
+
import re
|
| 22 |
+
from tqdm import tqdm
|
| 23 |
+
import numpy as np
|
| 24 |
+
# import time
|
| 25 |
+
|
| 26 |
|
|
|
|
| 27 |
|
| 28 |
def load_image(image_path):
|
| 29 |
|
|
|
|
| 348 |
text = text.replace(eos_text, "\n")
|
| 349 |
print(text, flush=True, end="")
|
| 350 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 351 |
|
| 352 |
class DeepseekOCR2Config(DeepseekV2Config):
|
| 353 |
model_type = "DeepseekOCR2"
|
|
|
|
| 366 |
embed_std = 1 / torch.sqrt(torch.tensor(n_embed, dtype=torch.float32))
|
| 367 |
# self.image_newline = nn.Parameter(torch.randn(n_embed) * embed_std)
|
| 368 |
self.view_seperator = nn.Parameter(torch.randn(n_embed) * embed_std)
|
| 369 |
+
|
| 370 |
+
|
| 371 |
|
| 372 |
|
| 373 |
def forward(
|
|
|
|
| 386 |
return_dict: Optional[bool] = None,
|
| 387 |
) -> Union[Tuple, BaseModelOutputWithPast]:
|
| 388 |
|
| 389 |
+
|
| 390 |
+
|
| 391 |
+
|
| 392 |
if inputs_embeds is None:
|
| 393 |
# inputs_embeds = self.embed_tokens(input_ids)
|
| 394 |
inputs_embeds = self.get_input_embeddings()(input_ids)
|
| 395 |
+
|
| 396 |
+
|
| 397 |
|
| 398 |
sam_model = getattr(self, 'sam_model', None)
|
| 399 |
# sam_model = self.sam_model
|
| 400 |
qwen2_model = getattr(self, 'qwen2_model', None)
|
| 401 |
|
| 402 |
+
|
| 403 |
+
|
| 404 |
if sam_model is not None and (input_ids.shape[1] != 1 or self.training) and torch.sum(images[0][1]).item() != 0:
|
| 405 |
|
| 406 |
idx = 0
|
|
|
|
| 433 |
global_features = global_features_2
|
| 434 |
global_features = self.projector(global_features)
|
| 435 |
|
| 436 |
+
print('=====================')
|
| 437 |
+
print('BASE: ', global_features.shape)
|
| 438 |
+
print('PATCHES: ', local_features.shape)
|
| 439 |
+
print('=====================')
|
| 440 |
|
| 441 |
_, hw, n_dim = global_features.shape
|
| 442 |
# h = w = int(hw ** 0.5)
|
|
|
|
| 465 |
global_features_2 = qwen2_model(global_features_1)
|
| 466 |
global_features = global_features_2
|
| 467 |
global_features = self.projector(global_features)
|
| 468 |
+
print('=====================')
|
| 469 |
+
print('BASE: ', global_features.shape)
|
| 470 |
+
print('NO PATCHES')
|
| 471 |
+
print('=====================')
|
| 472 |
_, hw, n_dim = global_features.shape
|
| 473 |
# h = w = int(hw ** 0.5)
|
| 474 |
|
|
|
|
| 492 |
images_in_this_batch = torch.cat(images_in_this_batch, dim=0)
|
| 493 |
# exit()
|
| 494 |
|
| 495 |
+
inputs_embeds[idx].masked_scatter_(images_seq_mask[idx].unsqueeze(-1).cuda(), images_in_this_batch)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 496 |
|
| 497 |
idx += 1
|
| 498 |
+
|
| 499 |
|
| 500 |
return super(DeepseekOCR2Model, self).forward(
|
| 501 |
input_ids=None, attention_mask=attention_mask, past_key_values=past_key_values,
|
|
|
|
| 612 |
if past_key_values is not None:
|
| 613 |
if isinstance(past_key_values, Cache):
|
| 614 |
cache_length = past_key_values.get_seq_length()
|
| 615 |
+
past_length = past_key_values.seen_tokens
|
| 616 |
+
max_cache_length = past_key_values.get_max_length()
|
| 617 |
else:
|
| 618 |
cache_length = past_length = past_key_values[0][0].shape[2]
|
| 619 |
max_cache_length = None
|
|
|
|
| 789 |
|
| 790 |
|
| 791 |
|
| 792 |
+
images_list.append(image_transform(global_view).to(torch.bfloat16))
|
| 793 |
|
| 794 |
+
# global_view_tensor = image_transform(global_view).to(torch.bfloat16)
|
| 795 |
|
| 796 |
width_crop_num, height_crop_num = crop_ratio
|
| 797 |
|
|
|
|
| 802 |
"""process the local views"""
|
| 803 |
|
| 804 |
for i in range(len(images_crop_raw)):
|
| 805 |
+
images_crop_list.append(image_transform(images_crop_raw[i]).to(torch.bfloat16))
|
| 806 |
|
| 807 |
if image_size == 768:
|
| 808 |
valid_img_tokens += len(images_crop_list) * 144
|
|
|
|
| 836 |
# else:
|
| 837 |
global_view = ImageOps.pad(image, (image_size, image_size),
|
| 838 |
color=tuple(int(x * 255) for x in image_transform.mean))
|
| 839 |
+
images_list.append(image_transform(global_view).to(torch.bfloat16))
|
| 840 |
|
| 841 |
if base_size == 1024:
|
| 842 |
valid_img_tokens += int(256 * ratio)
|
|
|
|
| 903 |
|
| 904 |
if not eval_mode:
|
| 905 |
streamer = NoEOSTextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=False)
|
| 906 |
+
with torch.autocast("cuda", dtype=torch.bfloat16):
|
| 907 |
with torch.no_grad():
|
| 908 |
output_ids = self.generate(
|
| 909 |
input_ids.unsqueeze(0).cuda(),
|
|
|
|
| 921 |
)
|
| 922 |
|
| 923 |
else:
|
| 924 |
+
with torch.autocast("cuda", dtype=torch.bfloat16):
|
| 925 |
with torch.no_grad():
|
| 926 |
output_ids = self.generate(
|
| 927 |
input_ids.unsqueeze(0).cuda(),
|
modeling_deepseekv2.py
CHANGED
|
@@ -34,14 +34,10 @@ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
|
|
| 34 |
from transformers.activations import ACT2FN
|
| 35 |
from transformers.cache_utils import Cache, DynamicCache
|
| 36 |
from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
try:
|
| 42 |
-
from transformers.models.llama.modeling_llama import LlamaFlashAttention2
|
| 43 |
-
except:
|
| 44 |
-
LlamaFlashAttention2 = None
|
| 45 |
from transformers.modeling_outputs import (
|
| 46 |
BaseModelOutputWithPast,
|
| 47 |
CausalLMOutputWithPast,
|
|
|
|
| 34 |
from transformers.activations import ACT2FN
|
| 35 |
from transformers.cache_utils import Cache, DynamicCache
|
| 36 |
from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask
|
| 37 |
+
from transformers.models.llama.modeling_llama import (
|
| 38 |
+
LlamaAttention,
|
| 39 |
+
LlamaFlashAttention2
|
| 40 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 41 |
from transformers.modeling_outputs import (
|
| 42 |
BaseModelOutputWithPast,
|
| 43 |
CausalLMOutputWithPast,
|
special_tokens_map.json
CHANGED
|
@@ -1,7 +1,19 @@
|
|
| 1 |
{
|
| 2 |
"additional_special_tokens": [
|
| 3 |
-
|
| 4 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
],
|
| 6 |
"bos_token": {
|
| 7 |
"content": "<|begin▁of▁sentence|>",
|
|
|
|
| 1 |
{
|
| 2 |
"additional_special_tokens": [
|
| 3 |
+
{
|
| 4 |
+
"content": "<|User|>",
|
| 5 |
+
"lstrip": false,
|
| 6 |
+
"normalized": false,
|
| 7 |
+
"rstrip": false,
|
| 8 |
+
"single_word": false
|
| 9 |
+
},
|
| 10 |
+
{
|
| 11 |
+
"content": "<|Assistant|>",
|
| 12 |
+
"lstrip": false,
|
| 13 |
+
"normalized": false,
|
| 14 |
+
"rstrip": false,
|
| 15 |
+
"single_word": false
|
| 16 |
+
}
|
| 17 |
],
|
| 18 |
"bos_token": {
|
| 19 |
"content": "<|begin▁of▁sentence|>",
|
tokenizer_config.json
CHANGED
|
@@ -6658,4 +6658,4 @@
|
|
| 6658 |
"tokenizer_class": "LlamaTokenizerFast",
|
| 6659 |
"unk_token": null,
|
| 6660 |
"use_default_system_prompt": false
|
| 6661 |
-
}
|
|
|
|
| 6658 |
"tokenizer_class": "LlamaTokenizerFast",
|
| 6659 |
"unk_token": null,
|
| 6660 |
"use_default_system_prompt": false
|
| 6661 |
+
}
|